[Optimization] Update ZMQ server (#6735)

* add batch zmq send response

* update

* Revert "update"

This reverts commit 0234a25b47.

* update

* remove lock

* fix unit test

* add unit test

* add unit test

* pre commit

* add unit test

* fix unit test

* add unit test

* fix worker>1

* update zmq_worker_pid

* fix unit test

* fix unit test

* fix unit test

* add unit test

* fix unit test

* fix first token time

* fix logprobs

* add unit test

* op

* remove debug log

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
This commit is contained in:
luukunn
2026-03-19 21:53:16 +08:00
committed by GitHub
parent 9148562ed0
commit c3d8db85c4
18 changed files with 2739 additions and 133 deletions
@@ -25,6 +25,7 @@ from typing import Any, ClassVar, Generic, Optional, TypeVar, Union
from pydantic import BaseModel, ConfigDict, Field
from typing_extensions import override
import fastdeploy.envs as envs
from fastdeploy.engine.request import RequestOutput
from fastdeploy.entrypoints.openai.protocol import (
ErrorInfo,
@@ -276,10 +277,9 @@ class ZmqOpenAIServing(OpenAIServing):
dealer, request_output_queue = await self.engine_client.connection_manager.get_connection(
request_id, num_choices
)
for pr in ctx.preprocess_requests:
dealer.write([b"", pr["request_id"].encode("utf-8")])
# if self.engine_client.check_model_weight_status():
# raise ValueError("Engine is clearing model weight")
if not envs.ZMQ_SEND_BATCH_DATA:
for pr in ctx.preprocess_requests:
dealer.write([b"", pr["request_id"].encode("utf-8")])
while num_choices > 0:
request_output_dicts = await asyncio.wait_for(request_output_queue.get(), timeout=60)
for request_output_dict in request_output_dicts: