[Optimization] Update ZMQ server (#6735)

* add batch zmq send response * update * Revert "update" This reverts commit 0234a25b47. * update * remove lock * fix unit test * add unit test * add unit test * pre commit * add unit test * fix unit test * add unit test * fix worker>1 * update zmq_worker_pid * fix unit test * fix unit test * fix unit test * add unit test * fix unit test * fix first token time * fix logprobs * add unit test * op * remove debug log --------- Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2026-04-23 00:17:25 +08:00 · 2026-03-19 21:53:16 +08:00
parent 9148562ed0
commit c3d8db85c4
18 changed files with 2739 additions and 133 deletions
@@ -25,6 +25,7 @@ from typing import Any, ClassVar, Generic, Optional, TypeVar, Union
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override

+import fastdeploy.envs as envs
 from fastdeploy.engine.request import RequestOutput
 from fastdeploy.entrypoints.openai.protocol import (
    ErrorInfo,
@@ -276,10 +277,9 @@ class ZmqOpenAIServing(OpenAIServing):
            dealer, request_output_queue = await self.engine_client.connection_manager.get_connection(
                request_id, num_choices
            )
-            for pr in ctx.preprocess_requests:
-                dealer.write([b"", pr["request_id"].encode("utf-8")])
-            # if self.engine_client.check_model_weight_status():
-            #     raise ValueError("Engine is clearing model weight")
+            if not envs.ZMQ_SEND_BATCH_DATA:
+                for pr in ctx.preprocess_requests:
+                    dealer.write([b"", pr["request_id"].encode("utf-8")])
            while num_choices > 0:
                request_output_dicts = await asyncio.wait_for(request_output_queue.get(), timeout=60)
                for request_output_dict in request_output_dicts: