From b79b094dcc346172e65d34acb00cf464bc85ceac Mon Sep 17 00:00:00 2001 From: K11OntheBoat Date: Mon, 20 Apr 2026 15:55:06 +0800 Subject: [PATCH] Change default workers and max-concurrency when launch api-server (#7457) Co-authored-by: zhangxiao35 --- fastdeploy/entrypoints/api_server.py | 2 +- fastdeploy/entrypoints/openai/utils.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fastdeploy/entrypoints/api_server.py b/fastdeploy/entrypoints/api_server.py index c1d92a7d04..ce8324deb7 100644 --- a/fastdeploy/entrypoints/api_server.py +++ b/fastdeploy/entrypoints/api_server.py @@ -136,7 +136,7 @@ def main(): parser = FlexibleArgumentParser() parser.add_argument("--port", default=9904, type=int, help="port to the http server") parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the http server") - parser.add_argument("--workers", default=1, type=int, help="number of workers") + parser.add_argument("--workers", default=4, type=int, help="number of workers") parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() launch_api_server(args) diff --git a/fastdeploy/entrypoints/openai/utils.py b/fastdeploy/entrypoints/openai/utils.py index baa428b500..57976b0f5b 100644 --- a/fastdeploy/entrypoints/openai/utils.py +++ b/fastdeploy/entrypoints/openai/utils.py @@ -341,9 +341,10 @@ class DealerConnectionManager: def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + _is_multi_server = os.environ.get("FD_ENABLE_MULTI_API_SERVER") == "1" parser.add_argument("--port", default=8000, type=int, help="port to the http server") parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the http server") - parser.add_argument("--workers", default=1, type=int, help="number of workers") + parser.add_argument("--workers", default=1 if _is_multi_server else 4, type=int, help="number of workers") parser.add_argument("--metrics-port", default=None, type=int, help="port for metrics server") parser.add_argument("--controller-port", default=-1, type=int, help="port for controller server") parser.add_argument( @@ -352,7 +353,9 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, help="max waiting time for connection, if set value -1 means no waiting time limit", ) - parser.add_argument("--max-concurrency", default=512, type=int, help="max concurrency") + parser.add_argument( + "--max-concurrency", default=512 if _is_multi_server else 2048, type=int, help="max concurrency" + ) parser.add_argument( "--enable-mm-output", action="store_true", help="Enable 'multimodal_content' field in response output. "