diff --git a/fastdeploy/entrypoints/api_server.py b/fastdeploy/entrypoints/api_server.py index c1d92a7d04..ce8324deb7 100644 --- a/fastdeploy/entrypoints/api_server.py +++ b/fastdeploy/entrypoints/api_server.py @@ -136,7 +136,7 @@ def main(): parser = FlexibleArgumentParser() parser.add_argument("--port", default=9904, type=int, help="port to the http server") parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the http server") - parser.add_argument("--workers", default=1, type=int, help="number of workers") + parser.add_argument("--workers", default=4, type=int, help="number of workers") parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() launch_api_server(args) diff --git a/fastdeploy/entrypoints/openai/utils.py b/fastdeploy/entrypoints/openai/utils.py index baa428b500..57976b0f5b 100644 --- a/fastdeploy/entrypoints/openai/utils.py +++ b/fastdeploy/entrypoints/openai/utils.py @@ -341,9 +341,10 @@ class DealerConnectionManager: def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: + _is_multi_server = os.environ.get("FD_ENABLE_MULTI_API_SERVER") == "1" parser.add_argument("--port", default=8000, type=int, help="port to the http server") parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the http server") - parser.add_argument("--workers", default=1, type=int, help="number of workers") + parser.add_argument("--workers", default=1 if _is_multi_server else 4, type=int, help="number of workers") parser.add_argument("--metrics-port", default=None, type=int, help="port for metrics server") parser.add_argument("--controller-port", default=-1, type=int, help="port for controller server") parser.add_argument( @@ -352,7 +353,9 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, help="max waiting time for connection, if set value -1 means no waiting time limit", ) - parser.add_argument("--max-concurrency", default=512, type=int, help="max concurrency") + parser.add_argument( + "--max-concurrency", default=512 if _is_multi_server else 2048, type=int, help="max concurrency" + ) parser.add_argument( "--enable-mm-output", action="store_true", help="Enable 'multimodal_content' field in response output. "