diff --git a/fastdeploy/cache_manager/transfer_factory/get_rdma_nics.sh b/fastdeploy/cache_manager/transfer_factory/get_rdma_nics.sh index 4fc07a98c9..7fce4f8e5a 100644 --- a/fastdeploy/cache_manager/transfer_factory/get_rdma_nics.sh +++ b/fastdeploy/cache_manager/transfer_factory/get_rdma_nics.sh @@ -4,9 +4,9 @@ NICNAME_TYPE=xgbe # 默认检测类型 type=$1 if [ "$ENABLE_EP_DP" == "1" ]; then - gpu_root_port_filename="${Cur_Dir}/gpu_rootport_${DP_RANK}.txt" + gpu_root_port_filename="${Cur_Dir}/gpu_rootport_${DP_RANK}_$$.txt" else - gpu_root_port_filename="${Cur_Dir}/gpu_rootport.txt" + gpu_root_port_filename="${Cur_Dir}/gpu_rootport_$$.txt" fi function __NEW_GPU_ROOTPORT_FILE__() { diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 0d73354499..53113d4bc3 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -606,7 +606,12 @@ class EngineArgs: console_logger.info(f"Using `{name}`: {ports}") if not self.skip_port_check: - for port in ports: + cur_dp_ports = ports[ + num_cur_dp_ports + * self.local_data_parallel_id : num_cur_dp_ports + * (self.local_data_parallel_id + 1) + ] + for port in cur_dp_ports: assert is_port_available("0.0.0.0", port), f"Parameter `{name}`:{port} is already in use." console_logger.debug(f"post init {name}: {ports}") diff --git a/fastdeploy/entrypoints/openai/multi_api_server.py b/fastdeploy/entrypoints/openai/multi_api_server.py index 5e64228382..08aa3efbf8 100644 --- a/fastdeploy/entrypoints/openai/multi_api_server.py +++ b/fastdeploy/entrypoints/openai/multi_api_server.py @@ -136,7 +136,7 @@ def check_param(ports, num_servers): for port in ports: logger.info(f"check port {port}") if not is_port_available("0.0.0.0", int(port)): - return False + raise RuntimeError(f"Port {port} is not available.") return True @@ -149,13 +149,17 @@ def main(): parser.add_argument("--args", nargs=argparse.REMAINDER, help="remaining arguments are passed to api_server.py") args = parser.parse_args() - logger.info(f"Starting {args.num_servers} servers on ports: {args.ports} with args: {args.args}") + logger.info(f"Launching MultiAPIServer with command: {' '.join(sys.argv)}") device_count = 0 if current_platform.is_cuda(): - device_count = len(os.getenv("CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7").split(",")) + if os.getenv("CUDA_VISIBLE_DEVICES") is None: + raise ValueError("Please manually set CUDA_VISIBLE_DEVICES when launching multi-api-server.") + device_count = len(os.getenv("CUDA_VISIBLE_DEVICES").split(",")) elif current_platform.is_xpu(): - device_count = len(os.getenv("XPU_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7").split(",")) + if os.getenv("XPU_VISIBLE_DEVICES") is None: + raise ValueError("Please manually set XPU_VISIBLE_DEVICES when launching multi-api-server.") + device_count = len(os.getenv("XPU_VISIBLE_DEVICES").split(",")) processes = start_servers( server_count=args.num_servers, diff --git a/scripts/get_rdma_nics.sh b/scripts/get_rdma_nics.sh index 4fc07a98c9..7fce4f8e5a 100644 --- a/scripts/get_rdma_nics.sh +++ b/scripts/get_rdma_nics.sh @@ -4,9 +4,9 @@ NICNAME_TYPE=xgbe # 默认检测类型 type=$1 if [ "$ENABLE_EP_DP" == "1" ]; then - gpu_root_port_filename="${Cur_Dir}/gpu_rootport_${DP_RANK}.txt" + gpu_root_port_filename="${Cur_Dir}/gpu_rootport_${DP_RANK}_$$.txt" else - gpu_root_port_filename="${Cur_Dir}/gpu_rootport.txt" + gpu_root_port_filename="${Cur_Dir}/gpu_rootport_$$.txt" fi function __NEW_GPU_ROOTPORT_FILE__() { diff --git a/tests/entrypoints/openai/test_multi_api_server.py b/tests/entrypoints/openai/test_multi_api_server.py index a17f306acd..bf533b9576 100644 --- a/tests/entrypoints/openai/test_multi_api_server.py +++ b/tests/entrypoints/openai/test_multi_api_server.py @@ -122,7 +122,8 @@ class TestMultiApiServer(unittest.TestCase): # Mock port availability check - first port available, second not mock_is_port_available.side_effect = [True, False] - self.assertFalse(check_param(self.test_ports.split(","), self.test_server_count)) + with self.assertRaises(RuntimeError): + check_param(self.test_ports.split(","), self.test_server_count) @patch("fastdeploy.entrypoints.openai.multi_api_server.is_port_available") @patch("fastdeploy.entrypoints.openai.multi_api_server.start_servers")