From 9de6ae375c35e09b511aeb44acdb6a04c0bcfbd1 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Mon, 5 Jan 2026 09:45:19 +0800 Subject: [PATCH] [Cherry-Pick][APIServer][Feature] Add configurable worker health check timeout via FD_WORKER_ALIVE_TIMEOUT(#5865) (#5867) * Initial plan * Cherry-pick PR #5865: Add configurable worker health check timeout via FD_WORKER_ALIVE_TIMEOUT Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> --- docs/usage/environment_variables.md | 3 +++ docs/zh/usage/environment_variables.md | 6 +++++- fastdeploy/entrypoints/openai/serving_chat.py | 5 +++-- fastdeploy/entrypoints/openai/serving_completion.py | 5 +++-- fastdeploy/envs.py | 2 ++ 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/docs/usage/environment_variables.md b/docs/usage/environment_variables.md index c4c319f83a..b0c63e8c64 100644 --- a/docs/usage/environment_variables.md +++ b/docs/usage/environment_variables.md @@ -88,5 +88,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # Count for cache_transfer_manager process error "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")), + + # Worker process health check timeout when waiting for responses in seconds (default: 30) + "FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")), } ``` diff --git a/docs/zh/usage/environment_variables.md b/docs/zh/usage/environment_variables.md index b0a162a8aa..119f9fb38b 100644 --- a/docs/zh/usage/environment_variables.md +++ b/docs/zh/usage/environment_variables.md @@ -87,5 +87,9 @@ environment_variables: dict[str, Callable[[], Any]] = { "FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")), # cache_transfer_manager 进程残留时连续错误阈值 - "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),} + "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")), + + # Worker 进程响应等待时的健康检查超时时间(秒),默认 30 秒 + "FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")), +} ``` diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index b9daa74fb9..6c1d63a007 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -24,6 +24,7 @@ from typing import List, Optional import numpy as np +import fastdeploy.envs as envs from fastdeploy.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponse, @@ -264,7 +265,7 @@ class OpenAIServingChat: except asyncio.TimeoutError: current_waiting_time += 10 if current_waiting_time == 300: - status, msg = self.engine_client.check_health() + status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT) if not status: if choices: chunk.choices = choices @@ -557,7 +558,7 @@ class OpenAIServingChat: except asyncio.TimeoutError: current_waiting_time += 10 if current_waiting_time == 300: - status, msg = self.engine_client.check_health() + status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT) if not status: raise ValueError(f"Engine is not healthy: {msg}") else: diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index fb3acb41ad..b7b1220a77 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -25,6 +25,7 @@ from typing import List, Optional import numpy as np +import fastdeploy.envs as envs from fastdeploy.engine.request import RequestOutput from fastdeploy.entrypoints.openai.protocol import ( CompletionLogprobs, @@ -280,7 +281,7 @@ class OpenAIServingCompletion: except asyncio.TimeoutError: current_waiting_time += 10 if current_waiting_time == 300: - status, msg = self.engine_client.check_health() + status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT) if not status: raise ValueError(f"Engine is not healthy: {msg}") else: @@ -436,7 +437,7 @@ class OpenAIServingCompletion: except asyncio.TimeoutError: current_waiting_time += 10 if current_waiting_time == 300: - status, msg = self.engine_client.check_health() + status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT) if not status: raise ValueError(f"Engine is not healthy: {msg}") else: diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 93f135d09d..15282fe9c0 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -151,6 +151,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # "Number of tokens in the group for Mixture of Experts (MoE) computation processing on HPU" "FD_HPU_CHUNK_SIZE": lambda: int(os.getenv("FD_HPU_CHUNK_SIZE", "64")), "FD_PREFILL_WAIT_DECODE_RESOURCE_SECONDS": lambda: int(os.getenv("FD_PREFILL_WAIT_DECODE_RESOURCE_SECONDS", "30")), + # Timeout for worker process health check in seconds + "FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")), }