[bugfix] kill cache_transfer_manager process (#4401)
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FD Image Build (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled

This commit is contained in:
xiaolei373
2025-10-16 20:45:24 +08:00
committed by GitHub
parent 0355235fb9
commit dbca63f862
3 changed files with 219 additions and 0 deletions
@@ -183,6 +183,18 @@ class CacheTransferManager:
suffix=args.engine_pid,
create=False,
)
max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
array_size = min(max_chips_per_node, args.mp_num)
worker_healthy_live_array = np.zeros(shape=[array_size], dtype=np.int32)
self.worker_healthy_live_signal = IPCSignal(
name="worker_healthy_live_signal",
array=worker_healthy_live_array,
dtype=np.int32,
suffix=args.engine_worker_queue_port,
create=False,
)
# TODO XPU support RL
if not current_platform.is_xpu():
threading.Thread(target=self.clear_or_update_caches, args=[args], daemon=True).start()
@@ -319,10 +331,26 @@ class CacheTransferManager:
logger.debug(f"_do_swap_to_gpu_task: put_transfer_done_signal {result}")
logger.info(f"_do_swap_to_gpu_task: put_transfer_done_signal for transfer_task_id {transfer_task_id}")
def check_work_status(self, time_interval_threashold=envs.FD_CACHE_PROC_EXIT_TIMEOUT):
"""
Check the health of the model server by checking whether all workers are alive.
"""
if self.worker_healthy_live_signal.value[0]:
elapsed_time = time.time() - self.worker_healthy_live_signal.value[0]
if elapsed_time > time_interval_threashold:
return False, "Worker Service Not Healthy"
return True, ""
def do_data_transfer(self):
"""
do data transfer task
"""
consecutive_error_count = 0
max_errors = envs.FD_CACHE_PROC_ERROR_COUNT # 连续错误超过此次数后检测work进程是否还存在
while True:
try:
if self.rank == 0:
@@ -373,6 +401,28 @@ class CacheTransferManager:
self.cache_task_queue.barrier3.wait()
if self.rank == 0:
self.cache_task_queue.barrier3.reset()
consecutive_error_count = 0
except (BrokenPipeError, EOFError, ConnectionResetError) as e:
# cache_transfer_manager进程残留时会持续打印异常日志导致磁盘耗尽,此处增加检测work进程是否存活,
# 如果worker进程已经结束,此残留进程会终止循环退出,避免持续打印异常日志
logger.error(f"[CacheTransferManager] Connection broken: {e}")
consecutive_error_count += 1
if consecutive_error_count > max_errors:
try:
status, msg = self.check_work_status()
except Exception:
status = True
if status is False:
logger.critical(
f"The Worker process has been inactive for over {envs.FD_CACHE_PROC_EXIT_TIMEOUT} seconds, and the Cache process will automatically terminate (the waiting timeout can be extended via FD_CACHE_PROC_EXIT_TIMEOUT)."
)
break
time.sleep(1)
continue
except Exception as e:
logger.info(f"do_data_transfer: error: {e}, {str(traceback.format_exc())}")