[BugFix][KVCache] Add inter-process lock to fix NaN error under DP+EP (#6724)

* [BugFix] Add support to fix NaN bug in EP

* Optimize notation for all the functions

* Fix potential lock contention failure issues

* Update fastdeploy/inter_communicator/ipc_signal.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update envs.py

* Update default value for USE_KVCACHE_LOCK

Change default value of USE_KVCACHE_LOCK from 1 to 0.

* Update worker_process.py

* Fix wrong suffix

* Update test_prefix_cache_manager.py

---------

Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Jiang-Jia-Jun
2026-03-10 21:55:32 +08:00
committed by GitHub
parent 6520ae807c
commit b05a6c4206
7 changed files with 142 additions and 2 deletions
+42
View File
@@ -59,6 +59,7 @@ from fastdeploy.eplb.experts_manager import RedundantExpertManager
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
from fastdeploy.inter_communicator import (
ExistTaskStatus,
IPCLock,
IPCSignal,
ModelWeightsStatus,
RearrangeExpertStatus,
@@ -284,6 +285,14 @@ class PaddleDisWorkerProc:
create=False,
)
# gpu_cache_lock: file-based lock for mutual exclusion between worker
# and CPU transfer when accessing GPU KV cache.
self.gpu_cache_lock = IPCLock(
name="gpu_cache_lock",
suffix=self.parallel_config.local_engine_worker_queue_port,
create=False,
)
def update_weights_from_tensor(self, mmap_infos):
"""
update_weights_from_tensor
@@ -426,6 +435,35 @@ class PaddleDisWorkerProc:
self.rearrange_experts_signal.value[0] = RearrangeExpertStatus.DONE.value
logger.info("redundant_expert: done")
def _acquire_kvcache_lock(self, tp_rank):
    """Grab the GPU KV cache lock before model execution.

    The lock is file-based (fcntl.flock) and serializes access to the GPU
    KV cache between this worker and the CPU transfer process. To avoid
    deadlocking the tensor-parallel group, only rank 0 takes the lock; the
    whole feature is a no-op unless ``FD_USE_KVCACHE_LOCK`` is enabled.

    Args:
        tp_rank: Tensor parallel rank of the current worker. Only rank 0
            acquires the lock.
    """
    # Combined guard: feature flag must be on AND we must be TP rank 0.
    if envs.FD_USE_KVCACHE_LOCK and tp_rank == 0:
        self.gpu_cache_lock.acquire()
def _release_kvcache_lock(self, tp_rank):
    """Give back the GPU KV cache lock after model execution.

    Mirrors ``_acquire_kvcache_lock``: a no-op when ``FD_USE_KVCACHE_LOCK``
    is disabled, and only TP rank 0 (the rank that acquired) releases.

    Args:
        tp_rank: Tensor parallel rank of the current worker. Only rank 0
            releases the lock.
    """
    # Combined guard: feature flag must be on AND we must be TP rank 0.
    if envs.FD_USE_KVCACHE_LOCK and tp_rank == 0:
        self.gpu_cache_lock.release()
def event_loop_normal(self) -> None:
"""Main event loop for Paddle Distributed Workers.
TODO(gongshaotian): support remote calling of functions that control worker.
@@ -572,7 +610,11 @@ class PaddleDisWorkerProc:
# Execute model to generate token. The generated token will be written to the buffer.
# These generated tokens can be obtained through get_output op.
start_execute_time = time.time()
self._acquire_kvcache_lock(tp_rank)
self.worker.execute_model(req_dicts, max_occupied_batch_index)
self._release_kvcache_lock(tp_rank)
# Only v0 use this signal
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.exist_prefill_task_signal.value[0] = self.worker.exist_prefill()