mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[BugFix][KVCache] Add inter-process lock to fix NaN error under DP+EP (#6724)
* [BugFix] Fix NaN bug in EP * Optimize notation for all the functions * Fix potential lock contention failure issues * Update fastdeploy/inter_communicator/ipc_signal.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update envs.py * Update default value for USE_KVCACHE_LOCK Change default value of USE_KVCACHE_LOCK from 1 to 0. * Update worker_process.py * Fix wrong suffix * Update test_prefix_cache_manager.py --------- Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -59,6 +59,7 @@ from fastdeploy.eplb.experts_manager import RedundantExpertManager
|
||||
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
|
||||
from fastdeploy.inter_communicator import (
|
||||
ExistTaskStatus,
|
||||
IPCLock,
|
||||
IPCSignal,
|
||||
ModelWeightsStatus,
|
||||
RearrangeExpertStatus,
|
||||
@@ -284,6 +285,14 @@ class PaddleDisWorkerProc:
|
||||
create=False,
|
||||
)
|
||||
|
||||
# gpu_cache_lock: file-based lock for mutual exclusion between worker
|
||||
# and CPU transfer when accessing GPU KV cache.
|
||||
self.gpu_cache_lock = IPCLock(
|
||||
name="gpu_cache_lock",
|
||||
suffix=self.parallel_config.local_engine_worker_queue_port,
|
||||
create=False,
|
||||
)
|
||||
|
||||
def update_weights_from_tensor(self, mmap_infos):
|
||||
"""
|
||||
update_weights_from_tensor
|
||||
@@ -426,6 +435,35 @@ class PaddleDisWorkerProc:
|
||||
self.rearrange_experts_signal.value[0] = RearrangeExpertStatus.DONE.value
|
||||
logger.info("redundant_expert: done")
|
||||
|
||||
def _acquire_kvcache_lock(self, tp_rank):
    """Take the inter-process GPU KV-cache lock before model execution.

    The lock (``self.gpu_cache_lock``, an IPCLock) serializes access to
    the GPU KV cache between this worker and the CPU transfer process.
    Only tensor-parallel rank 0 takes the lock so that the other TP
    workers cannot deadlock against each other. Locking is a no-op when
    the ``FD_USE_KVCACHE_LOCK`` environment switch is disabled.

    Args:
        tp_rank: Tensor-parallel rank of this worker; only rank 0
            actually acquires the lock.
    """
    # Single combined guard: feature flag enabled AND this is rank 0.
    if envs.FD_USE_KVCACHE_LOCK and tp_rank == 0:
        self.gpu_cache_lock.acquire()
|
||||
|
||||
def _release_kvcache_lock(self, tp_rank):
    """Release the inter-process GPU KV-cache lock after model execution.

    Mirror of ``_acquire_kvcache_lock``: a no-op unless the
    ``FD_USE_KVCACHE_LOCK`` switch is enabled, and only tensor-parallel
    rank 0 (the rank that acquired the lock) releases it.

    Args:
        tp_rank: Tensor-parallel rank of this worker; only rank 0
            actually releases the lock.
    """
    # Single combined guard: feature flag enabled AND this is rank 0.
    if envs.FD_USE_KVCACHE_LOCK and tp_rank == 0:
        self.gpu_cache_lock.release()
|
||||
|
||||
def event_loop_normal(self) -> None:
|
||||
"""Main event loop for Paddle Distributed Workers.
|
||||
TODO(gongshaotian): support remote calling of functions that control worker.
|
||||
@@ -572,7 +610,11 @@ class PaddleDisWorkerProc:
|
||||
# Execute model to generate token. The generated token will be written to the buffer.
|
||||
# These generated tokens can be obtained through get_output op.
|
||||
start_execute_time = time.time()
|
||||
|
||||
self._acquire_kvcache_lock(tp_rank)
|
||||
self.worker.execute_model(req_dicts, max_occupied_batch_index)
|
||||
self._release_kvcache_lock(tp_rank)
|
||||
|
||||
# Only v0 use this signal
|
||||
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||
self.exist_prefill_task_signal.value[0] = self.worker.exist_prefill()
|
||||
|
||||
Reference in New Issue
Block a user