[BugFix][KVCache] Add inter-process lock to fix NaN error under DP+EP (#6724)

* [BugFix] Support to fix NaN bug in EP * Optimize notion for all the functions * Fix potential lock contention failure issues * Update fastdeploy/inter_communicator/ipc_signal.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update envs.py * Update default value for USE_KVCACHE_LOCK Change default value of USE_KVCACHE_LOCK from 1 to 0. * Update worker_process.py * Fix wrong suffix * Update test_prefix_cache_manager.py --------- Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-04-23 00:17:25 +08:00 · 2026-03-10 21:55:32 +08:00
parent 6520ae807c
commit b05a6c4206
7 changed files with 142 additions and 2 deletions
@@ -55,6 +55,7 @@ from fastdeploy.input.preprocess import InputPreprocessor
 from fastdeploy.inter_communicator import (
    EngineCacheQueue,
    EngineWorkerQueue,
+    IPCLock,
    IPCSignal,
    ZmqIpcServer,
    ZmqTcpServer,
@@ -172,6 +173,10 @@ class EngineService:
            )
        self._init_worker_monitor_signals()

+        # Pass the GPU KV cache lock to cache_manager for mutual exclusion
+        # between the CPU transfer process and the worker process.
+        self.resource_manager.cache_manager.gpu_cache_lock = self.gpu_cache_lock
+
        if self.cfg.eplb_config.enable_eplb:
            current_suffix = self.cfg.parallel_config.local_engine_worker_queue_port
            init_eplb_signals(cfg, current_suffix)
@@ -381,6 +386,14 @@ class EngineService:
            create=True,
        )

+        # gpu_cache_lock: file-based lock for mutual exclusion between worker
+        # and CPU transfer when accessing GPU KV cache.
+        self.gpu_cache_lock = IPCLock(
+            name="gpu_cache_lock",
+            suffix=current_suffix,
+            create=True,
+        )
+
    def start_worker_queue_service(self, start_queue):
        """
        start queue service for engine worker communication