mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[BugFix][KVCache] Add inter-process lock to fix NaN error under DP+EP (#6724)
* [BugFix] Fix NaN bug in EP * Optimize notation for all the functions * Fix potential lock contention failure issues * Update fastdeploy/inter_communicator/ipc_signal.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update envs.py * Update default value for USE_KVCACHE_LOCK Change default value of USE_KVCACHE_LOCK from 1 to 0. * Update worker_process.py * Fix wrong suffix * Update test_prefix_cache_manager.py --------- Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -59,6 +59,7 @@ from fastdeploy.eplb.experts_manager import RedundantExpertManager
|
||||
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
|
||||
from fastdeploy.inter_communicator import (
|
||||
ExistTaskStatus,
|
||||
IPCLock,
|
||||
IPCSignal,
|
||||
ModelWeightsStatus,
|
||||
RearrangeExpertStatus,
|
||||
@@ -284,6 +285,14 @@ class PaddleDisWorkerProc:
|
||||
create=False,
|
||||
)
|
||||
|
||||
# gpu_cache_lock: file-based lock for mutual exclusion between worker
|
||||
# and CPU transfer when accessing GPU KV cache.
|
||||
self.gpu_cache_lock = IPCLock(
|
||||
name="gpu_cache_lock",
|
||||
suffix=self.parallel_config.local_engine_worker_queue_port,
|
||||
create=False,
|
||||
)
|
||||
|
||||
def update_weights_from_tensor(self, mmap_infos):
|
||||
"""
|
||||
update_weights_from_tensor
|
||||
@@ -426,6 +435,35 @@ class PaddleDisWorkerProc:
|
||||
self.rearrange_experts_signal.value[0] = RearrangeExpertStatus.DONE.value
|
||||
logger.info("redundant_expert: done")
|
||||
|
||||
def _acquire_kvcache_lock(self, tp_rank):
    """Take the inter-process GPU KV-cache lock before model execution.

    The lock (``self.gpu_cache_lock``, an IPCLock) serializes access to
    the GPU KV cache between this worker and the CPU transfer process.
    Only tensor-parallel rank 0 takes the lock so that the other TP
    workers cannot deadlock against each other. Locking is a no-op when
    the ``FD_USE_KVCACHE_LOCK`` environment switch is disabled.

    Args:
        tp_rank: Tensor-parallel rank of this worker; only rank 0
            actually acquires the lock.
    """
    # Single combined guard: feature flag enabled AND this is rank 0.
    if envs.FD_USE_KVCACHE_LOCK and tp_rank == 0:
        self.gpu_cache_lock.acquire()
|
||||
|
||||
def _release_kvcache_lock(self, tp_rank):
    """Release the inter-process GPU KV-cache lock after model execution.

    Mirror of ``_acquire_kvcache_lock``: a no-op unless the
    ``FD_USE_KVCACHE_LOCK`` switch is enabled, and only tensor-parallel
    rank 0 (the rank that acquired the lock) releases it.

    Args:
        tp_rank: Tensor-parallel rank of this worker; only rank 0
            actually releases the lock.
    """
    # Single combined guard: feature flag enabled AND this is rank 0.
    if envs.FD_USE_KVCACHE_LOCK and tp_rank == 0:
        self.gpu_cache_lock.release()
|
||||
|
||||
def event_loop_normal(self) -> None:
|
||||
"""Main event loop for Paddle Distributed Workers.
|
||||
TODO(gongshaotian): support remote calling of functions that control worker.
|
||||
@@ -572,7 +610,11 @@ class PaddleDisWorkerProc:
|
||||
# Execute model to generate token. The generated token will be written to the buffer.
|
||||
# These generated tokens can be obtained through get_output op.
|
||||
start_execute_time = time.time()
|
||||
|
||||
self._acquire_kvcache_lock(tp_rank)
|
||||
self.worker.execute_model(req_dicts, max_occupied_batch_index)
|
||||
self._release_kvcache_lock(tp_rank)
|
||||
|
||||
# Only v0 use this signal
|
||||
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||
self.exist_prefill_task_signal.value[0] = self.worker.exist_prefill()
|
||||
|
||||
Reference in New Issue
Block a user