[RL] add version to the key of cache storage && refine error raising (#6160)

* Waiting for cache transfer manager inited

* up

* up

* up

* up

* up

* fix according to comments

* fix unittest

* fix

* fix unittest

* fix error

* pass storage_backend to worker
This commit is contained in:
jc
2026-01-27 10:47:46 +08:00
committed by GitHub
parent 7ffa88bb01
commit b1698a79cb
8 changed files with 121 additions and 107 deletions
@@ -248,6 +248,14 @@ class PrefixCacheManager:
suffix=engine_worker_queue_port,
create=False,
)
cache_transfer_inited_signal_data = np.zeros(shape=[tensor_parallel_size], dtype=np.int32)
self.cache_transfer_inited_signal = IPCSignal(
name="cache_transfer_inited_signal",
array=cache_transfer_inited_signal_data,
dtype=np.int32,
suffix=engine_worker_queue_port,
create=False,
)
# Run command to launch cache transfer managers
log_dir = envs.FD_LOG_DIR
@@ -262,9 +270,9 @@ class PrefixCacheManager:
val_shape_str = str(val_cache_shape)
val_cache_arg_str = f" --value_cache_shape {val_shape_str}"
if cache_config.kvcache_storage_backend:
kvcache_storage_backend_str = cache_config.kvcache_storage_backend
storage_arg_str = f" --kvcache_storage_backend {cache_config.kvcache_storage_backend}"
else:
kvcache_storage_backend_str = "none"
storage_arg_str = " "
if self.cache_config.swap_space or self.cache_config.kvcache_storage_backend:
for i in range(tensor_parallel_size):
@@ -274,7 +282,6 @@ class PrefixCacheManager:
+ " NCCL_MAX_NCHANNELS=1 NCCL_BUFFSIZE=0"
+ f" FD_ENABLE_SWAP_SPACE_CLEARING={envs.FD_ENABLE_SWAP_SPACE_CLEARING}"
+ f" {sys.executable} {py_path}"
+ f" --model_id {os.path.basename(self.config.model_config.model)}"
+ f" --device_id {int(device_ids[i])}"
+ f" --rank {i}"
+ f" --splitwise_role {self.splitwise_role}"
@@ -295,14 +302,19 @@ class PrefixCacheManager:
+ f" --speculative_config '{self.speculative_config.to_json_string()}'"
+ f" --default_dtype '{self.config.model_config.dtype}'"
+ (" --create_cache_tensor" if not self.enable_splitwise else "")
+ f" --kvcache_storage_backend {kvcache_storage_backend_str}"
+ storage_arg_str
+ f" --write_policy {cache_config.write_policy}"
+ f" --max_model_len {self.config.model_config.max_model_len}"
+ f" --model_path {self.config.model_config.model}"
+ f" >{log_dir}/launch_cache_transfer_manager_{int(device_ids[i])}.log 2>&1"
)
logger.info(f"Launch cache transfer manager, command:{launch_cmd}")
cache_manager_processes.append(subprocess.Popen(launch_cmd, shell=True, preexec_fn=os.setsid))
logger.info("PrefixCacheManager is waiting for cache transfer manager to be initialized.")
while np.sum(self.cache_transfer_inited_signal.value) != tensor_parallel_size:
time.sleep(1)
logger.info("PrefixCacheManager is waiting for kv cache to be initialized.")
while np.sum(self.cache_ready_signal.value) != tensor_parallel_size:
time.sleep(1)