[Feature] [PD] add simple router and refine splitwise deployment (#4709)

* add simple router and refine splitwise deployment

* fix
This commit is contained in:
Juncai
2025-11-06 14:56:02 +08:00
committed by GitHub
parent 831266da7a
commit 08ca0f6aea
39 changed files with 2397 additions and 171 deletions
+5 -2
View File
@@ -344,6 +344,8 @@ class CacheMessager:
)
item["layer_idx"] = current_layer_idx
if item["layer_idx"] == self.num_layers:
if "error" not in item["status"]:
item["status"] = "finished"
if item["transfer_protocol"] == "ipc":
self.messager["ipc"].write_block_by_sync(target_id)
logger.info(f"finish write cache {item['request_id']}")
@@ -359,7 +361,7 @@ class CacheMessager:
def _handle_connect_task(self):
while True:
try:
task = self.engine_worker_queue.get_connect_rdma_task()
task, _ = self.engine_worker_queue.get_connect_rdma_task()
if task is None:
time.sleep(0.001)
continue
@@ -376,7 +378,8 @@ class CacheMessager:
self.engine_worker_queue.connect_task_response_barrier.wait()
self.engine_worker_queue.put_connect_rdma_task_response(response)
except Exception as e:
logger.error(f"handle_connect_task has exception: {e}")
time.sleep(0.001)
logger.error(f"handle_connect_task has exception: {e}, {str(traceback.format_exc())}")
class CacheMessagerV1: