[PD Disaggregation] Add unit test for splitwise deployment using RDMA (#5189)
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled

* Add splitwise deployment using RDMA
* clean cuda
This commit is contained in:
Juncai
2025-11-27 14:27:17 +08:00
committed by GitHub
parent 373b5c3807
commit ce9a49f6bf
9 changed files with 723 additions and 139 deletions
+6 -6
View File
@@ -30,7 +30,7 @@ from utils.serving_utils import (
FD_CACHE_QUEUE_PORT,
FD_ENGINE_QUEUE_PORT,
FD_METRICS_PORT,
clean_ports,
clean,
get_registered_number,
)
@@ -64,7 +64,7 @@ def setup_and_run_server():
- Tears down server after all tests finish
"""
print("Pre-test port cleanup...")
clean_ports(PORTS_TO_CLEAN)
clean(PORTS_TO_CLEAN)
print("log dir clean ")
if os.path.exists("log_router") and os.path.isdir("log_router"):
@@ -111,7 +111,7 @@ def setup_and_run_server():
env_prefill["CUDA_VISIBLE_DEVICES"] = "0"
env_prefill["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"
env_prefill["FD_LOG_DIR"] = "log_prefill"
prefill_log_path = "server.log"
prefill_log_path = "server_prefill.log"
prefill_cmd = [
sys.executable,
"-m",
@@ -161,7 +161,7 @@ def setup_and_run_server():
env_decode["CUDA_VISIBLE_DEVICES"] = "1"
env_decode["ENABLE_V1_KVCACHE_SCHEDULER"] = "0"
env_decode["FD_LOG_DIR"] = "log_decode"
decode_log_path = "decode_server.log"
decode_log_path = "server_decode.log"
decode_cmd = [
sys.executable,
"-m",
@@ -216,7 +216,7 @@ def setup_and_run_server():
try:
os.killpg(process_prefill.pid, signal.SIGTERM)
os.killpg(process_decode.pid, signal.SIGTERM)
clean_ports()
clean()
except Exception as e:
print(f"Failed to kill process group: {e}")
raise RuntimeError(f"API server did not start on port {FD_API_PORT}")
@@ -228,7 +228,7 @@ def setup_and_run_server():
os.killpg(process_router.pid, signal.SIGTERM)
os.killpg(process_prefill.pid, signal.SIGTERM)
os.killpg(process_decode.pid, signal.SIGTERM)
clean_ports(PORTS_TO_CLEAN)
clean(PORTS_TO_CLEAN)
print(f"Prefill server (pid={process_prefill.pid}) terminated")
print(f"Decode server (pid={process_decode.pid}) terminated")
except Exception as e: