PD deployment support without router (#7412)

This commit is contained in:
jc
2026-04-15 20:13:07 +08:00
committed by GitHub
parent a498720a75
commit e53f5184ac
6 changed files with 472 additions and 126 deletions
+6 -7
View File
@@ -2010,13 +2010,13 @@ class FDConfig:
and self.router_config
and self.router_config.router
):
# For RL scenario: version.yaml will be required for models in future releases.
# For RL scenario, version.yaml is required for models
# Temporarily require the router to be enabled.
self.model_config.read_model_version()
self.read_from_config()
self.postprocess()
self.init_cache_info()
self.init_pd_info()
if test_mode:
return
self.check()
@@ -2371,18 +2371,17 @@ class FDConfig:
logger.info("{:<20}:{:<6}{}".format(k, "", v))
logger.info("=============================================================")
def init_cache_info(self):
def init_pd_info(self):
"""
initialize cache info
initialize info for pd deployment
"""
# TODO: group the splitwise params
# There are two methods for splitwise deployment:
# 1. v0 splitwise_scheduler or dp_scheduler
# 2. v1 local_scheduler + router
# 2. v1 local_scheduler + router (optional)
self.splitwise_version = None
if self.scheduler_config.name in ("splitwise", "dp"):
self.splitwise_version = "v0"
elif self.scheduler_config.name == "local" and self.router_config and self.router_config.router:
elif self.scheduler_config.name == "local":
self.splitwise_version = "v1"
# the information for registering this server to router or splitwise_scheduler
+8 -3
View File
@@ -600,10 +600,15 @@ class EngineArgs:
raise NotImplementedError("Only ENABLE_V1_KVCACHE_SCHEDULER=1 support max_logprobs=-1")
if self.splitwise_role != "mixed":
if self.scheduler_name == "local" and self.router is None:
if self.scheduler_name == "splitwise":
raise ValueError(
f"When using {self.splitwise_role} role and the {self.scheduler_name} "
f"scheduler, please provide --router argument."
"Setting scheduler_name as splitwise is not supported in pd deployment, "
"please use router as scheduler."
)
if self.scheduler_name == "local" and self.router is None:
console_logger.warning(
f"Running {self.splitwise_role} role with {self.scheduler_name} "
f"scheduler without --router. Router registration and request routing will be disabled."
)
if not (
+2 -2
View File
@@ -109,7 +109,7 @@ class ExpertService:
if envs.FD_ENABLE_RETURN_TEXT:
self.engine.create_data_processor()
if self.cfg.scheduler_config.name == "dp":
self.cfg.init_cache_info()
self.cfg.init_pd_info()
self.engine.scheduler.start(local_data_parallel_id)
if ipc_signal_suffix is not None:
@@ -122,7 +122,7 @@ class ExpertService:
self.llm_logger.info(f"start expert service {local_data_parallel_id}")
if self.cfg.scheduler_config.name == "splitwise":
self.cfg.init_cache_info()
self.cfg.init_pd_info()
role = self.cfg.scheduler_config.splitwise_role
host_ip = self.cfg.host_ip
self.engine.scheduler.start(role, host_ip, self.cfg.register_info)