[Feature] Support pd ep deployment with yiyan adapter (#4029)

* [Feature] Support mixed deployment with yiyan adapter in release2.2

* fix metrics

* add unit test

* add unit test

* add unit test

* Support pd ep deployment with yiyan adapter

* Support pd ep deployment with yiyan adapter

* refactor cache messager

* support scheduler v1 in PD

* suppport pd v1 + chunk prefill

* suppport pd v1 + chunk prefill

* add eplb

* support eplb

* support eplb

* support eplb

* support v1

* fix

* fix

* fix bug

* remove eplb support

* support prefix cache in P

* fix bug

* fix bug

* support one stop in V1

* fix bug

* fix ci

* fix ci

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
chenjian
2025-09-22 16:41:38 +08:00
committed by GitHub
parent 9845f0d010
commit 918ccdb123
22 changed files with 1838 additions and 343 deletions
+64 -1
View File
@@ -18,6 +18,7 @@ import redis
from fastdeploy.utils import llm_logger
from .dp_scheduler import DPScheduler
from .global_scheduler import GlobalScheduler
from .local_scheduler import LocalScheduler
from .splitwise_scheduler import SplitWiseScheduler, SplitWiseSchedulerConfig
@@ -89,6 +90,54 @@ class LocalSchedulerConfig:
llm_logger.info("=============================================================")
class DPLocalSchedulerConfig(LocalSchedulerConfig):
"""
Configuration class for DPLocalScheduler.
Attributes:
max_size: Maximum number of concurrent requests (-1 for unlimited)
ttl: Time-to-live in seconds for request expiration
"""
def __init__(
self,
max_size: int = -1,
ttl: int = 900,
max_model_len: int = 8192,
enable_chunked_prefill: bool = False,
max_num_partial_prefills: int = 1,
max_long_partial_prefills: int = 1,
long_prefill_token_threshold: int = 0,
splitwise_role: str = "prefill",
**kwargs,
):
"""
Initialize LocalScheduler configuration.
Args:
max_size: Maximum concurrent requests (-1 for unlimited, 0 for disabled)
ttl: Time-to-live in seconds for request expiration (default 900s)
max_model_len: Maximum model context length in tokens
enable_chunked_prefill: Whether to enable chunked prefill processing
max_num_partial_prefills: Max partial prefill operations allowed
max_long_partial_prefills: Max long-running partial prefill ops
long_prefill_token_threshold: Token count threshold for long prefill
**kwargs: Additional unused arguments (for forward compatibility)
Note:
- If long_prefill_token_threshold is 0, it's auto-calculated as 4% of max_model_len
- See LocalScheduler class for implementation details
"""
self.max_size = max_size
self.ttl = ttl
self.max_model_len = max_model_len
self.enable_chunked_prefill = enable_chunked_prefill
self.max_num_partial_prefills = max_num_partial_prefills
self.max_long_partial_prefills = max_long_partial_prefills
self.long_prefill_token_threshold = long_prefill_token_threshold
if self.long_prefill_token_threshold == 0:
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
self.splitwise_role = splitwise_role
class GlobalSchedulerConfig:
"""
Configuration class for GlobalScheduler (Redis-based).
@@ -235,6 +284,9 @@ class SchedulerConfig:
if self.name == "splitwise":
self.config = SplitWiseSchedulerConfig(**args)
if self.name == "dp":
self.config = DPLocalSchedulerConfig(**args)
def check(self):
"""
Validate the configuration.
@@ -242,7 +294,7 @@ class SchedulerConfig:
Raises:
Exception: If invalid scheduler type is specified
"""
if self.name not in ["local", "global", "splitwise"]:
if self.name not in ["local", "global", "splitwise", "dp"]:
raise Exception(f"Unknown scheduler type {self.name}")
self.config.check()
@@ -280,6 +332,17 @@ class SchedulerConfig:
if self.name == "splitwise":
return SplitWiseScheduler(self.config)
if self.name == "dp":
return DPScheduler(
max_size=self.config.max_size,
ttl=self.config.ttl,
enable_chunked_prefill=self.config.enable_chunked_prefill,
max_num_partial_prefills=self.config.max_num_partial_prefills,
max_long_partial_prefills=self.config.max_long_partial_prefills,
long_prefill_token_threshold=self.config.long_prefill_token_threshold,
splitwise_role=self.config.splitwise_role,
)
return LocalScheduler(
max_size=self.config.max_size,
ttl=self.config.ttl,