[Speculative Decoding] Support MTP expert parallelism and different deployment modalities (#7018)

* Support MTP expert parallelism (EP) and different deployment modalities

* fix default arg
This commit is contained in:
freeliuzc
2026-03-26 13:52:16 +08:00
committed by GitHub
parent 61ebac49ef
commit 4fd877ed43
10 changed files with 112 additions and 19 deletions
+33 -1
View File
@@ -1354,6 +1354,37 @@ class EarlyStopConfig:
argument = self.enable_early_stop
class DeployModality(str, Enum):
    """Modality mode for the serving engine deployment.

    Determines which input modalities the serving engine should handle:

    - TEXT: Text-only deployment. The engine only processes text inputs,
      skipping multimodal preprocessing (e.g., vision encoder, audio
      encoder). This reduces GPU memory usage and startup time when
      multimodal capabilities are not needed.
    - MIXED: Multimodal deployment (default). The engine handles
      mixed-modality inputs including text, images, audio, and video. All
      modality-specific encoders and preprocessing pipelines are
      initialized at startup.

    Usage:
        --deploy-modality text   # text-only, lower resource footprint
        --deploy-modality mixed  # full multimodal support (default)
    """

    TEXT = "text"
    MIXED = "mixed"

    @classmethod
    def from_str(cls, value: str) -> "DeployModality":
        """Parse a string into a DeployModality member, with validation.

        The input is stripped of surrounding whitespace and lower-cased
        before lookup, so ``" TEXT "`` and ``"Mixed"`` are both accepted.

        Args:
            value: Modality name to parse.

        Returns:
            The enum member whose value equals the normalized input.

        Raises:
            ValueError: If the normalized input matches no member. The
                message lists every valid value.
        """
        normalized = value.strip().lower()
        try:
            return cls(normalized)
        except ValueError:
            valid = ", ".join(f"'{member.value}'" for member in cls)
            # The enum's own lookup error carries no extra information, so
            # suppress it and surface only the actionable message below.
            raise ValueError(f"Invalid deploy_modality '{normalized}'. Must be one of: {valid}") from None
class LoadChoices(str, Enum):
"""LoadChoices"""
@@ -1830,6 +1861,7 @@ class FDConfig:
tool_parser: str = None,
test_mode=False,
routing_replay_config: Optional[RoutingReplayConfig] = None,
deploy_modality: DeployModality = DeployModality.MIXED,
):
self.model_config: ModelConfig = model_config # type: ignore
self.cache_config: CacheConfig = cache_config # type: ignore
@@ -1846,7 +1878,7 @@ class FDConfig:
self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
self.router_config: RouterConfig = router_config
self.routing_replay_config = routing_replay_config
self.deploy_modality: DeployModality = deploy_modality
# Initialize cuda graph capture list
max_capture_shape = self.scheduler_config.max_num_seqs
if self.speculative_config is not None and self.speculative_config.method in [