mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Speculative Decoding] Support mtp expert-parallel and support different modality deploy (#7018)
* Support MTP expert-parallel (EP) and different-modality deployment * Fix the default argument
This commit is contained in:
+33
-1
@@ -1354,6 +1354,37 @@ class EarlyStopConfig:
|
||||
argument = self.enable_early_stop
|
||||
|
||||
|
||||
class DeployModality(str, Enum):
    """Modality mode for the serving engine deployment.

    Determines which input modalities the serving engine should handle:

    - TEXT: Text-only deployment. The engine only processes text inputs,
      skipping multimodal preprocessing (e.g., vision encoder, audio
      encoder). This reduces GPU memory usage and startup time when
      multimodal capabilities are not needed.
    - MIXED: Multimodal deployment (default). The engine handles mixed-modality
      inputs including text, images, audio, and video. All modality-specific
      encoders and preprocessing pipelines are initialized at startup.

    Usage:
        --deploy-modality text   # text-only, lower resource footprint
        --deploy-modality mixed  # full multimodal support (default)
    """

    TEXT = "text"
    MIXED = "mixed"

    @classmethod
    def from_str(cls, value: str) -> "DeployModality":
        """Parse a string into a DeployModality enum, with validation.

        Args:
            value: Raw user-supplied modality name. Surrounding whitespace
                is stripped and matching is case-insensitive.

        Returns:
            The matching DeployModality member.

        Raises:
            ValueError: If ``value`` does not name a known modality; the
                message lists the accepted values.
        """
        value = value.strip().lower()
        try:
            return cls(value)
        except ValueError:
            valid = ", ".join(f"'{m.value}'" for m in cls)
            # `from None` suppresses the implicit exception chaining so the
            # user sees only the helpful message, not the internal enum
            # lookup failure ("During handling of the above exception ...").
            raise ValueError(f"Invalid deploy_modality '{value}'. Must be one of: {valid}") from None
|
||||
|
||||
|
||||
class LoadChoices(str, Enum):
|
||||
"""LoadChoices"""
|
||||
|
||||
@@ -1830,6 +1861,7 @@ class FDConfig:
|
||||
tool_parser: str = None,
|
||||
test_mode=False,
|
||||
routing_replay_config: Optional[RoutingReplayConfig] = None,
|
||||
deploy_modality: DeployModality = DeployModality.MIXED,
|
||||
):
|
||||
self.model_config: ModelConfig = model_config # type: ignore
|
||||
self.cache_config: CacheConfig = cache_config # type: ignore
|
||||
@@ -1846,7 +1878,7 @@ class FDConfig:
|
||||
self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
|
||||
self.router_config: RouterConfig = router_config
|
||||
self.routing_replay_config = routing_replay_config
|
||||
|
||||
self.deploy_modality: DeployModality = deploy_modality
|
||||
# Initialize cuda graph capture list
|
||||
max_capture_shape = self.scheduler_config.max_num_seqs
|
||||
if self.speculative_config is not None and self.speculative_config.method in [
|
||||
|
||||
Reference in New Issue
Block a user