[Speculative Decoding] Support MTP expert parallelism and different deployment modalities (#7018)

* Support MTP EP and different deployment modalities
* Fix default arg
2026-04-23 00:17:25 +08:00 · 2026-03-26 13:52:16 +08:00
parent 61ebac49ef
commit 4fd877ed43
10 changed files with 112 additions and 19 deletions
@@ -25,6 +25,7 @@ from fastdeploy import envs
 from fastdeploy.config import (
    CacheConfig,
    ConvertOption,
+    DeployModality,
    EarlyStopConfig,
    EPLBConfig,
    FDConfig,
@@ -551,6 +552,11 @@ class EngineArgs:
    Flag to enable prefill_use_worst_num_tokens. Default is False (disabled).
    """

+    deploy_modality: str = "mixed"
+    """
+    Deployment modality for the serving engine. Options: mixed, text. Default is mixed.
+    """
+
    def __post_init__(self):
        """
        Post-initialization processing to set default tokenizer if not provided.
@@ -1351,6 +1357,14 @@ class EngineArgs:
            help="Enable overlapping schedule.",
        )

+        model_group.add_argument(
+            "--deploy-modality",
+            type=str,
+            choices=["mixed", "text"],
+            default=EngineArgs.deploy_modality,
+            help="Deployment modality. 'mixed' for multimodal (text+image+audio), 'text' for text-only. Default is mixed.",
+        )
+
        return parser

    @classmethod
@@ -1512,4 +1526,5 @@ class EngineArgs:
            plas_attention_config=plas_attention_config,
            early_stop_config=early_stop_cfg,
            routing_replay_config=routing_replay_config,
+            deploy_modality=DeployModality.from_str(self.deploy_modality),
        )