[Speculative Decoding] Support MTP expert parallelism and different deployment modalities (#7018)

* support MTP expert parallelism (EP) and different deployment modalities

* fix default arg
This commit is contained in:
freeliuzc
2026-03-26 13:52:16 +08:00
committed by GitHub
parent 61ebac49ef
commit 4fd877ed43
10 changed files with 112 additions and 19 deletions
+10
View File
@@ -34,6 +34,7 @@ with intercept_paddle_loggers():
from fastdeploy import envs
from fastdeploy.config import (
CacheConfig,
DeployModality,
DeviceConfig,
EarlyStopConfig,
EPLBConfig,
@@ -1118,6 +1119,14 @@ def parse_args():
help="enable to avoid cpu sync",
)
parser.add_argument(
"--deploy_modality",
type=str,
default="mixed",
choices=["mixed", "text"],
help="Deploy modality: 'mixed' for multimodal, 'text' for text-only.",
)
args = parser.parse_args()
return args
@@ -1248,6 +1257,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
structured_outputs_config=structured_outputs_config,
eplb_config=eplb_config,
routing_replay_config=routing_replay_config,
deploy_modality=DeployModality.from_str(getattr(args, "deploy_modality", "mixed")),
)
logger.info(f"parallel_config.local_engine_worker_queue_port {parallel_config.local_engine_worker_queue_port}")