[Speculative Decoding] Support MTP expert-parallel and deployment of different modalities (#7018)

* support MTP expert-parallel (EP) and deployment of different modalities

* fix default arg
This commit is contained in:
freeliuzc
2026-03-26 13:52:16 +08:00
committed by GitHub
parent 61ebac49ef
commit 4fd877ed43
10 changed files with 112 additions and 19 deletions
+13
View File
@@ -2019,6 +2019,13 @@ class GPUModelRunner(ModelRunnerBase):
return prefill_done_idxs
def _execute_empty_mtp_input(self, forward_meta) -> None:
"""
run ep inference forward with empty input.
"""
for _ in range(self.fd_config.speculative_config.num_model_steps):
self.proposer.model.empty_input_forward(forward_meta)
def execute_model(
self,
model_forward_batch: Optional[List[Request]] = None,
@@ -2046,6 +2053,12 @@ class GPUModelRunner(ModelRunnerBase):
model_inputs, p_done_idxs, _ = self._preprocess(model_forward_batch, num_running_requests)
model_output = self._execute(model_inputs)
if model_output is None or self.share_inputs["seq_lens_this_time_cpu"].numpy().sum().item() <= 0:
if (
self.fd_config.speculative_config.method == SpecMethod.MTP
and hasattr(self.proposer.model, "empty_input_forward")
and self.parallel_config.use_ep
):
self._execute_empty_mtp_input(self.forward_meta)
return
model_output_data, sampler_output, post_process_event = self._postprocess(
model_output, p_done_idxs, model_forward_batch, num_running_requests