mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Speculative Decoding] Support mtp expert-parallel and support different modality deploy (#7018)
* Support MTP (multi-token prediction) with expert parallelism, and support deploying different modalities * Fix the default argument
This commit is contained in:
@@ -2019,6 +2019,13 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
|
||||
return prefill_done_idxs
|
||||
|
||||
def _execute_empty_mtp_input(self, forward_meta) -> None:
|
||||
"""
|
||||
run ep inference forward with empty input.
|
||||
"""
|
||||
for _ in range(self.fd_config.speculative_config.num_model_steps):
|
||||
self.proposer.model.empty_input_forward(forward_meta)
|
||||
|
||||
def execute_model(
|
||||
self,
|
||||
model_forward_batch: Optional[List[Request]] = None,
|
||||
@@ -2046,6 +2053,12 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
model_inputs, p_done_idxs, _ = self._preprocess(model_forward_batch, num_running_requests)
|
||||
model_output = self._execute(model_inputs)
|
||||
if model_output is None or self.share_inputs["seq_lens_this_time_cpu"].numpy().sum().item() <= 0:
|
||||
if (
|
||||
self.fd_config.speculative_config.method == SpecMethod.MTP
|
||||
and hasattr(self.proposer.model, "empty_input_forward")
|
||||
and self.parallel_config.use_ep
|
||||
):
|
||||
self._execute_empty_mtp_input(self.forward_meta)
|
||||
return
|
||||
model_output_data, sampler_output, post_process_event = self._postprocess(
|
||||
model_output, p_done_idxs, model_forward_batch, num_running_requests
|
||||
|
||||
Reference in New Issue
Block a user