[Speculative Decoding] Fix multistep MTP in splitwise-prefill mode (#5723)

This commit is contained in:
freeliuzc
2025-12-24 18:45:54 +08:00
committed by GitHub
parent e75f93d302
commit 2dc2ba49b5
+6
View File
@@ -1789,6 +1789,12 @@ class FDConfig:
# It will hang when real batch_size < tp_size
self.graph_opt_config.filter_capture_size(tp_size=self.parallel_config.tensor_parallel_size)
# adjust speculative config
if self.speculative_config is not None and self.speculative_config.method == "mtp":
    # NOTE(review): on a dedicated splitwise "prefill" node, MTP is forced to
    # single-token / single-step speculation — per the commit title this fixes
    # multistep MTP in splitwise-prefill mode (multistep drafting presumably
    # only applies during decode; confirm against the decode-side config).
    if self.scheduler_config.splitwise_role == "prefill":
        self.speculative_config.num_speculative_tokens = 1
        self.speculative_config.num_model_steps = 1
# In "mixed" role the same instance serves prefill and decode: sequence-parallel
# MoE is disabled (helper message tag "Mixed") and the MoE phase starts at "prefill".
if self.scheduler_config.splitwise_role == "mixed":
    self._disable_sequence_parallel_moe_if_needed("Mixed")
    self.model_config.moe_phase = MoEPhase(phase="prefill")