[Speculative Decoding] Auto-scale CUDA graph capture sizes for speculative decoding (#7215)

This commit is contained in:
GoldPancake
2026-04-07 20:22:28 +08:00
committed by GitHub
parent 446b26bbc0
commit 9d4fd19c3f
2 changed files with 54 additions and 43 deletions
+1 -2
View File
@@ -1893,8 +1893,7 @@ class GPUModelRunner(ModelRunnerBase):
             logger.info(
                 f"Warm up the model with the num_tokens:{num_tokens}, expected_decode_len:{expected_decode_len}"
             )
-        elif self.speculative_decoding and self.spec_method == SpecMethod.MTP:
-            # Capture Target Model without bsz 1
+        elif self.speculative_decoding and self.spec_method in [SpecMethod.MTP, SpecMethod.SUFFIX]:
             for capture_size in sorted(capture_sizes, reverse=True):
                 expected_decode_len = (self.speculative_config.num_speculative_tokens + 1) * 2
                 self._dummy_run(