[Speculative Decoding] Auto-scale CUDA graph capture sizes for speculative decoding (#7215)

2026-04-23 00:17:25 +08:00 · 2026-04-07 20:22:28 +08:00
parent 446b26bbc0
commit 9d4fd19c3f
2 changed files with 54 additions and 43 deletions
@@ -1893,8 +1893,7 @@ class GPUModelRunner(ModelRunnerBase):
                    logger.info(
                        f"Warm up the model with the num_tokens:{num_tokens}, expected_decode_len:{expected_decode_len}"
                    )
-            elif self.speculative_decoding and self.spec_method == SpecMethod.MTP:
-                # Capture Target Model without bsz 1
+            elif self.speculative_decoding and self.spec_method in [SpecMethod.MTP, SpecMethod.SUFFIX]:
                for capture_size in sorted(capture_sizes, reverse=True):
                    expected_decode_len = (self.speculative_config.num_speculative_tokens + 1) * 2
                    self._dummy_run(