mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Speculative Decoding] Auto-scale CUDA graph capture sizes for speculative decoding (#7215)
This commit is contained in:
@@ -1893,8 +1893,7 @@ class GPUModelRunner(ModelRunnerBase):
 logger.info(
 f"Warm up the model with the num_tokens:{num_tokens}, expected_decode_len:{expected_decode_len}"
 )
-elif self.speculative_decoding and self.spec_method == SpecMethod.MTP:
-# Capture Target Model without bsz 1
+elif self.speculative_decoding and self.spec_method in [SpecMethod.MTP, SpecMethod.SUFFIX]:
 for capture_size in sorted(capture_sizes, reverse=True):
 expected_decode_len = (self.speculative_config.num_speculative_tokens + 1) * 2
 self._dummy_run(
Reference in New Issue
Block a user