[Optimization][Speculative Decoding] Fuse padding sampling params (#6765)

* optimize speculate pre process unit test

* Add CUDA kernel for building sampling params in speculative decoding

* init infer seed on device

* format code

* add unittest & fix

* fix

* format-code

* format-code

* fix rebase

* .

* fix unittest
This commit is contained in:
huicongyao
2026-03-12 20:05:15 +08:00
committed by GitHub
parent a9ace998db
commit 2e63d88f7a
8 changed files with 389 additions and 11 deletions
+1 -1
View File
@@ -171,7 +171,7 @@ class InputBatch:
self.need_block_list = paddle.full([max_num_seqs], -1, dtype="int32")
self.need_block_len = paddle.full([1], 0, dtype="int32")
self.used_list_len = paddle.full([max_num_seqs], 0, dtype="int32")
self.infer_seed = paddle.full([max_num_seqs, 1], 0, dtype="int64", device="cpu")
self.infer_seed = paddle.full([max_num_seqs, 1], 0, dtype="int64")
self.first_token_ids = paddle.full([max_num_seqs, 1], -1, dtype="int64")
self.ori_seq_lens_encoder = paddle.full([max_num_seqs, 1], 0, dtype="int32")
self.system_lens = paddle.full([max_num_seqs, 1], 0, dtype="int32")