mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Optimization][Speculative Decoding]Fuse padding sampling params (#6765)
* optimize speculate pre process unit test
* Add CUDA kernel for building sampling params in speculative decoding
* init infer seed in device
* format code
* add unittest & fix
* fix
* format-code
* format-code
* fix rebase
* .
* fix unittest
This commit is contained in:
@@ -171,7 +171,7 @@ class InputBatch:
|
||||
self.need_block_list = paddle.full([max_num_seqs], -1, dtype="int32")
|
||||
self.need_block_len = paddle.full([1], 0, dtype="int32")
|
||||
self.used_list_len = paddle.full([max_num_seqs], 0, dtype="int32")
|
||||
- self.infer_seed = paddle.full([max_num_seqs, 1], 0, dtype="int64", device="cpu")
|
||||
+ self.infer_seed = paddle.full([max_num_seqs, 1], 0, dtype="int64")
|
||||
self.first_token_ids = paddle.full([max_num_seqs, 1], -1, dtype="int64")
|
||||
self.ori_seq_lens_encoder = paddle.full([max_num_seqs, 1], 0, dtype="int32")
|
||||
self.system_lens = paddle.full([max_num_seqs, 1], 0, dtype="int32")
|
||||
|
||||
Reference in New Issue
Block a user