mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Optimization][Speculative Decoding]Fuse padding sampling params (#6765)
* optimize speculate pre process unit test * Add CUDA kernel for building sampling params in speculative decoding * init infer seed in device * format code * add unittest & fix * fix * format-code * format-code * fix rebase * . * fix unittest
This commit is contained in:
@@ -198,11 +198,11 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
# Initialize input batch
|
||||
self.share_inputs = InputBatch(self.fd_config)
|
||||
self.share_inputs.init_share_inputs()
|
||||
increment_value = (
|
||||
self.increment_value = (
|
||||
4 if not self.speculative_decoding else (self.speculative_config.num_speculative_tokens + 1) * 4
|
||||
)
|
||||
self.infer_seed_increment = paddle.full(
|
||||
shape=[self.scheduler_config.max_num_seqs, 1], fill_value=increment_value, dtype="int64", device="cpu"
|
||||
shape=[self.scheduler_config.max_num_seqs, 1], fill_value=self.increment_value, dtype="int64", device="cpu"
|
||||
)
|
||||
|
||||
self.restore_chunked_prefill_request = dict()
|
||||
@@ -1667,6 +1667,8 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self.sampling_metadata,
|
||||
self.model_config.max_model_len,
|
||||
self.share_inputs,
|
||||
int(self._real_output_token_num_host),
|
||||
self.increment_value,
|
||||
accept_all_drafts,
|
||||
reject_all_drafts,
|
||||
)
|
||||
@@ -1836,8 +1838,9 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self._dummy_sampler_run(hidden_states, model_output, batch_size, accept_all_drafts, reject_all_drafts)
|
||||
|
||||
# 7. Updata 'infer_seed' and step_cuda()
|
||||
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
|
||||
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
|
||||
if not self.speculative_decoding:
|
||||
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
|
||||
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
|
||||
if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0:
|
||||
break
|
||||
|
||||
@@ -2270,6 +2273,8 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self.sampling_metadata,
|
||||
self.model_config.max_model_len,
|
||||
self.share_inputs,
|
||||
int(self._real_output_token_num_host),
|
||||
self.increment_value,
|
||||
)
|
||||
if self.parallel_config.tensor_parallel_size > 1:
|
||||
paddle.distributed.broadcast(
|
||||
@@ -2369,8 +2374,9 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self.proposer.run(share_inputs=self.share_inputs)
|
||||
|
||||
# 7. Update 'infer_seed' and step_cuda()
|
||||
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
|
||||
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
|
||||
if not self.speculative_decoding:
|
||||
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
|
||||
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
|
||||
if self.speculative_decoding:
|
||||
speculate_schedule_cache(
|
||||
self.share_inputs["draft_tokens"],
|
||||
|
||||
Reference in New Issue
Block a user