Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2026-04-23 00:17:25 +08:00
[XPU] Speculate Decoding + PD, benchmark fix (#6036)

* fix mtp pd
* fix kernel
* fix code style
* fix kernel
* fix test / clear debug code
* fix test / clear debug code
* fix codestyle
* fix codestyle
* fix codestyle
This commit is contained in:
@@ -52,6 +52,7 @@ from fastdeploy.model_executor.ops.xpu import (
|
||||
recover_decode_task,
|
||||
set_data_ipc,
|
||||
share_external_data,
|
||||
speculate_schedule_cache,
|
||||
)
|
||||
from fastdeploy.model_executor.xpu_pre_and_post_process import (
|
||||
step_xpu,
|
||||
@@ -1385,6 +1386,7 @@ class XPUModelRunner(ModelRunnerBase):
|
||||
# 0. set debug level
|
||||
# self._set_debug_level(0x1, model_forward_batch, is_dummy_run)
|
||||
with kv_signal_sender_context_manager(self.pd_disaggregation_mode) as sender:
|
||||
|
||||
self.share_inputs["kv_signal_sender"] = sender
|
||||
# 1. Prepare inputs of model and decoder.
|
||||
self._prepare_inputs(is_dummy_run=is_dummy_run)
|
||||
@@ -1481,13 +1483,36 @@ class XPUModelRunner(ModelRunnerBase):
|
||||
# 7. Updata 'infer_seed' and step_paddle()
|
||||
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
|
||||
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
|
||||
step_xpu(
|
||||
self.share_inputs,
|
||||
self.cache_config.block_size,
|
||||
self.cache_config.enc_dec_block_num,
|
||||
self.fd_config.speculative_config,
|
||||
self.fd_config.cache_config.enable_prefix_caching,
|
||||
)
|
||||
|
||||
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||
step_xpu(
|
||||
self.share_inputs,
|
||||
self.cache_config.block_size,
|
||||
self.cache_config.enc_dec_block_num,
|
||||
self.fd_config.speculative_config,
|
||||
self.fd_config.cache_config.enable_prefix_caching,
|
||||
)
|
||||
elif self.speculative_decoding:
|
||||
speculate_schedule_cache(
|
||||
self.share_inputs["draft_tokens"],
|
||||
self.share_inputs["block_tables"],
|
||||
self.share_inputs["stop_flags"],
|
||||
self.share_inputs["prompt_lens"],
|
||||
self.share_inputs["seq_lens_this_time"],
|
||||
self.share_inputs["seq_lens_encoder"],
|
||||
self.share_inputs["seq_lens_decoder"],
|
||||
self.share_inputs["step_seq_lens_decoder"],
|
||||
self.share_inputs["step_draft_tokens"],
|
||||
self.share_inputs["step_seq_lens_this_time"],
|
||||
self.share_inputs["accept_num"],
|
||||
self.share_inputs["accept_tokens"],
|
||||
self.share_inputs["is_block_step"],
|
||||
self.share_inputs["not_need_stop"],
|
||||
self.share_inputs["stop_nums"],
|
||||
self.cache_config.block_size,
|
||||
self.speculative_config.num_speculative_tokens,
|
||||
)
|
||||
|
||||
return None
|
||||
|
||||
def _execute_empty_input(self, forward_meta) -> None:
|
||||
|
||||
Reference in New Issue
Block a user