[XPU] Speculate Decoding + PD, benchmark fix (#6036)

* fix mtp pd

* fix kernel

* fix code style

* fix kernel

* fix test / clear debug code

* fix test / clear debug code

* fix codestyle

* fix codestyle

* fix codestyle
This commit is contained in:
cmcamdy
2026-01-15 19:19:03 +08:00
committed by GitHub
parent 6619298b50
commit 59d8ae0a25
13 changed files with 995 additions and 31 deletions
+32 -7
View File
@@ -52,6 +52,7 @@ from fastdeploy.model_executor.ops.xpu import (
recover_decode_task,
set_data_ipc,
share_external_data,
speculate_schedule_cache,
)
from fastdeploy.model_executor.xpu_pre_and_post_process import (
step_xpu,
@@ -1385,6 +1386,7 @@ class XPUModelRunner(ModelRunnerBase):
# 0. set debug level
# self._set_debug_level(0x1, model_forward_batch, is_dummy_run)
with kv_signal_sender_context_manager(self.pd_disaggregation_mode) as sender:
self.share_inputs["kv_signal_sender"] = sender
# 1. Prepare inputs of model and decoder.
self._prepare_inputs(is_dummy_run=is_dummy_run)
@@ -1481,13 +1483,36 @@ class XPUModelRunner(ModelRunnerBase):
# 7. Update 'infer_seed' and step_paddle()
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
step_xpu(
self.share_inputs,
self.cache_config.block_size,
self.cache_config.enc_dec_block_num,
self.fd_config.speculative_config,
self.fd_config.cache_config.enable_prefix_caching,
)
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
step_xpu(
self.share_inputs,
self.cache_config.block_size,
self.cache_config.enc_dec_block_num,
self.fd_config.speculative_config,
self.fd_config.cache_config.enable_prefix_caching,
)
elif self.speculative_decoding:
speculate_schedule_cache(
self.share_inputs["draft_tokens"],
self.share_inputs["block_tables"],
self.share_inputs["stop_flags"],
self.share_inputs["prompt_lens"],
self.share_inputs["seq_lens_this_time"],
self.share_inputs["seq_lens_encoder"],
self.share_inputs["seq_lens_decoder"],
self.share_inputs["step_seq_lens_decoder"],
self.share_inputs["step_draft_tokens"],
self.share_inputs["step_seq_lens_this_time"],
self.share_inputs["accept_num"],
self.share_inputs["accept_tokens"],
self.share_inputs["is_block_step"],
self.share_inputs["not_need_stop"],
self.share_inputs["stop_nums"],
self.cache_config.block_size,
self.speculative_config.num_speculative_tokens,
)
return None
def _execute_empty_input(self, forward_meta) -> None: