[XPU] Split the block_attn operator into smaller operators (#6798)

* split block_attn

* adapt to latest vllm

* fix unit tests

* delete the 4-card mtp+cudagraph test

* fix vl model

* fix mtp

* fix slot mapping
This commit is contained in:
RuohengMa
2026-04-16 14:28:40 +08:00
committed by GitHub
parent 6b891da02b
commit de0c5e68fb
12 changed files with 2891 additions and 131 deletions
@@ -127,6 +127,7 @@ def xpu_pre_process(
is_profiling: bool = False,
forward_meta=None,
use_cudagraph=False,
+num_speculative_tokens=0,
) -> XPUForwardMeta:
""" """
@@ -196,8 +197,15 @@ def xpu_pre_process(
xpu_forward_meta.decoder_context_len_cpu,
xpu_forward_meta.decoder_context_len_cache_cpu,
xpu_forward_meta.len_info_cpu,
+xpu_forward_meta.slot_mapping_enc,
+xpu_forward_meta.slot_mapping_dec,
 ) = get_infer_param(
-seq_lens_encoder, seq_lens_decoder, seq_lens_this_time, xpu_forward_meta.block_tables, block_size
+seq_lens_encoder,
+seq_lens_decoder,
+seq_lens_this_time,
+xpu_forward_meta.block_tables,
+block_size,
+num_speculative_tokens,
)
xpu_forward_meta.enc_batch = xpu_forward_meta.len_info_cpu[0]
xpu_forward_meta.dec_batch = xpu_forward_meta.len_info_cpu[1]