[XPU] Split the block_attn operator into smaller operators (#6798)

* split block_attn
* adapt to latest vllm
* fix unit tests
* delete mtp+cudagraph 4 cards test
* fix vl model
* fix mtp
* fix slot mapping
2026-04-23 17:11:21 +08:00 · 2026-04-16 14:28:40 +08:00
parent 6b891da02b
commit de0c5e68fb
12 changed files with 2891 additions and 131 deletions
@@ -127,6 +127,7 @@ def xpu_pre_process(
    is_profiling: bool = False,
    forward_meta=None,
    use_cudagraph=False,
+    num_speculative_tokens=0,
 ) -> XPUForwardMeta:
    """ """

@@ -196,8 +197,15 @@ def xpu_pre_process(
        xpu_forward_meta.decoder_context_len_cpu,
        xpu_forward_meta.decoder_context_len_cache_cpu,
        xpu_forward_meta.len_info_cpu,
+        xpu_forward_meta.slot_mapping_enc,
+        xpu_forward_meta.slot_mapping_dec,
    ) = get_infer_param(
-        seq_lens_encoder, seq_lens_decoder, seq_lens_this_time, xpu_forward_meta.block_tables, block_size
+        seq_lens_encoder,
+        seq_lens_decoder,
+        seq_lens_this_time,
+        xpu_forward_meta.block_tables,
+        block_size,
+        num_speculative_tokens,
    )
    xpu_forward_meta.enc_batch = xpu_forward_meta.len_info_cpu[0]
    xpu_forward_meta.dec_batch = xpu_forward_meta.len_info_cpu[1]