mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Optimization] [OP] [Models] dsk del prefill mask (#7313)
* dsk del prefill mask
* dsk support 1M+ seq_len rope
* update rope tests
This commit is contained in:
@@ -72,6 +72,7 @@ if current_platform.is_cuda():
|
||||
from fastdeploy.model_executor.ops.gpu import (
|
||||
cp_gather_indexer_k_quant_cache,
|
||||
indexer_k_quant_and_cache,
|
||||
merge_prefill_decode_output,
|
||||
radix_topk_ragged_transform,
|
||||
)
|
||||
|
||||
@@ -398,7 +399,6 @@ class DeepseekV3MLAAttention(nn.Layer):
|
||||
fmha_out_prefill.reshape_([-1, self.num_attention_heads_tp, self.qk_head_dim])
|
||||
fmha_out_prefill = fmha_out_prefill[:, :, : self.v_head_dim]
|
||||
fmha_out_prefill.reshape_([-1, self.num_attention_heads_tp * self.v_head_dim])
|
||||
fmha_out_prefill = fmha_out_prefill * forward_meta.mask_encoder_batch.cast(fmha_out_prefill.dtype)
|
||||
fmha_out = fmha_out_prefill
|
||||
|
||||
if need_do_decode: # max_dec_len_this_time
|
||||
@@ -433,7 +433,17 @@ class DeepseekV3MLAAttention(nn.Layer):
|
||||
)
|
||||
|
||||
if need_do_prefill:
|
||||
fmha_out += fmha_out_decode
|
||||
merge_prefill_decode_output(
|
||||
fmha_out,
|
||||
fmha_out_decode,
|
||||
forward_meta.seq_lens_encoder,
|
||||
forward_meta.seq_lens_decoder,
|
||||
forward_meta.seq_lens_this_time,
|
||||
forward_meta.cu_seqlens_q,
|
||||
self.num_attention_heads_tp,
|
||||
self.v_head_dim,
|
||||
1,
|
||||
)
|
||||
else:
|
||||
fmha_out = fmha_out_decode
|
||||
|
||||
|
||||
Reference in New Issue
Block a user