From 734fbcffdea0568f3dedb1376a1aa0c57a33b109 Mon Sep 17 00:00:00 2001 From: ming1753 <61511741+ming1753@users.noreply.github.com> Date: Fri, 10 Apr 2026 11:31:51 +0800 Subject: [PATCH] [BugFix] Fix Async D2H copy bug & flash mask attn cache V out of bound bug (#7221) --- .../get_block_shape_and_split_kv_block.cu | 8 ++++---- .../gpu_ops/append_attn/pre_cache_len_concat.cu | 4 ++-- .../gpu_ops/flash_mask_attn/mainloop_attn.hpp | 17 +++++++++++++++++ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu b/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu index f94e8493f7..d61aa3c231 100644 --- a/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu +++ b/custom_ops/gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu @@ -296,7 +296,7 @@ void GetBlockShapeAndSplitKVBlock( if (!phi::backends::gpu::IsCUDAGraphCapturing()) #endif max_len_tensor_cpu.copy_( - max_len_tensor_gpu, max_len_tensor_cpu.place(), false); + max_len_tensor_gpu, max_len_tensor_cpu.place(), true); auto max_len_cpu_ptr = max_len_tensor_cpu.data(); int max_len_this_time = max_len_cpu_ptr[0]; @@ -378,7 +378,7 @@ void GetBlockShapeAndSplitKVBlock( if (!phi::backends::gpu::IsCUDAGraphCapturing()) #endif decoder_num_blocks_cpu.copy_( - decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false); + decoder_num_blocks_device, decoder_num_blocks_cpu.place(), true); } } // mla_backend not need run the following code. 
@@ -409,7 +409,7 @@ void GetBlockShapeAndSplitKVBlock( block_size); kv_num_blocks_x_cpu.copy_( - kv_num_blocks_x, kv_num_blocks_x_cpu.place(), false); + kv_num_blocks_x, kv_num_blocks_x_cpu.place(), true); // Clear buffer const uint32_t encoder_max_tile_size_per_bs_q = div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q); @@ -433,7 +433,7 @@ void GetBlockShapeAndSplitKVBlock( encoder_block_shape_q, group_size); encoder_num_blocks_x_cpu.copy_( - encoder_num_blocks_x, encoder_num_blocks_x_cpu.place(), false); + encoder_num_blocks_x, encoder_num_blocks_x_cpu.place(), true); } } diff --git a/custom_ops/gpu_ops/append_attn/pre_cache_len_concat.cu b/custom_ops/gpu_ops/append_attn/pre_cache_len_concat.cu index 492b3a2664..435c87ba4b 100644 --- a/custom_ops/gpu_ops/append_attn/pre_cache_len_concat.cu +++ b/custom_ops/gpu_ops/append_attn/pre_cache_len_concat.cu @@ -87,9 +87,9 @@ std::vector PreCacheLenConcat( bsz, block_size); paddle::Tensor pre_cache_num_blocks_cpu = - pre_cache_num_blocks.copy_to(paddle::CPUPlace(), false); + pre_cache_num_blocks.copy_to(paddle::CPUPlace(), true); paddle::Tensor kv_token_num_cpu = - kv_token_num.copy_to(paddle::CPUPlace(), false); + kv_token_num.copy_to(paddle::CPUPlace(), true); return { cu_seqlens_k, diff --git a/custom_ops/gpu_ops/flash_mask_attn/mainloop_attn.hpp b/custom_ops/gpu_ops/flash_mask_attn/mainloop_attn.hpp index cb76da20d6..277ed46f85 100644 --- a/custom_ops/gpu_ops/flash_mask_attn/mainloop_attn.hpp +++ b/custom_ops/gpu_ops/flash_mask_attn/mainloop_attn.hpp @@ -490,6 +490,23 @@ struct CollectiveMainloopAttn { softmax.rescale_o(tOrO, scores_scale); consumer_wait(pipeline_v, smem_pipe_read_v); + if (seq_len_k - n_block * kBlockN < kBlockN) { + int valid_k = seq_len_k - n_block * kBlockN; + auto sVt_this = sVt(_, _, smem_pipe_read_v.index()); + constexpr int kHdLo = decltype(get<0, 0>(shape(sVt_this)))::value; + constexpr int kHdHi = decltype(get<0, 1>(shape(sVt_this)))::value; + if (thread_idx >= valid_k && 
thread_idx < kBlockN) { +#pragma unroll + for (int hd_hi = 0; hd_hi < kHdHi; ++hd_hi) { +#pragma unroll + for (int hd_lo = 0; hd_lo < kHdLo; ++hd_lo) { + sVt_this(make_coord(make_coord(hd_lo, hd_hi), thread_idx)) = + Element(0); + } + } + } + cutlass::arch::fence_view_async_shared(); + } gemm( tiled_mma1, tOrP, tOrV(_, _, _, smem_pipe_read_v.index()), tOrO); warp_scheduler_barrier_arrive();