[BugFix] Fix async D2H copy bug & flash mask attention cache V out-of-bounds bug (#7221)

2026-04-23 00:17:25 +08:00 · 2026-04-10 11:31:51 +08:00
parent 3c54a41131
commit 734fbcffde
3 changed files with 23 additions and 6 deletions
@@ -296,7 +296,7 @@ void GetBlockShapeAndSplitKVBlock(
  if (!phi::backends::gpu::IsCUDAGraphCapturing())
 #endif
    max_len_tensor_cpu.copy_(
-        max_len_tensor_gpu, max_len_tensor_cpu.place(), false);
+        max_len_tensor_gpu, max_len_tensor_cpu.place(), true);

  auto max_len_cpu_ptr = max_len_tensor_cpu.data<int>();
  int max_len_this_time = max_len_cpu_ptr[0];
@@ -378,7 +378,7 @@ void GetBlockShapeAndSplitKVBlock(
      if (!phi::backends::gpu::IsCUDAGraphCapturing())
 #endif
        decoder_num_blocks_cpu.copy_(
-            decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
+            decoder_num_blocks_device, decoder_num_blocks_cpu.place(), true);
    }
  }
  // mla_backend not need run the following code.
@@ -409,7 +409,7 @@ void GetBlockShapeAndSplitKVBlock(
        block_size);

    kv_num_blocks_x_cpu.copy_(
-        kv_num_blocks_x, kv_num_blocks_x_cpu.place(), false);
+        kv_num_blocks_x, kv_num_blocks_x_cpu.place(), true);
    // Clear buffer
    const uint32_t encoder_max_tile_size_per_bs_q =
        div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
@@ -433,7 +433,7 @@ void GetBlockShapeAndSplitKVBlock(
                                        encoder_block_shape_q,
                                        group_size);
    encoder_num_blocks_x_cpu.copy_(
-        encoder_num_blocks_x, encoder_num_blocks_x_cpu.place(), false);
+        encoder_num_blocks_x, encoder_num_blocks_x_cpu.place(), true);
  }
 }

@@ -87,9 +87,9 @@ std::vector<paddle::Tensor> PreCacheLenConcat(
      bsz,
      block_size);
  paddle::Tensor pre_cache_num_blocks_cpu =
-      pre_cache_num_blocks.copy_to(paddle::CPUPlace(), false);
+      pre_cache_num_blocks.copy_to(paddle::CPUPlace(), true);
  paddle::Tensor kv_token_num_cpu =
-      kv_token_num.copy_to(paddle::CPUPlace(), false);
+      kv_token_num.copy_to(paddle::CPUPlace(), true);

  return {
      cu_seqlens_k,