mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[BugFix] Fix async D2H copy bug & flash mask attention cache V out-of-bounds bug (#7221)
This commit is contained in:
@@ -296,7 +296,7 @@ void GetBlockShapeAndSplitKVBlock(
|
||||
if (!phi::backends::gpu::IsCUDAGraphCapturing())
|
||||
#endif
|
||||
max_len_tensor_cpu.copy_(
|
||||
max_len_tensor_gpu, max_len_tensor_cpu.place(), false);
|
||||
max_len_tensor_gpu, max_len_tensor_cpu.place(), true);
|
||||
|
||||
auto max_len_cpu_ptr = max_len_tensor_cpu.data<int>();
|
||||
int max_len_this_time = max_len_cpu_ptr[0];
|
||||
@@ -378,7 +378,7 @@ void GetBlockShapeAndSplitKVBlock(
|
||||
if (!phi::backends::gpu::IsCUDAGraphCapturing())
|
||||
#endif
|
||||
decoder_num_blocks_cpu.copy_(
|
||||
decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
|
||||
decoder_num_blocks_device, decoder_num_blocks_cpu.place(), true);
|
||||
}
|
||||
}
|
||||
// mla_backend does not need to run the following code.
|
||||
@@ -409,7 +409,7 @@ void GetBlockShapeAndSplitKVBlock(
|
||||
block_size);
|
||||
|
||||
kv_num_blocks_x_cpu.copy_(
|
||||
kv_num_blocks_x, kv_num_blocks_x_cpu.place(), false);
|
||||
kv_num_blocks_x, kv_num_blocks_x_cpu.place(), true);
|
||||
// Clear buffer
|
||||
const uint32_t encoder_max_tile_size_per_bs_q =
|
||||
div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
|
||||
@@ -433,7 +433,7 @@ void GetBlockShapeAndSplitKVBlock(
|
||||
encoder_block_shape_q,
|
||||
group_size);
|
||||
encoder_num_blocks_x_cpu.copy_(
|
||||
encoder_num_blocks_x, encoder_num_blocks_x_cpu.place(), false);
|
||||
encoder_num_blocks_x, encoder_num_blocks_x_cpu.place(), true);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -87,9 +87,9 @@ std::vector<paddle::Tensor> PreCacheLenConcat(
|
||||
bsz,
|
||||
block_size);
|
||||
paddle::Tensor pre_cache_num_blocks_cpu =
|
||||
pre_cache_num_blocks.copy_to(paddle::CPUPlace(), false);
|
||||
pre_cache_num_blocks.copy_to(paddle::CPUPlace(), true);
|
||||
paddle::Tensor kv_token_num_cpu =
|
||||
kv_token_num.copy_to(paddle::CPUPlace(), false);
|
||||
kv_token_num.copy_to(paddle::CPUPlace(), true);
|
||||
|
||||
return {
|
||||
cu_seqlens_k,
|
||||
|
||||
Reference in New Issue
Block a user