【Hackathon 9th No.86】autogen MultiQueryDecoderAttention template_instantiation -part (#4383)

* split MultiQueryDecoderAttention template_instantiation

* update comment

* CI
This commit is contained in:
Zhenghai Zhang
2025-10-16 17:08:19 +08:00
committed by GitHub
parent f72be7a2c8
commit 6adfbe07ad
27 changed files with 3975 additions and 3836 deletions
@@ -1,59 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c16_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC16Kernel for
// compute type bfloat16 with a bfloat16 KV cache (the "C16" variant —
// presumably a 16-bit cache layout; confirm against append_attention_c16_impl.cuh).
// Kept in its own translation unit so the per-type instantiations compile in
// parallel (this diff splits them out of one monolithic file). The signature
// below must match the primary template in the included impl header exactly.
template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::bfloat16>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream, // NOTE(review): non-const lvalue ref to a stream handle is unusual; matches the primary template as-is
paddle::Tensor* out);
@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c16_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC16Kernel for
// compute type bfloat16 with a float8_e4m3fn (FP8) KV cache. One instantiation
// per translation unit to parallelize compilation; the signature must match
// the primary template in append_attention_c16_impl.cuh exactly.
template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, paddle::float8_e4m3fn>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c16_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC16Kernel for
// compute type bfloat16 with an int8 KV cache. One instantiation per
// translation unit to parallelize compilation; the signature must match
// the primary template in append_attention_c16_impl.cuh exactly.
template void CascadeAppendAttentionC16Kernel<paddle::bfloat16, int8_t>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c16_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC16Kernel for
// compute type float16 with a float16 KV cache. One instantiation per
// translation unit to parallelize compilation; the signature must match
// the primary template in append_attention_c16_impl.cuh exactly.
template void CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float16>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c16_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC16Kernel for
// compute type float16 with a float8_e4m3fn (FP8) KV cache. One instantiation
// per translation unit to parallelize compilation; the signature must match
// the primary template in append_attention_c16_impl.cuh exactly.
template void CascadeAppendAttentionC16Kernel<paddle::float16, paddle::float8_e4m3fn>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c16_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC16Kernel for
// compute type float16 with an int8 KV cache. One instantiation per
// translation unit to parallelize compilation; the signature must match
// the primary template in append_attention_c16_impl.cuh exactly.
template void CascadeAppendAttentionC16Kernel<paddle::float16, int8_t>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC4Kernel for
// compute type bfloat16 with a bfloat16 cache scale path (the "C4" variant —
// presumably a 4-bit cache layout given the zero-point args; confirm against
// append_attention_c4_impl.cuh). One instantiation per translation unit to
// parallelize compilation; the signature must match the primary template exactly.
template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::bfloat16>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC4Kernel for
// compute type bfloat16 with the float8_e4m3fn (FP8) type parameter. One
// instantiation per translation unit to parallelize compilation; the
// signature must match the primary template in append_attention_c4_impl.cuh.
template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, paddle::float8_e4m3fn>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC4Kernel for
// compute type bfloat16 with the int8 type parameter. One instantiation per
// translation unit to parallelize compilation; the signature must match the
// primary template in append_attention_c4_impl.cuh exactly.
template void CascadeAppendAttentionC4Kernel<paddle::bfloat16, int8_t>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -1,59 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC4Kernel for
// compute type float16 with the float16 type parameter. One instantiation per
// translation unit to parallelize compilation; the signature must match the
// primary template in append_attention_c4_impl.cuh exactly.
template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float16>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC4Kernel for
// compute type float16 with the float8_e4m3fn (FP8) type parameter. One
// instantiation per translation unit to parallelize compilation; the
// signature must match the primary template in append_attention_c4_impl.cuh.
template void CascadeAppendAttentionC4Kernel<paddle::float16, paddle::float8_e4m3fn>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);
@@ -1,58 +0,0 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../append_attention_c4_impl.cuh"
// Explicit template instantiation of CascadeAppendAttentionC4Kernel for
// compute type float16 with the int8 type parameter. One instantiation per
// translation unit to parallelize compilation; the signature must match the
// primary template in append_attention_c4_impl.cuh exactly.
template void CascadeAppendAttentionC4Kernel<paddle::float16, int8_t>(
const AppendAttnMetaData& meta_data,
const paddle::Tensor& qkv, // [token_num, num_heads, head_dim]
const paddle::Tensor&
cache_k, // [max_block_num, num_heads, block_size, head_dim]
const paddle::Tensor&
cache_v, // [max_block_num, num_heads, head_dim, block_size]
const paddle::optional<paddle::Tensor>& attn_mask,
const paddle::optional<paddle::Tensor>&
cache_k_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_scale, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_k_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
cache_v_zp, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
shift_bias, // [num_kv_heads, head_dim]
const paddle::optional<paddle::Tensor>&
smooth_weight, // [num_kv_heads, head_dim]
const paddle::Tensor& seq_lens_q,
const paddle::Tensor& seq_lens_kv,
const paddle::Tensor& seq_lens_encoder,
const paddle::Tensor& batch_id_per_token,
const paddle::Tensor& cu_seqlens_q,
const paddle::Tensor& block_table,
const paddle::Tensor& batch_ids,
const paddle::Tensor& tile_ids_per_batch,
const int num_blocks,
const int block_shape_q,
const int max_seq_len,
const int max_dec_len,
const float quant_max_bound,
const float quant_min_bound,
const float in_scale,
const int max_partition_size,
const int encoder_max_partition_size,
const int speculate_max_draft_token_num,
const bool causal,
const bool is_decoder,
const bool enable_prefill,
cudaStream_t& stream,
paddle::Tensor* out);