From 1502b6f43e4b38605458d0e6257d18047d251bd0 Mon Sep 17 00:00:00 2001 From: chen <103103266+ckl117@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:22:10 +0800 Subject: [PATCH] add instantiations for decoder rope enfore_fmul_rn=true (#7009) --- .../decoder_write_cache_with_rope_kernel.cu | 114 +++++++++++++++++ .../speculate_write_cache_with_rope_kernel.cu | 118 ++++++++++++++++++ 2 files changed, 232 insertions(+) diff --git a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu index c16af564fe..963ccfa23d 100644 --- a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.cu @@ -1163,3 +1163,117 @@ template void DecoderWriteCacheWithRoPEKernel( const paddle::optional& q_norm_weight, const paddle::optional& k_norm_weight, const float rms_norm_eps); + +template void DecoderWriteCacheWithRoPEKernel( + const AppendAttnMetaData& meta_data, + const paddle::Tensor& + qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * + // kv_num_heads, head_dim] if GQA) + const paddle::Tensor& seq_lens, + const paddle::Tensor& seq_lens_encoder, + const paddle::Tensor& cu_seqlens_q, + const paddle::Tensor& block_tables, + const paddle::optional& rotary_embs, + const paddle::optional& qkv_out_scales, + const paddle::optional& qkv_biases, + const paddle::optional& cache_k_scale, + const paddle::optional& cache_v_scale, + const paddle::optional& cache_k_zp, + const paddle::optional& cache_v_zp, + const std::string& cache_quant_type_str, + const bool use_neox_rotary_style, + const bool rope_3d, + const int max_seq_len, + cudaStream_t& stream, + paddle::Tensor* qkv_out, + paddle::Tensor* key_cache_out, + paddle::Tensor* value_cache_out, + const paddle::optional& q_norm_weight, + const paddle::optional& k_norm_weight, + const float rms_norm_eps); + +template void +DecoderWriteCacheWithRoPEKernel( + const AppendAttnMetaData& meta_data, + const paddle::Tensor& + qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * + // kv_num_heads, head_dim] if GQA) + const paddle::Tensor& seq_lens, + const paddle::Tensor& seq_lens_encoder, + const paddle::Tensor& cu_seqlens_q, + const paddle::Tensor& block_tables, + const paddle::optional& rotary_embs, + const paddle::optional& qkv_out_scales, + const paddle::optional& qkv_biases, + const paddle::optional& cache_k_scale, + const paddle::optional& cache_v_scale, + const paddle::optional& cache_k_zp, + const paddle::optional& cache_v_zp, + const std::string& cache_quant_type_str, + const bool use_neox_rotary_style, + const bool rope_3d, + const int max_seq_len, + cudaStream_t& stream, + paddle::Tensor* qkv_out, + paddle::Tensor* key_cache_out, + paddle::Tensor* value_cache_out, + const paddle::optional& q_norm_weight, + const paddle::optional& k_norm_weight, + const float rms_norm_eps); + +template void DecoderWriteCacheWithRoPEKernel( + const AppendAttnMetaData& meta_data, + const paddle::Tensor& + qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * + // kv_num_heads, head_dim] if GQA) + const paddle::Tensor& seq_lens, + const paddle::Tensor& seq_lens_encoder, + const paddle::Tensor& cu_seqlens_q, + const paddle::Tensor& block_tables, + const paddle::optional& rotary_embs, + const paddle::optional& qkv_out_scales, + const paddle::optional& qkv_biases, + const paddle::optional& cache_k_scale, + const paddle::optional& cache_v_scale, + const paddle::optional& cache_k_zp, + const paddle::optional& cache_v_zp, + const std::string& cache_quant_type_str, + const bool use_neox_rotary_style, + const bool rope_3d, + const int max_seq_len, + cudaStream_t& stream, + paddle::Tensor* qkv_out, + paddle::Tensor* key_cache_out, + paddle::Tensor* value_cache_out, + const paddle::optional& q_norm_weight, + const paddle::optional& k_norm_weight, + const float rms_norm_eps); + +template void +DecoderWriteCacheWithRoPEKernel( + const AppendAttnMetaData& meta_data, + const paddle::Tensor& + qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * + // kv_num_heads, head_dim] if GQA) + const paddle::Tensor& seq_lens, + const paddle::Tensor& seq_lens_encoder, + const paddle::Tensor& cu_seqlens_q, + const paddle::Tensor& block_tables, + const paddle::optional& rotary_embs, + const paddle::optional& qkv_out_scales, + const paddle::optional& qkv_biases, + const paddle::optional& cache_k_scale, + const paddle::optional& cache_v_scale, + const paddle::optional& cache_k_zp, + const paddle::optional& cache_v_zp, + const std::string& cache_quant_type_str, + const bool use_neox_rotary_style, + const bool rope_3d, + const int max_seq_len, + cudaStream_t& stream, + paddle::Tensor* qkv_out, + paddle::Tensor* key_cache_out, + paddle::Tensor* value_cache_out, + const paddle::optional& q_norm_weight, + const paddle::optional& k_norm_weight, + const float rms_norm_eps); diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu index fdf01a1df4..e87289a74e 100644 --- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu @@ -979,3 +979,121 @@ SpeculateWriteCacheWithRoPEKernel( const paddle::optional& q_norm_weight, const paddle::optional& k_norm_weight, const float rms_norm_eps); + +template void SpeculateWriteCacheWithRoPEKernel( + const AppendAttnMetaData& meta_data, + const paddle::Tensor& + qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * + // gqa_group_size, head_dim] if GQA) + const paddle::Tensor& seq_lens, + const paddle::Tensor& seq_lens_encoder, + const paddle::Tensor& batch_id_per_token, + const paddle::Tensor& cu_seqlens_q, + const paddle::Tensor& block_tables, + const paddle::optional& rotary_embs, + const paddle::optional& qkv_out_scales, + const paddle::optional& qkv_biases, + const paddle::optional& cache_k_scale, + const paddle::optional& cache_v_scale, + const paddle::optional& cache_k_zp, + const paddle::optional& cache_v_zp, + const std::string& cache_quant_type_str, + const bool use_neox_rotary_style, + const bool rope_3d, + const int max_seq_len, + cudaStream_t& stream, + paddle::Tensor* qkv_out, + paddle::Tensor* key_cache_out, + paddle::Tensor* value_cache_out, + const paddle::optional& q_norm_weight, + const paddle::optional& k_norm_weight, + const float rms_norm_eps); + +template void +SpeculateWriteCacheWithRoPEKernel( + const AppendAttnMetaData& meta_data, + const paddle::Tensor& + qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * + // gqa_group_size, head_dim] if GQA) + const paddle::Tensor& seq_lens, + const paddle::Tensor& seq_lens_encoder, + const paddle::Tensor& batch_id_per_token, + const paddle::Tensor& cu_seqlens_q, + const paddle::Tensor& block_tables, + const paddle::optional& rotary_embs, + const paddle::optional& qkv_out_scales, + const paddle::optional& qkv_biases, + const paddle::optional& cache_k_scale, + const paddle::optional& cache_v_scale, + const paddle::optional& cache_k_zp, + const paddle::optional& cache_v_zp, + const std::string& cache_quant_type_str, + const bool use_neox_rotary_style, + const bool rope_3d, + const int max_seq_len, + cudaStream_t& stream, + paddle::Tensor* qkv_out, + paddle::Tensor* key_cache_out, + paddle::Tensor* value_cache_out, + const paddle::optional& q_norm_weight, + const paddle::optional& k_norm_weight, + const float rms_norm_eps); + +template void SpeculateWriteCacheWithRoPEKernel( + const AppendAttnMetaData& meta_data, + const paddle::Tensor& + qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * + // gqa_group_size, head_dim] if GQA) + const paddle::Tensor& seq_lens, + const paddle::Tensor& seq_lens_encoder, + const paddle::Tensor& batch_id_per_token, + const paddle::Tensor& cu_seqlens_q, + const paddle::Tensor& block_tables, + const paddle::optional& rotary_embs, + const paddle::optional& qkv_out_scales, + const paddle::optional& qkv_biases, + const paddle::optional& cache_k_scale, + const paddle::optional& cache_v_scale, + const paddle::optional& cache_k_zp, + const paddle::optional& cache_v_zp, + const std::string& cache_quant_type_str, + const bool use_neox_rotary_style, + const bool rope_3d, + const int max_seq_len, + cudaStream_t& stream, + paddle::Tensor* qkv_out, + paddle::Tensor* key_cache_out, + paddle::Tensor* value_cache_out, + const paddle::optional& q_norm_weight, + const paddle::optional& k_norm_weight, + const float rms_norm_eps); + +template void +SpeculateWriteCacheWithRoPEKernel( + const AppendAttnMetaData& meta_data, + const paddle::Tensor& + qkv, // [token_num, 3, num_head, head_dim] ([token_num, num_head + 2 * + // gqa_group_size, head_dim] if GQA) + const paddle::Tensor& seq_lens, + const paddle::Tensor& seq_lens_encoder, + const paddle::Tensor& batch_id_per_token, + const paddle::Tensor& cu_seqlens_q, + const paddle::Tensor& block_tables, + const paddle::optional& rotary_embs, + const paddle::optional& qkv_out_scales, + const paddle::optional& qkv_biases, + const paddle::optional& cache_k_scale, + const paddle::optional& cache_v_scale, + const paddle::optional& cache_k_zp, + const paddle::optional& cache_v_zp, + const std::string& cache_quant_type_str, + const bool use_neox_rotary_style, + const bool rope_3d, + const int max_seq_len, + cudaStream_t& stream, + paddle::Tensor* qkv_out, + paddle::Tensor* key_cache_out, + paddle::Tensor* value_cache_out, + const paddle::optional& q_norm_weight, + const paddle::optional& k_norm_weight, + const float rms_norm_eps);