Support GPT-OSS-BF16 (#4240)

* [Feature] AppendAttention: support sinks & HEAD_DIM=64
* fix bug
* fix bug
* fix bug
* fix bug
* [Feature] support gpt-oss
* fix bug
* add mask
* support-gpt-oss
* support-gpt-oss
* fix long seq
* support wint8
* support wint8
* support wint8
* update test
* change sliding window init pos

---------

Co-authored-by: ming1753 <ideaminghp@163.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: ming1753 <61511741+ming1753@users.noreply.github.com>
2026-04-23 00:17:25 +08:00 · 2025-10-20 14:44:58 +08:00
parent 80a16c4c87
commit 1b9f351d21
32 changed files with 1502 additions and 172 deletions
@@ -81,6 +81,7 @@ std::vector<paddle::Tensor> AppendAttention(
    const paddle::optional<paddle::Tensor> &kv_signal_data,
    const paddle::optional<paddle::Tensor>& q_norm_weight,
    const paddle::optional<paddle::Tensor>& k_norm_weight,
+    const paddle::optional<paddle::Tensor>& sinks,
    const float rms_norm_eps,
    const std::string &compute_dtype, const std::string &cache_quant_type_str,
    const bool use_neox_rotary_style, const bool rope_3d,
@@ -89,7 +90,8 @@ std::vector<paddle::Tensor> AppendAttention(
    const int encoder_block_shape_q, const int decoder_block_shape_q,
    const int max_partition_size, const int encoder_max_partition_size,
    const int speculate_max_draft_token_num, const bool causal,
-    const bool speculate_decoder);
+    const bool speculate_decoder,
+    const int sliding_window);

 std::vector<paddle::Tensor> AppendAttentionWithOutput(
    const paddle::Tensor &qkv, const paddle::Tensor &key_cache,
@@ -124,6 +126,7 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
    const paddle::optional<paddle::Tensor> &kv_signal_data,
    const paddle::optional<paddle::Tensor>& q_norm_weight,
    const paddle::optional<paddle::Tensor>& k_norm_weight,
+    const paddle::optional<paddle::Tensor>& sinks,
    const float rms_norm_eps,
    const std::string &compute_dtype, const std::string &cache_quant_type_str,
    const bool use_neox_rotary_style, const bool rope_3d,
@@ -132,7 +135,8 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
    const int encoder_block_shape_q, const int decoder_block_shape_q,
    const int max_partition_size, const int encoder_max_partition_size,
    const int speculate_max_draft_token_num, const bool causal,
-    const bool speculate_decoder);
+    const bool speculate_decoder,
+    const int sliding_window);

 std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
    const paddle::Tensor &qkv, const paddle::Tensor &key_cache,
@@ -248,15 +252,18 @@ std::vector<std::vector<int>> GetExpertTokenNum(const paddle::Tensor &topk_ids,
 paddle::Tensor MoeExpertFFNFunc(
    const paddle::Tensor& permute_input,
    const paddle::Tensor& tokens_expert_prefix_sum,
-    const paddle::Tensor& up_gate_proj_weight, const paddle::Tensor& down_proj_weight,
+    const paddle::Tensor& up_gate_proj_weight,
+    const paddle::Tensor& down_proj_weight,
    const paddle::optional<paddle::Tensor>& up_gate_proj_bias,
    const paddle::optional<paddle::Tensor>& up_gate_proj_scale,
    const paddle::optional<paddle::Tensor>& down_proj_scale,
    const paddle::optional<paddle::Tensor>& down_proj_in_scale,
    const paddle::optional<paddle::Tensor>& expert_idx_per_token,
-    const std::string& quant_method, const bool used_in_ep_low_latency,
+    const std::string& quant_method,
+    const bool used_in_ep_low_latency,
    const int estimate_total_token_nums,
-    const int hadamard_block_size);
+    const int hadamard_block_size,
+    const std::string& activation);

 paddle::Tensor MoeExpertFFNWint2Func(
    const paddle::Tensor& permute_input,