Support GPT-OSS-BF16 (#4240)

* [Feature] AppendAtten support sinks & HEAD_DIM=64

* fix bug

* fix bug

* fix bug

* fix bug

* [Feature] support gpt-oss

* fix bug

* add mask

* support-gpt-oss

* support-gpt-oss

* fix long seq

* support wint8

* support wint8

* support wint8

* update test

* change sliding windows init pos

---------

Co-authored-by: ming1753 <ideaminghp@163.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: ming1753 <61511741+ming1753@users.noreply.github.com>
This commit is contained in:
Haonan Luo
2025-10-20 14:44:58 +08:00
committed by GitHub
parent 80a16c4c87
commit 1b9f351d21
32 changed files with 1502 additions and 172 deletions
+12 -5
View File
@@ -81,6 +81,7 @@ std::vector<paddle::Tensor> AppendAttention(
const paddle::optional<paddle::Tensor> &kv_signal_data,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const paddle::optional<paddle::Tensor>& sinks,
const float rms_norm_eps,
const std::string &compute_dtype, const std::string &cache_quant_type_str,
const bool use_neox_rotary_style, const bool rope_3d,
@@ -89,7 +90,8 @@ std::vector<paddle::Tensor> AppendAttention(
const int encoder_block_shape_q, const int decoder_block_shape_q,
const int max_partition_size, const int encoder_max_partition_size,
const int speculate_max_draft_token_num, const bool causal,
const bool speculate_decoder);
const bool speculate_decoder,
const int sliding_window);
std::vector<paddle::Tensor> AppendAttentionWithOutput(
const paddle::Tensor &qkv, const paddle::Tensor &key_cache,
@@ -124,6 +126,7 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
const paddle::optional<paddle::Tensor> &kv_signal_data,
const paddle::optional<paddle::Tensor>& q_norm_weight,
const paddle::optional<paddle::Tensor>& k_norm_weight,
const paddle::optional<paddle::Tensor>& sinks,
const float rms_norm_eps,
const std::string &compute_dtype, const std::string &cache_quant_type_str,
const bool use_neox_rotary_style, const bool rope_3d,
@@ -132,7 +135,8 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
const int encoder_block_shape_q, const int decoder_block_shape_q,
const int max_partition_size, const int encoder_max_partition_size,
const int speculate_max_draft_token_num, const bool causal,
const bool speculate_decoder);
const bool speculate_decoder,
const int sliding_window);
std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const paddle::Tensor &qkv, const paddle::Tensor &key_cache,
@@ -248,15 +252,18 @@ std::vector<std::vector<int>> GetExpertTokenNum(const paddle::Tensor &topk_ids,
paddle::Tensor MoeExpertFFNFunc(
const paddle::Tensor& permute_input,
const paddle::Tensor& tokens_expert_prefix_sum,
const paddle::Tensor& up_gate_proj_weight, const paddle::Tensor& down_proj_weight,
const paddle::Tensor& up_gate_proj_weight,
const paddle::Tensor& down_proj_weight,
const paddle::optional<paddle::Tensor>& up_gate_proj_bias,
const paddle::optional<paddle::Tensor>& up_gate_proj_scale,
const paddle::optional<paddle::Tensor>& down_proj_scale,
const paddle::optional<paddle::Tensor>& down_proj_in_scale,
const paddle::optional<paddle::Tensor>& expert_idx_per_token,
const std::string& quant_method, const bool used_in_ep_low_latency,
const std::string& quant_method,
const bool used_in_ep_low_latency,
const int estimate_total_token_nums,
const int hadamard_block_size);
const int hadamard_block_size,
const std::string& activation);
paddle::Tensor MoeExpertFFNWint2Func(
const paddle::Tensor& permute_input,