[Feature]Supports SWA based on appendattn (#6547)

This commit is contained in:
AIbin
2026-03-01 19:02:08 +08:00
committed by GitHub
parent ea4d10d174
commit 59b578c337
17 changed files with 410 additions and 240 deletions
+4 -2
View File
@@ -118,7 +118,8 @@ std::vector<paddle::Tensor> AppendAttention(
const int speculate_max_draft_token_num,
const bool causal,
const bool speculate_decoder,
-const int sliding_window);
+const int sliding_window,
+const int sink_size);
std::vector<paddle::Tensor> AppendAttentionWithOutput(
const paddle::Tensor& qkv,
@@ -174,7 +175,8 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
const int speculate_max_draft_token_num,
const bool causal,
const bool speculate_decoder,
-const int sliding_window);
+const int sliding_window,
+const int sink_size);
std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
const paddle::Tensor& qkv,