mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 17:11:21 +08:00
[Feature]Supports SWA based on appendattn (#6547)
This commit is contained in:
@@ -118,7 +118,8 @@ std::vector<paddle::Tensor> AppendAttention(
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool speculate_decoder,
|
||||
- const int sliding_window);
|
||||
+ const int sliding_window,
|
||||
+ const int sink_size);
|
||||
|
||||
std::vector<paddle::Tensor> AppendAttentionWithOutput(
|
||||
const paddle::Tensor& qkv,
|
||||
@@ -174,7 +175,8 @@ std::vector<paddle::Tensor> AppendAttentionWithOutput(
|
||||
const int speculate_max_draft_token_num,
|
||||
const bool causal,
|
||||
const bool speculate_decoder,
|
||||
- const int sliding_window);
|
||||
+ const int sliding_window,
|
||||
+ const int sink_size);
|
||||
|
||||
std::vector<paddle::Tensor> GQARopeWriteCacheKernel(
|
||||
const paddle::Tensor& qkv,
|
||||
|
||||
Reference in New Issue
Block a user