mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-24 01:29:57 +08:00
* rm inplace info && to(gpu)
* update append_attention
* unpin paddle version
* add full_cuda_graph=False
* add blank line

---------

Co-authored-by: SigureMo <sigure.qaq@gmail.com>
This commit is contained in:
@@ -262,15 +262,15 @@ class AppendAttentionBackend(AttentionBackend):
|
||||
# 3. generate output tensor of different dtypes
|
||||
if out_scale > 0.0:
|
||||
if abs(quant_max_bound - 127) < 0.000001:
|
||||
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="int8").to(qkv.place)
|
||||
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="int8")
|
||||
elif abs(quant_max_bound - 448) < 0.000001:
|
||||
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="float8_e4m3fn").to(qkv.place)
|
||||
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="float8_e4m3fn")
|
||||
else:
|
||||
raise NotImplementedError("Only supported attr of quant_max_bound in ['127', '448'].")
|
||||
else:
|
||||
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype=D_type).to(qkv.place)
|
||||
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype=D_type)
|
||||
|
||||
append_attention_with_output(
|
||||
res = append_attention_with_output(
|
||||
qkv,
|
||||
cache_k,
|
||||
cache_v,
|
||||
|
||||
Reference in New Issue
Block a user