[SOT][Cudagraph] Remove BreakGraph of #3302 && update CustomOp (#3694)

* Remove inplace info and the `.to(gpu)` calls

* Update append_attention to return its output tensor

* Unpin the paddle version

* Add full_cuda_graph=False

* add blank line

---------

Co-authored-by: SigureMo <sigure.qaq@gmail.com>
This commit is contained in:
Ryan
2025-10-17 10:57:55 +08:00
committed by GitHub
parent a37c9416ac
commit 49cea8fb1c
5 changed files with 12 additions and 11 deletions
@@ -262,15 +262,15 @@ class AppendAttentionBackend(AttentionBackend):
# 3. generate output tensor of different dtypes
if out_scale > 0.0:
if abs(quant_max_bound - 127) < 0.000001:
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="int8").to(qkv.place)
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="int8")
elif abs(quant_max_bound - 448) < 0.000001:
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="float8_e4m3fn").to(qkv.place)
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="float8_e4m3fn")
else:
raise NotImplementedError("Only supported attr of quant_max_bound in ['127', '448'].")
else:
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype=D_type).to(qkv.place)
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype=D_type)
append_attention_with_output(
res = append_attention_with_output(
qkv,
cache_k,
cache_v,