[SOT][Cudagraph] Remove BreakGraph of #3302 && update CustomOp (#3694)

* rm inplace info && to(gpu)
* update append_attention
* unpin paddle version
* add full_cuda_graph=False
* add blank line

---------

Co-authored-by: SigureMo <sigure.qaq@gmail.com>
2026-04-24 01:29:57 +08:00 · 2025-10-17 10:57:55 +08:00
parent a37c9416ac
commit 49cea8fb1c
5 changed files with 12 additions and 11 deletions
@@ -262,15 +262,15 @@ class AppendAttentionBackend(AttentionBackend):
            # 3. generate output tensor of different dtypes
            if out_scale > 0.0:
                if abs(quant_max_bound - 127) < 0.000001:
-                    res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="int8").to(qkv.place)
+                    res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="int8")
                elif abs(quant_max_bound - 448) < 0.000001:
-                    res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="float8_e4m3fn").to(qkv.place)
+                    res = paddle.empty([token_nums, q_num_heads * head_dims], dtype="float8_e4m3fn")
                else:
                    raise NotImplementedError("Only supported attr of quant_max_bound in ['127', '448'].")
            else:
-                res = paddle.empty([token_nums, q_num_heads * head_dims], dtype=D_type).to(qkv.place)
+                res = paddle.empty([token_nums, q_num_heads * head_dims], dtype=D_type)

-            append_attention_with_output(
+            res = append_attention_with_output(
                qkv,
                cache_k,
                cache_v,