[Graph Optimization] remove static_op_get_block_shape_and_split_kv_block from cudagraph (#6081)

* rm static_op_get_block_shape_and_split_kv_block from cudagraph

* update max_capture_shape

* fallback: zeros -> empty to avoid coverage check

* check graph_opt_config exists

* add max_capture_shape_dy2st, and change full_cuda_graph from false to true in the 28B VL test

* add use_cudagraph flag to control step_use_cudagraph
This commit is contained in:
Ryan
2026-01-20 14:05:18 +08:00
committed by GitHub
parent 45ebb2efb4
commit dda27e50f5
5 changed files with 23 additions and 8 deletions
@@ -172,7 +172,10 @@ class AppendAttentionBackend(AttentionBackend):
list(
set(
paddle.get_flags(flag)[flag].split(",")
+ ["custom_op.static_op_append_attention_with_output_"]
+ [
"custom_op.static_op_append_attention_with_output_",
"custom_op.static_op_get_block_shape_and_split_kv_block",
]
)
)
)
@@ -350,7 +353,7 @@ class AppendAttentionBackend(AttentionBackend):
else:
raise NotImplementedError("Only supported attr of quant_max_bound in ['127', '448'].")
else:
res = paddle.empty([token_nums, q_num_heads * head_dims], dtype=D_type)
res = paddle.zeros([token_nums, q_num_heads * head_dims], dtype=D_type)
res = append_attention_with_output(
qkv,