Mirror of https://github.com/PaddlePaddle/FastDeploy.git — synced 2026-04-23 00:17:25 +08:00
[Graph Optimization] remove static_op_get_block_shape_and_split_kv_block from cudagraph (#6081)
* rm static_op_get_block_shape_and_split_kv_block from cudagraph
* update max_capture_shape
* fallback: zeros -> empty to avoid coverage check
* check graph_opt_config exists
* add max_capture_shape_dy2st && full_cuda_graph: false -> true in 28B vl test
* add use_cudagraph flag to control step_use_cudagraph
This commit is contained in:
@@ -172,7 +172,10 @@ class AppendAttentionBackend(AttentionBackend):
             list(
                 set(
                     paddle.get_flags(flag)[flag].split(",")
-                    + ["custom_op.static_op_append_attention_with_output_"]
+                    + [
+                        "custom_op.static_op_append_attention_with_output_",
+                        "custom_op.static_op_get_block_shape_and_split_kv_block",
+                    ]
                 )
             )
         )
@@ -350,7 +353,7 @@ class AppendAttentionBackend(AttentionBackend):
         else:
             raise NotImplementedError("Only supported attr of quant_max_bound in ['127', '448'].")
     else:
-        res = paddle.empty([token_nums, q_num_heads * head_dims], dtype=D_type)
+        res = paddle.zeros([token_nums, q_num_heads * head_dims], dtype=D_type)

     res = append_attention_with_output(
         qkv,
||||
Reference in New Issue
Block a user