diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
index eff16aad4b..13d40dcd75 100644
--- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
@@ -164,6 +164,20 @@ class AppendAttentionBackend(AttentionBackend):
 
         self.rank, self.device_id = init_rank_and_device_id(fd_config)
         self.use_output = not fd_config.graph_opt_config.full_cuda_graph
+        if self.use_output:
+            flag = "FLAGS_cuda_graph_blacklist"
+            paddle.set_flags(
+                {
+                    flag: ",".join(
+                        list(
+                            set(
+                                paddle.get_flags(flag)[flag].split(",")
+                                + ["custom_op.static_op_append_attention_with_output_"]
+                            )
+                        )
+                    )
+                }
+            )
         self.fd_config = fd_config
 
     def init_attention_metadata(self, forward_meta: ForwardMeta):
diff --git a/tests/ce/deploy/ernie45t_21b_cinn_fp8.yaml b/tests/ce/deploy/ernie45t_21b_cinn_fp8.yaml
index d02af4a6c6..e0ef3f5987 100644
--- a/tests/ce/deploy/ernie45t_21b_cinn_fp8.yaml
+++ b/tests/ce/deploy/ernie45t_21b_cinn_fp8.yaml
@@ -6,3 +6,4 @@ graph_optimization_config:
   graph_opt_level: 2
   sot_warmup_sizes: [2,16,32,64]
   use_cudagraph: True
+  full_cuda_graph: False
diff --git a/tests/ce/deploy/ernie45t_21b_cinn_wint4.yaml b/tests/ce/deploy/ernie45t_21b_cinn_wint4.yaml
index 50276f1b3c..a00c155172 100644
--- a/tests/ce/deploy/ernie45t_21b_cinn_wint4.yaml
+++ b/tests/ce/deploy/ernie45t_21b_cinn_wint4.yaml
@@ -6,3 +6,4 @@ graph_optimization_config:
   graph_opt_level: 2
   sot_warmup_sizes: [2,16,32,64]
   use_cudagraph: True
+  full_cuda_graph: False
diff --git a/tests/ce/deploy/ernie45t_21b_sot_fp8.yaml b/tests/ce/deploy/ernie45t_21b_sot_fp8.yaml
index 269afb1004..7ffdc7eac4 100644
--- a/tests/ce/deploy/ernie45t_21b_sot_fp8.yaml
+++ b/tests/ce/deploy/ernie45t_21b_sot_fp8.yaml
@@ -6,3 +6,4 @@ graph_optimization_config:
   graph_opt_level: 1
   sot_warmup_sizes: [2,16,32,64]
   use_cudagraph: True
+  full_cuda_graph: False
diff --git a/tests/ce/deploy/ernie45t_21b_sot_wint4.yaml b/tests/ce/deploy/ernie45t_21b_sot_wint4.yaml
index 46142bf612..243e5335b6 100644
--- a/tests/ce/deploy/ernie45t_21b_sot_wint4.yaml
+++ b/tests/ce/deploy/ernie45t_21b_sot_wint4.yaml
@@ -6,3 +6,4 @@ graph_optimization_config:
   graph_opt_level: 1
   sot_warmup_sizes: [2,16,32,64]
   use_cudagraph: True
+  full_cuda_graph: False
diff --git a/tests/graph_optimization/test_static_graph_cuda_graph_split.py b/tests/graph_optimization/test_static_graph_cuda_graph_split.py
index 9d2b419512..50aa9a7ab7 100644
--- a/tests/graph_optimization/test_static_graph_cuda_graph_split.py
+++ b/tests/graph_optimization/test_static_graph_cuda_graph_split.py
@@ -89,7 +89,9 @@ class TestStaticGraphCUDAGraphSplit(unittest.TestCase):
     def test(self):
         """Run test case"""
         # Set FastDeploy config
-        graph_opt_config = GraphOptimizationConfig({"use_cudagraph": True, "graph_opt_level": 1})
+        graph_opt_config = GraphOptimizationConfig(
+            {"use_cudagraph": True, "graph_opt_level": 1, "full_cuda_graph": False}
+        )
         scheduler_config = SchedulerConfig({"max_num_seqs": 1})
         graph_opt_config._set_cudagraph_sizes(max_capture_size=scheduler_config.max_num_seqs)
         graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)