From ceaf5df350aa1d474db50f78698cf35a6ebeead3 Mon Sep 17 00:00:00 2001 From: yzwu Date: Wed, 1 Apr 2026 19:13:34 +0800 Subject: [PATCH] [Iluvatar] Fix cuda graph error for tp > 1 in ernie models (#7126) --- fastdeploy/distributed/communication.py | 20 ++++++-- .../model_loader/default_loader.py | 2 +- .../model_loader/default_loader_v1.py | 2 +- fastdeploy/worker/iluvatar_model_runner.py | 14 ++++-- scripts/run_ci_iluvatar.sh | 48 ++++++++++++++++++- 5 files changed, 75 insertions(+), 11 deletions(-) diff --git a/fastdeploy/distributed/communication.py b/fastdeploy/distributed/communication.py index aa0164f9ea..475ce14076 100644 --- a/fastdeploy/distributed/communication.py +++ b/fastdeploy/distributed/communication.py @@ -21,6 +21,7 @@ import paddle.distributed as dist from paddle.distributed import fleet import fastdeploy.envs as envs +from fastdeploy.platforms import current_platform from fastdeploy.utils import get_logger, register_custom_python_op logger = get_logger("communication") @@ -161,12 +162,21 @@ try: return _TP_AR.custom_all_reduce(input_) if paddle.in_dynamic_mode(): - if group_ is not None: - dist.all_reduce(input_, group=group_) + if current_platform.is_iluvatar(): + # use_calc_stream = False will raise event sync error when enable cuda graph and tp_size > 1 + if group_ is not None: + stream.all_reduce(input_, op=ReduceOp.SUM, group=group_, sync_op=True, use_calc_stream=True) + else: + hcg = fleet.get_hybrid_communicate_group() + mp_group = hcg.get_model_parallel_group() + stream.all_reduce(input_, op=ReduceOp.SUM, group=mp_group, sync_op=True, use_calc_stream=True) else: - hcg = fleet.get_hybrid_communicate_group() - mp_group = hcg.get_model_parallel_group() - dist.all_reduce(input_, group=mp_group) + if group_ is not None: + dist.all_reduce(input_, group=group_) + else: + hcg = fleet.get_hybrid_communicate_group() + mp_group = hcg.get_model_parallel_group() + dist.all_reduce(input_, group=mp_group) else: dist.all_reduce(input_) return input_ 
diff --git a/fastdeploy/model_executor/model_loader/default_loader.py b/fastdeploy/model_executor/model_loader/default_loader.py index ca0dfa84f9..9945f9f31a 100644 --- a/fastdeploy/model_executor/model_loader/default_loader.py +++ b/fastdeploy/model_executor/model_loader/default_loader.py @@ -43,7 +43,7 @@ class DefaultModelLoader(BaseModelLoader): def clean_memory_fragments(self, state_dict: dict) -> None: """clean_memory_fragments""" - if current_platform.is_cuda() or current_platform.is_maca(): + if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar(): if state_dict: for k, v in state_dict.items(): if isinstance(v, paddle.Tensor): diff --git a/fastdeploy/model_executor/model_loader/default_loader_v1.py b/fastdeploy/model_executor/model_loader/default_loader_v1.py index b47d0e3f74..ce78758b10 100644 --- a/fastdeploy/model_executor/model_loader/default_loader_v1.py +++ b/fastdeploy/model_executor/model_loader/default_loader_v1.py @@ -49,7 +49,7 @@ class DefaultModelLoaderV1(BaseModelLoader): def clean_memory_fragments(self) -> None: """clean_memory_fragments""" - if current_platform.is_cuda() or current_platform.is_maca(): + if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar(): paddle.device.empty_cache() paddle.device.synchronize() diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index 1d6eefd52c..362653eacc 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -17,6 +17,7 @@ from functools import partial import paddle +from paddleformers.utils.log import logger from fastdeploy import envs from fastdeploy.config import FDConfig @@ -66,10 +67,17 @@ class IluvatarModelRunner(GPUModelRunner): not self.cache_config.enable_chunked_prefill ), "Iluvatar does not support chunked prefill for VL model" - if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == 
"wint4": - assert not self.use_cudagraph, "Iluvatar does not support cuda graph for weight_only_int4" + if self.model_config.model_type == "ernie4_5_moe_vl" and self.parallel_config.tensor_parallel_size > 1: + # ernie-vl does not support cuda graph for tp > 1 + logger.warning("disable cudagraph since ernie-vl does not support cuda graph for tp > 1") + self.use_cudagraph = False - print(f"self.use_cudagraph={self.use_cudagraph}") + if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4": + # Iluvatar does not support cuda graph for weight_only_int4 yet + logger.warning("disable cudagraph since iluvatar does not support cuda graph for weight_only_int4") + self.use_cudagraph = False + + logger.info(f"self.use_cudagraph={self.use_cudagraph}") # VL neox style = True emb_shape = self.share_inputs["rope_emb"].shape if emb_shape[-1] == self.model_config.head_dim // 2: diff --git a/scripts/run_ci_iluvatar.sh b/scripts/run_ci_iluvatar.sh index 6e249af64f..50642cd641 100644 --- a/scripts/run_ci_iluvatar.sh +++ b/scripts/run_ci_iluvatar.sh @@ -187,7 +187,7 @@ function check_server_status() { echo -e "\n" } -echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle ===========" +echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=1, enable_cudagraph) ===========" clear_message echo "Start server..." python -m fastdeploy.entrypoints.openai.api_server \ @@ -233,6 +233,52 @@ fi # fi echo -e "\nPASSED" +echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=2, enable_cudagraph) ===========" +clear_message +echo "Start server..." 
+python -m fastdeploy.entrypoints.openai.api_server \
+    --model ${MODEL_DIR}/ERNIE-4.5-21B-A3B-Paddle \
+    --port 8180 \
+    --tensor-parallel-size 2 \
+    --quantization wint8 \
+    --max-model-len 32768 \
+    --max-num-seqs 8 \
+    --block-size 16 \
+    --graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
+
+check_server_status
+
+echo "Start inference..."
+cp ${CI_PATH}/test.jsonl ./
+python3 -u ${CI_PATH}/bench_gsm8k.py --port 8180 --num-questions 10 --num-shots 5 --parallel 8
+
+exit_code=$?
+echo -e "\nexit_code is ${exit_code}"
+
+echo -e "\nStop server..."
+stop_processes
+echo -e "\nStop server done."
+
+if [ ${exit_code} -ne 0 ]; then
+    print_error_message
+    exit 1
+fi
+
+acc=`python3 -c "import json; [print(json.loads(line)['accuracy']) for line in open('result.jsonl')]"`
+latency=`python3 -c "import json; [print(json.loads(line)['latency']) for line in open('result.jsonl')]"`
+expected_lowerest_acc=0.8
+expected_largest_latency=60
+if awk -v a="$acc" -v b="$expected_lowerest_acc" 'BEGIN {exit !(a < b)}'; then
+    echo -e "\nExit with Accuracy error, current accuracy $acc less than $expected_lowerest_acc "
+    exit 1
+fi
+
+# if awk -v a="$latency" -v b="$expected_largest_latency" 'BEGIN {exit !(a > b)}'; then
+#     echo -e "\nExit with Latency Error, current latency $latency greater than $expected_largest_latency "
+#     exit 1
+# fi
+echo -e "\nPASSED"
+
 echo -e "\n============ Online: start to test ERNIE-4.5-VL-28B-A3B-Paddle ==========="
 clear_message
 echo "Start server..."