[Iluvatar] Fix cuda graph error for tp > 1 in ernie models (#7126)

This commit is contained in:
yzwu
2026-04-01 19:13:34 +08:00
committed by GitHub
parent fdfc908e2f
commit ceaf5df350
5 changed files with 75 additions and 11 deletions
+15 -5
View File
@@ -21,6 +21,7 @@ import paddle.distributed as dist
from paddle.distributed import fleet
import fastdeploy.envs as envs
from fastdeploy.platforms import current_platform
from fastdeploy.utils import get_logger, register_custom_python_op
logger = get_logger("communication")
@@ -161,12 +162,21 @@ try:
return _TP_AR.custom_all_reduce(input_)
if paddle.in_dynamic_mode():
if group_ is not None:
dist.all_reduce(input_, group=group_)
if current_platform.is_iluvatar():
# use_calc_stream = False will raise event sync error when enable cuda graph and tp_size > 1
if group_ is not None:
stream.all_reduce(input_, op=ReduceOp.SUM, group=group_, sync_op=True, use_calc_stream=True)
else:
hcg = fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()
stream.all_reduce(input_, op=ReduceOp.SUM, group=mp_group, sync_op=True, use_calc_stream=True)
else:
hcg = fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()
dist.all_reduce(input_, group=mp_group)
if group_ is not None:
dist.all_reduce(input_, group=group_)
else:
hcg = fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()
dist.all_reduce(input_, group=mp_group)
else:
dist.all_reduce(input_)
return input_
@@ -43,7 +43,7 @@ class DefaultModelLoader(BaseModelLoader):
def clean_memory_fragments(self, state_dict: dict) -> None:
"""clean_memory_fragments"""
if current_platform.is_cuda() or current_platform.is_maca():
if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
if state_dict:
for k, v in state_dict.items():
if isinstance(v, paddle.Tensor):
@@ -49,7 +49,7 @@ class DefaultModelLoaderV1(BaseModelLoader):
def clean_memory_fragments(self) -> None:
"""clean_memory_fragments"""
if current_platform.is_cuda() or current_platform.is_maca():
if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
paddle.device.empty_cache()
paddle.device.synchronize()
+11 -3
View File
@@ -17,6 +17,7 @@
from functools import partial
import paddle
from paddleformers.utils.log import logger
from fastdeploy import envs
from fastdeploy.config import FDConfig
@@ -66,10 +67,17 @@ class IluvatarModelRunner(GPUModelRunner):
not self.cache_config.enable_chunked_prefill
), "Iluvatar does not support chunked prefill for VL model"
if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
assert not self.use_cudagraph, "Iluvatar does not support cuda graph for weight_only_int4"
if self.model_config.model_type == "ernie4_5_moe_vl" and self.parallel_config.tensor_parallel_size > 1:
# ernie-vl does not support cuda graph for tp > 1
logger.warning("disable cudagraph since ernie-vl does not support cuda graph for tp > 1")
self.use_cudagraph = False
print(f"self.use_cudagraph={self.use_cudagraph}")
if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
# Iluvatar does not support cuda graph for weight_only_int4 yet
logger.warning("disable cudagraph since iluvatar does not support cuda graph for weight_only_int4")
self.use_cudagraph = False
logger.info(f"self.use_cudagraph={self.use_cudagraph}")
# VL neox style = True
emb_shape = self.share_inputs["rope_emb"].shape
if emb_shape[-1] == self.model_config.head_dim // 2:
+47 -1
View File
@@ -187,7 +187,7 @@ function check_server_status() {
echo -e "\n"
}
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle ==========="
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=1, enable_cudagraph) ==========="
clear_message
echo "Start server..."
python -m fastdeploy.entrypoints.openai.api_server \
@@ -233,6 +233,52 @@ fi
# fi
echo -e "\nPASSED"
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=2, enable_cudagraph) ==========="
clear_message
echo "Start server..."
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_DIR}/ERNIE-4.5-21B-A3B-Paddle \
--port 8180 \
--tensor-parallel-size 2 \
--quantization wint8 \
--max-model-len 32768 \
--max-num-seqs 8 \
--block-size 16 \
--graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
check_server_status
echo "Start inference..."
cp ${CI_PATH}/test.jsonl ./
python3 -u ${CI_PATH}/bench_gsm8k.py --port 8180 --num-questions 10 --num-shots 5 --parallel 8
exit_code=$?
echo -e "\nexit_code is ${exit_code}"
echo -e "\nStop server..."
stop_processes
echo -e "\nStop server done."
if [ ${exit_code} -ne 0 ]; then
print_error_message
exit 1
fi
acc=`python3 -c "import json; [print(json.loads(line)['accuracy']) for line in open('result.jsonl')]"`
latency=`python3 -c "import json; [print(json.loads(line)['latency']) for line in open('result.jsonl')]"`
expected_lowerest_acc=0.8
expected_largest_latency=60
if awk -v a="$acc" -v b="$expected_lowerest_acc" 'BEGIN {exit !(a < b)}'; then
echo -e "\nExit with Accuracy error, current accuracy $acc is less than $expected_lowerest_acc "
exit 1
fi
# if awk -v a="$latency" -v b="$expected_largest_latency" 'BEGIN {exit !(a > b)}'; then
# echo -e "\nExit with Latency Error, current latency $latency greater than $expected_largest_latency "
# exit 1
# fi
echo -e "\nPASSED"
echo -e "\n============ Online: start to test ERNIE-4.5-VL-28B-A3B-Paddle ==========="
clear_message
echo "Start server..."