[Iluvatar] Fix cuda graph error for tp > 1 in ernie models (#7126)

2026-04-22 16:07:51 +08:00 · 2026-04-01 19:13:34 +08:00
parent fdfc908e2f
commit ceaf5df350
5 changed files with 75 additions and 11 deletions
@@ -21,6 +21,7 @@ import paddle.distributed as dist
 from paddle.distributed import fleet
 import fastdeploy.envs as envs
 from fastdeploy.platforms import current_platform
 from fastdeploy.utils import get_logger, register_custom_python_op
 logger = get_logger("communication")
@@ -161,12 +162,21 @@ try:
            return _TP_AR.custom_all_reduce(input_)
        if paddle.in_dynamic_mode():
-            if group_ is not None:
+            if current_platform.is_iluvatar():
-                dist.all_reduce(input_, group=group_)
+                # use_calc_stream = False will raise event sync error when enable cuda graph and tp_size > 1
                if group_ is not None:
                    stream.all_reduce(input_, op=ReduceOp.SUM, group=group_, sync_op=True, use_calc_stream=True)
                else:
                    hcg = fleet.get_hybrid_communicate_group()
                    mp_group = hcg.get_model_parallel_group()
                    stream.all_reduce(input_, op=ReduceOp.SUM, group=mp_group, sync_op=True, use_calc_stream=True)
            else:
-                hcg = fleet.get_hybrid_communicate_group()
+                if group_ is not None:
-                mp_group = hcg.get_model_parallel_group()
+                    dist.all_reduce(input_, group=group_)
-                dist.all_reduce(input_, group=mp_group)
+                else:
                    hcg = fleet.get_hybrid_communicate_group()
                    mp_group = hcg.get_model_parallel_group()
                    dist.all_reduce(input_, group=mp_group)
        else:
            dist.all_reduce(input_)
        return input_
@@ -43,7 +43,7 @@ class DefaultModelLoader(BaseModelLoader):
    def clean_memory_fragments(self, state_dict: dict) -> None:
        """clean_memory_fragments"""
-        if current_platform.is_cuda() or current_platform.is_maca():
+        if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
            if state_dict:
                for k, v in state_dict.items():
                    if isinstance(v, paddle.Tensor):
@@ -49,7 +49,7 @@ class DefaultModelLoaderV1(BaseModelLoader):
    def clean_memory_fragments(self) -> None:
        """clean_memory_fragments"""
-        if current_platform.is_cuda() or current_platform.is_maca():
+        if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
            paddle.device.empty_cache()
            paddle.device.synchronize()
@@ -17,6 +17,7 @@
 from functools import partial
 import paddle
 from paddleformers.utils.log import logger
 from fastdeploy import envs
 from fastdeploy.config import FDConfig
@@ -66,10 +67,17 @@ class IluvatarModelRunner(GPUModelRunner):
                not self.cache_config.enable_chunked_prefill
            ), "Iluvatar does not support chunked prefill for VL model"
-        if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
+        if self.model_config.model_type == "ernie4_5_moe_vl" and self.parallel_config.tensor_parallel_size > 1:
-            assert not self.use_cudagraph, "Iluvatar does not support cuda graph for weight_only_int4"
+            # ernie-vl does not support cuda graph for tp > 1
            logger.warning("disable cudagraph since ernie-vl does not support cuda graph for tp > 1")
            self.use_cudagraph = False
-        print(f"self.use_cudagraph={self.use_cudagraph}")
+        if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
            # Iluvatar does not support cuda graph for weight_only_int4 yet
            logger.warning("disable cudagraph since iluvatar does not support cuda graph for weight_only_int4")
            self.use_cudagraph = False
        logger.info(f"self.use_cudagraph={self.use_cudagraph}")
        # VL neox style = True
        emb_shape = self.share_inputs["rope_emb"].shape
        if emb_shape[-1] == self.model_config.head_dim // 2:
@@ -187,7 +187,7 @@ function check_server_status() {
    echo -e "\n"
 }
-echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle ==========="
+echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=1, enable_cudagraph) ==========="
 clear_message
 echo "Start server..."
 python -m fastdeploy.entrypoints.openai.api_server \
@@ -233,6 +233,52 @@ fi
 # fi
 echo -e "\nPASSED"
 echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=2, enable_cudagraph) ==========="
 clear_message
 echo "Start server..."
 python -m fastdeploy.entrypoints.openai.api_server \
       --model ${MODEL_DIR}/ERNIE-4.5-21B-A3B-Paddle \
       --port 8180 \
       --tensor-parallel-size 2 \
       --quantization wint8 \
       --max-model-len 32768 \
       --max-num-seqs 8 \
       --block-size 16 \
       --graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
 check_server_status
 echo "Start inference..."
 cp ${CI_PATH}/test.jsonl ./
 python3 -u ${CI_PATH}/bench_gsm8k.py --port 8180 --num-questions 10 --num-shots 5 --parallel 8
 exit_code=$?
 echo -e "\nexit_code is ${exit_code}"
 echo -e "\nStop server..."
 stop_processes
 echo -e "\nStop server done."
 if [ ${exit_code} -ne 0 ]; then
    print_error_message
    exit 1
 fi
 acc=`python3 -c "import json; [print(json.loads(line)['latency']) for line in open('result.jsonl')]"`
 latency=`python3 -c "import json; [print(json.loads(line)['latency']) for line in open('result.jsonl')]"`
 expected_lowerest_acc=0.8
 expected_largest_latency=60
 if awk -v a="$acc" -v b="$expected_lowerest_acc" 'BEGIN {exit !(a < b)}'; then
    echo -e "\nExit with Accucary error, current accuracy $acc less than $expected_lowerest_acc "
    exit 1
 fi
 # if awk -v a="$latency" -v b="$expected_largest_latency" 'BEGIN {exit !(a > b)}'; then
 #     echo -e "\nExit with Latency Error, current latency $latency greater than $expected_largest_latency "
 #     exit 1
 # fi
 echo -e "\nPASSED"
 echo -e "\n============ Online: start to test ERNIE-4.5-VL-28B-A3B-Paddle ==========="
 clear_message
 echo "Start server..."