From ceaf5df350aa1d474db50f78698cf35a6ebeead3 Mon Sep 17 00:00:00 2001 From: yzwu Date: Wed, 1 Apr 2026 19:13:34 +0800 Subject: [PATCH] [Iluvatar] Fix cuda graph error for tp > 1 in ernie models (#7126) --- fastdeploy/distributed/communication.py | 20 ++++++-- .../model_loader/default_loader.py | 2 +- .../model_loader/default_loader_v1.py | 2 +- fastdeploy/worker/iluvatar_model_runner.py | 14 ++++-- scripts/run_ci_iluvatar.sh | 48 ++++++++++++++++++- 5 files changed, 75 insertions(+), 11 deletions(-) diff --git a/fastdeploy/distributed/communication.py b/fastdeploy/distributed/communication.py index aa0164f9ea..475ce14076 100644 --- a/fastdeploy/distributed/communication.py +++ b/fastdeploy/distributed/communication.py @@ -21,6 +21,7 @@ import paddle.distributed as dist from paddle.distributed import fleet import fastdeploy.envs as envs +from fastdeploy.platforms import current_platform from fastdeploy.utils import get_logger, register_custom_python_op logger = get_logger("communication") @@ -161,12 +162,21 @@ try: return _TP_AR.custom_all_reduce(input_) if paddle.in_dynamic_mode(): - if group_ is not None: - dist.all_reduce(input_, group=group_) + if current_platform.is_iluvatar(): + # use_calc_stream = False will raise event sync error when enable cuda graph and tp_size > 1 + if group_ is not None: + stream.all_reduce(input_, op=ReduceOp.SUM, group=group_, sync_op=True, use_calc_stream=True) + else: + hcg = fleet.get_hybrid_communicate_group() + mp_group = hcg.get_model_parallel_group() + stream.all_reduce(input_, op=ReduceOp.SUM, group=mp_group, sync_op=True, use_calc_stream=True) else: - hcg = fleet.get_hybrid_communicate_group() - mp_group = hcg.get_model_parallel_group() - dist.all_reduce(input_, group=mp_group) + if group_ is not None: + dist.all_reduce(input_, group=group_) + else: + hcg = fleet.get_hybrid_communicate_group() + mp_group = hcg.get_model_parallel_group() + dist.all_reduce(input_, group=mp_group) else: dist.all_reduce(input_) return input_ 
diff --git a/fastdeploy/model_executor/model_loader/default_loader.py b/fastdeploy/model_executor/model_loader/default_loader.py index ca0dfa84f9..9945f9f31a 100644 --- a/fastdeploy/model_executor/model_loader/default_loader.py +++ b/fastdeploy/model_executor/model_loader/default_loader.py @@ -43,7 +43,7 @@ class DefaultModelLoader(BaseModelLoader): def clean_memory_fragments(self, state_dict: dict) -> None: """clean_memory_fragments""" - if current_platform.is_cuda() or current_platform.is_maca(): + if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar(): if state_dict: for k, v in state_dict.items(): if isinstance(v, paddle.Tensor): diff --git a/fastdeploy/model_executor/model_loader/default_loader_v1.py b/fastdeploy/model_executor/model_loader/default_loader_v1.py index b47d0e3f74..ce78758b10 100644 --- a/fastdeploy/model_executor/model_loader/default_loader_v1.py +++ b/fastdeploy/model_executor/model_loader/default_loader_v1.py @@ -49,7 +49,7 @@ class DefaultModelLoaderV1(BaseModelLoader): def clean_memory_fragments(self) -> None: """clean_memory_fragments""" - if current_platform.is_cuda() or current_platform.is_maca(): + if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar(): paddle.device.empty_cache() paddle.device.synchronize() diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index 1d6eefd52c..362653eacc 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -17,6 +17,7 @@ from functools import partial import paddle +from paddleformers.utils.log import logger from fastdeploy import envs from fastdeploy.config import FDConfig @@ -66,10 +67,17 @@ class IluvatarModelRunner(GPUModelRunner): not self.cache_config.enable_chunked_prefill ), "Iluvatar does not support chunked prefill for VL model" - if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == 
"wint4": - assert not self.use_cudagraph, "Iluvatar does not support cuda graph for weight_only_int4" + if self.model_config.model_type == "ernie4_5_moe_vl" and self.parallel_config.tensor_parallel_size > 1: + # ernie-vl does not support cuda graph for tp > 1 + logger.warning("disable cudagraph since ernie-vl does not support cuda graph for tp > 1") + self.use_cudagraph = False - print(f"self.use_cudagraph={self.use_cudagraph}") + if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4": + # Iluvatar does not support cuda graph for weight_only_int4 yet + logger.warning("disable cudagraph since iluvatar does not support cuda graph for weight_only_int4") + self.use_cudagraph = False + + logger.info(f"self.use_cudagraph={self.use_cudagraph}") # VL neox style = True emb_shape = self.share_inputs["rope_emb"].shape if emb_shape[-1] == self.model_config.head_dim // 2: diff --git a/scripts/run_ci_iluvatar.sh b/scripts/run_ci_iluvatar.sh index 6e249af64f..50642cd641 100644 --- a/scripts/run_ci_iluvatar.sh +++ b/scripts/run_ci_iluvatar.sh @@ -187,7 +187,7 @@ function check_server_status() { echo -e "\n" } -echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle ===========" +echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=1, enable_cudagraph) ===========" clear_message echo "Start server..." python -m fastdeploy.entrypoints.openai.api_server \ @@ -233,6 +233,52 @@ fi # fi echo -e "\nPASSED" +echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=2, enable_cudagraph) ===========" +clear_message +echo "Start server..." 
+python -m fastdeploy.entrypoints.openai.api_server \
+    --model ${MODEL_DIR}/ERNIE-4.5-21B-A3B-Paddle \
+    --port 8180 \
+    --tensor-parallel-size 2 \
+    --quantization wint8 \
+    --max-model-len 32768 \
+    --max-num-seqs 8 \
+    --block-size 16 \
+    --graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
+
+check_server_status
+
+echo "Start inference..."
+cp ${CI_PATH}/test.jsonl ./
+python3 -u ${CI_PATH}/bench_gsm8k.py --port 8180 --num-questions 10 --num-shots 5 --parallel 8
+
+exit_code=$?
+echo -e "\nexit_code is ${exit_code}"
+
+echo -e "\nStop server..."
+stop_processes
+echo -e "\nStop server done."
+
+if [ ${exit_code} -ne 0 ]; then
+    print_error_message
+    exit 1
+fi
+
+acc=`python3 -c "import json; [print(json.loads(line)['accuracy']) for line in open('result.jsonl')]"`
+latency=`python3 -c "import json; [print(json.loads(line)['latency']) for line in open('result.jsonl')]"`
+expected_lowerest_acc=0.8
+expected_largest_latency=60
+if awk -v a="$acc" -v b="$expected_lowerest_acc" 'BEGIN {exit !(a < b)}'; then
+    echo -e "\nExit with Accuracy error, current accuracy $acc less than $expected_lowerest_acc "
+    exit 1
+fi
+
+# if awk -v a="$latency" -v b="$expected_largest_latency" 'BEGIN {exit !(a > b)}'; then
+#     echo -e "\nExit with Latency Error, current latency $latency greater than $expected_largest_latency "
+#     exit 1
+# fi
+echo -e "\nPASSED"
+
 echo -e "\n============ Online: start to test ERNIE-4.5-VL-28B-A3B-Paddle ==========="
 clear_message
 echo "Start server..."