[Iluvatar] Fix cuda graph error for tp > 1 in ernie models (#7126)

This commit is contained in:
yzwu
2026-04-01 19:13:34 +08:00
committed by GitHub
parent fdfc908e2f
commit ceaf5df350
5 changed files with 75 additions and 11 deletions
+15 -5
View File
@@ -21,6 +21,7 @@ import paddle.distributed as dist
from paddle.distributed import fleet from paddle.distributed import fleet
import fastdeploy.envs as envs import fastdeploy.envs as envs
from fastdeploy.platforms import current_platform
from fastdeploy.utils import get_logger, register_custom_python_op from fastdeploy.utils import get_logger, register_custom_python_op
logger = get_logger("communication") logger = get_logger("communication")
@@ -161,12 +162,21 @@ try:
return _TP_AR.custom_all_reduce(input_) return _TP_AR.custom_all_reduce(input_)
if paddle.in_dynamic_mode(): if paddle.in_dynamic_mode():
if group_ is not None: if current_platform.is_iluvatar():
dist.all_reduce(input_, group=group_) # use_calc_stream = False will raise event sync error when enable cuda graph and tp_size > 1
if group_ is not None:
stream.all_reduce(input_, op=ReduceOp.SUM, group=group_, sync_op=True, use_calc_stream=True)
else:
hcg = fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()
stream.all_reduce(input_, op=ReduceOp.SUM, group=mp_group, sync_op=True, use_calc_stream=True)
else: else:
hcg = fleet.get_hybrid_communicate_group() if group_ is not None:
mp_group = hcg.get_model_parallel_group() dist.all_reduce(input_, group=group_)
dist.all_reduce(input_, group=mp_group) else:
hcg = fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()
dist.all_reduce(input_, group=mp_group)
else: else:
dist.all_reduce(input_) dist.all_reduce(input_)
return input_ return input_
@@ -43,7 +43,7 @@ class DefaultModelLoader(BaseModelLoader):
def clean_memory_fragments(self, state_dict: dict) -> None: def clean_memory_fragments(self, state_dict: dict) -> None:
"""clean_memory_fragments""" """clean_memory_fragments"""
if current_platform.is_cuda() or current_platform.is_maca(): if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
if state_dict: if state_dict:
for k, v in state_dict.items(): for k, v in state_dict.items():
if isinstance(v, paddle.Tensor): if isinstance(v, paddle.Tensor):
@@ -49,7 +49,7 @@ class DefaultModelLoaderV1(BaseModelLoader):
def clean_memory_fragments(self) -> None: def clean_memory_fragments(self) -> None:
"""clean_memory_fragments""" """clean_memory_fragments"""
if current_platform.is_cuda() or current_platform.is_maca(): if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
paddle.device.empty_cache() paddle.device.empty_cache()
paddle.device.synchronize() paddle.device.synchronize()
+11 -3
View File
@@ -17,6 +17,7 @@
from functools import partial from functools import partial
import paddle import paddle
from paddleformers.utils.log import logger
from fastdeploy import envs from fastdeploy import envs
from fastdeploy.config import FDConfig from fastdeploy.config import FDConfig
@@ -66,10 +67,17 @@ class IluvatarModelRunner(GPUModelRunner):
not self.cache_config.enable_chunked_prefill not self.cache_config.enable_chunked_prefill
), "Iluvatar does not support chunked prefill for VL model" ), "Iluvatar does not support chunked prefill for VL model"
if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4": if self.model_config.model_type == "ernie4_5_moe_vl" and self.parallel_config.tensor_parallel_size > 1:
assert not self.use_cudagraph, "Iluvatar does not support cuda graph for weight_only_int4" # ernie-vl does not support cuda graph for tp > 1
logger.warning("disable cudagraph since ernie-vl does not support cuda graph for tp > 1")
self.use_cudagraph = False
print(f"self.use_cudagraph={self.use_cudagraph}") if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
# Iluvatar does not support cuda graph for weight_only_int4 yet
logger.warning("disable cudagraph since iluvatar does not support cuda graph for weight_only_int4")
self.use_cudagraph = False
logger.info(f"self.use_cudagraph={self.use_cudagraph}")
# VL neox style = True # VL neox style = True
emb_shape = self.share_inputs["rope_emb"].shape emb_shape = self.share_inputs["rope_emb"].shape
if emb_shape[-1] == self.model_config.head_dim // 2: if emb_shape[-1] == self.model_config.head_dim // 2:
+47 -1
View File
@@ -187,7 +187,7 @@ function check_server_status() {
echo -e "\n" echo -e "\n"
} }
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle ===========" echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=1, enable_cudagraph) ==========="
clear_message clear_message
echo "Start server..." echo "Start server..."
python -m fastdeploy.entrypoints.openai.api_server \ python -m fastdeploy.entrypoints.openai.api_server \
@@ -233,6 +233,52 @@ fi
# fi # fi
echo -e "\nPASSED" echo -e "\nPASSED"
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=2, enable_cudagraph) ==========="
clear_message
echo "Start server..."
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_DIR}/ERNIE-4.5-21B-A3B-Paddle \
--port 8180 \
--tensor-parallel-size 2 \
--quantization wint8 \
--max-model-len 32768 \
--max-num-seqs 8 \
--block-size 16 \
--graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
check_server_status
echo "Start inference..."
cp ${CI_PATH}/test.jsonl ./
python3 -u ${CI_PATH}/bench_gsm8k.py --port 8180 --num-questions 10 --num-shots 5 --parallel 8
exit_code=$?
echo -e "\nexit_code is ${exit_code}"
echo -e "\nStop server..."
stop_processes
echo -e "\nStop server done."
if [ ${exit_code} -ne 0 ]; then
print_error_message
exit 1
fi
acc=`python3 -c "import json; [print(json.loads(line)['accuracy']) for line in open('result.jsonl')]"`
latency=`python3 -c "import json; [print(json.loads(line)['latency']) for line in open('result.jsonl')]"`
expected_lowerest_acc=0.8
expected_largest_latency=60
if awk -v a="$acc" -v b="$expected_lowerest_acc" 'BEGIN {exit !(a < b)}'; then
echo -e "\nExit with Accuracy error, current accuracy $acc less than $expected_lowerest_acc "
exit 1
fi
# if awk -v a="$latency" -v b="$expected_largest_latency" 'BEGIN {exit !(a > b)}'; then
# echo -e "\nExit with Latency Error, current latency $latency greater than $expected_largest_latency "
# exit 1
# fi
echo -e "\nPASSED"
echo -e "\n============ Online: start to test ERNIE-4.5-VL-28B-A3B-Paddle ===========" echo -e "\n============ Online: start to test ERNIE-4.5-VL-28B-A3B-Paddle ==========="
clear_message clear_message
echo "Start server..." echo "Start server..."