[Iluvatar] Fix cuda graph error for tp > 1 in ernie models (#7126)

This commit is contained in:
yzwu
2026-04-01 19:13:34 +08:00
committed by GitHub
parent fdfc908e2f
commit ceaf5df350
5 changed files with 75 additions and 11 deletions
+15 -5
View File
@@ -21,6 +21,7 @@ import paddle.distributed as dist
from paddle.distributed import fleet
import fastdeploy.envs as envs
from fastdeploy.platforms import current_platform
from fastdeploy.utils import get_logger, register_custom_python_op
logger = get_logger("communication")
@@ -161,12 +162,21 @@ try:
return _TP_AR.custom_all_reduce(input_)
if paddle.in_dynamic_mode():
if group_ is not None:
dist.all_reduce(input_, group=group_)
if current_platform.is_iluvatar():
# use_calc_stream = False will raise event sync error when enable cuda graph and tp_size > 1
if group_ is not None:
stream.all_reduce(input_, op=ReduceOp.SUM, group=group_, sync_op=True, use_calc_stream=True)
else:
hcg = fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()
stream.all_reduce(input_, op=ReduceOp.SUM, group=mp_group, sync_op=True, use_calc_stream=True)
else:
hcg = fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()
dist.all_reduce(input_, group=mp_group)
if group_ is not None:
dist.all_reduce(input_, group=group_)
else:
hcg = fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()
dist.all_reduce(input_, group=mp_group)
else:
dist.all_reduce(input_)
return input_
@@ -43,7 +43,7 @@ class DefaultModelLoader(BaseModelLoader):
def clean_memory_fragments(self, state_dict: dict) -> None:
"""clean_memory_fragments"""
if current_platform.is_cuda() or current_platform.is_maca():
if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
if state_dict:
for k, v in state_dict.items():
if isinstance(v, paddle.Tensor):
@@ -49,7 +49,7 @@ class DefaultModelLoaderV1(BaseModelLoader):
def clean_memory_fragments(self) -> None:
"""clean_memory_fragments"""
if current_platform.is_cuda() or current_platform.is_maca():
if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
paddle.device.empty_cache()
paddle.device.synchronize()
+11 -3
View File
@@ -17,6 +17,7 @@
from functools import partial
import paddle
from paddleformers.utils.log import logger
from fastdeploy import envs
from fastdeploy.config import FDConfig
@@ -66,10 +67,17 @@ class IluvatarModelRunner(GPUModelRunner):
not self.cache_config.enable_chunked_prefill
), "Iluvatar does not support chunked prefill for VL model"
if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
assert not self.use_cudagraph, "Iluvatar does not support cuda graph for weight_only_int4"
if self.model_config.model_type == "ernie4_5_moe_vl" and self.parallel_config.tensor_parallel_size > 1:
# ernie-vl does not support cuda graph for tp > 1
logger.warning("disable cudagraph since ernie-vl does not support cuda graph for tp > 1")
self.use_cudagraph = False
print(f"self.use_cudagraph={self.use_cudagraph}")
if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
# Iluvatar does not support cuda graph for weight_only_int4 yet
logger.warning("disable cudagraph since iluvatar does not support cuda graph for weight_only_int4")
self.use_cudagraph = False
logger.info(f"self.use_cudagraph={self.use_cudagraph}")
# VL neox style = True
emb_shape = self.share_inputs["rope_emb"].shape
if emb_shape[-1] == self.model_config.head_dim // 2:
+47 -1
View File
@@ -187,7 +187,7 @@ function check_server_status() {
echo -e "\n"
}
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle ==========="
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=1, enable_cudagraph) ==========="
clear_message
echo "Start server..."
python -m fastdeploy.entrypoints.openai.api_server \
@@ -233,6 +233,52 @@ fi
# fi
echo -e "\nPASSED"
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=2, enable_cudagraph) ==========="
clear_message
echo "Start server..."
python -m fastdeploy.entrypoints.openai.api_server \
--model ${MODEL_DIR}/ERNIE-4.5-21B-A3B-Paddle \
--port 8180 \
--tensor-parallel-size 2 \
--quantization wint8 \
--max-model-len 32768 \
--max-num-seqs 8 \
--block-size 16 \
--graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
check_server_status
echo "Start inference..."
cp ${CI_PATH}/test.jsonl ./
python3 -u ${CI_PATH}/bench_gsm8k.py --port 8180 --num-questions 10 --num-shots 5 --parallel 8
exit_code=$?
echo -e "\nexit_code is ${exit_code}"
echo -e "\nStop server..."
stop_processes
echo -e "\nStop server done."
if [ ${exit_code} -ne 0 ]; then
print_error_message
exit 1
fi
acc=`python3 -c "import json; [print(json.loads(line)['accuracy']) for line in open('result.jsonl')]"`
latency=`python3 -c "import json; [print(json.loads(line)['latency']) for line in open('result.jsonl')]"`
expected_lowerest_acc=0.8
expected_largest_latency=60
if awk -v a="$acc" -v b="$expected_lowerest_acc" 'BEGIN {exit !(a < b)}'; then
echo -e "\nExit with Accuracy error, current accuracy $acc is less than $expected_lowerest_acc "
exit 1
fi
# if awk -v a="$latency" -v b="$expected_largest_latency" 'BEGIN {exit !(a > b)}'; then
# echo -e "\nExit with Latency Error, current latency $latency greater than $expected_largest_latency "
# exit 1
# fi
echo -e "\nPASSED"
echo -e "\n============ Online: start to test ERNIE-4.5-VL-28B-A3B-Paddle ==========="
clear_message
echo "Start server..."