mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[Iluvatar] Fix cuda graph error for tp > 1 in ernie models (#7126)
This commit is contained in:
@@ -21,6 +21,7 @@ import paddle.distributed as dist
|
||||
from paddle.distributed import fleet
|
||||
|
||||
import fastdeploy.envs as envs
|
||||
from fastdeploy.platforms import current_platform
|
||||
from fastdeploy.utils import get_logger, register_custom_python_op
|
||||
|
||||
logger = get_logger("communication")
|
||||
@@ -161,6 +162,15 @@ try:
|
||||
return _TP_AR.custom_all_reduce(input_)
|
||||
|
||||
if paddle.in_dynamic_mode():
|
||||
if current_platform.is_iluvatar():
|
||||
# use_calc_stream = False will raise event sync error when enable cuda graph and tp_size > 1
|
||||
if group_ is not None:
|
||||
stream.all_reduce(input_, op=ReduceOp.SUM, group=group_, sync_op=True, use_calc_stream=True)
|
||||
else:
|
||||
hcg = fleet.get_hybrid_communicate_group()
|
||||
mp_group = hcg.get_model_parallel_group()
|
||||
stream.all_reduce(input_, op=ReduceOp.SUM, group=mp_group, sync_op=True, use_calc_stream=True)
|
||||
else:
|
||||
if group_ is not None:
|
||||
dist.all_reduce(input_, group=group_)
|
||||
else:
|
||||
|
||||
@@ -43,7 +43,7 @@ class DefaultModelLoader(BaseModelLoader):
|
||||
|
||||
def clean_memory_fragments(self, state_dict: dict) -> None:
|
||||
"""clean_memory_fragments"""
|
||||
if current_platform.is_cuda() or current_platform.is_maca():
|
||||
if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
|
||||
if state_dict:
|
||||
for k, v in state_dict.items():
|
||||
if isinstance(v, paddle.Tensor):
|
||||
|
||||
@@ -49,7 +49,7 @@ class DefaultModelLoaderV1(BaseModelLoader):
|
||||
|
||||
def clean_memory_fragments(self) -> None:
|
||||
"""clean_memory_fragments"""
|
||||
if current_platform.is_cuda() or current_platform.is_maca():
|
||||
if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
|
||||
paddle.device.empty_cache()
|
||||
paddle.device.synchronize()
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
from functools import partial
|
||||
|
||||
import paddle
|
||||
from paddleformers.utils.log import logger
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.config import FDConfig
|
||||
@@ -66,10 +67,17 @@ class IluvatarModelRunner(GPUModelRunner):
|
||||
not self.cache_config.enable_chunked_prefill
|
||||
), "Iluvatar does not support chunked prefill for VL model"
|
||||
|
||||
if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
|
||||
assert not self.use_cudagraph, "Iluvatar does not support cuda graph for weight_only_int4"
|
||||
if self.model_config.model_type == "ernie4_5_moe_vl" and self.parallel_config.tensor_parallel_size > 1:
|
||||
# ernie-vl does not support cuda graph for tp > 1
|
||||
logger.warning("disable cudagraph since ernie-vl does not support cuda graph for tp > 1")
|
||||
self.use_cudagraph = False
|
||||
|
||||
print(f"self.use_cudagraph={self.use_cudagraph}")
|
||||
if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
|
||||
# Iluvatar does not support cuda graph for weight_only_int4 yet
|
||||
logger.warning("disable cudagraph since iluvatar does not support cuda graph for weight_only_int4")
|
||||
self.use_cudagraph = False
|
||||
|
||||
logger.info(f"self.use_cudagraph={self.use_cudagraph}")
|
||||
# VL neox style = True
|
||||
emb_shape = self.share_inputs["rope_emb"].shape
|
||||
if emb_shape[-1] == self.model_config.head_dim // 2:
|
||||
|
||||
@@ -187,7 +187,7 @@ function check_server_status() {
|
||||
echo -e "\n"
|
||||
}
|
||||
|
||||
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle ==========="
|
||||
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=1, enable_cudagraph) ==========="
|
||||
clear_message
|
||||
echo "Start server..."
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
@@ -233,6 +233,52 @@ fi
|
||||
# fi
|
||||
echo -e "\nPASSED"
|
||||
|
||||
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=2, enable_cudagraph) ==========="
|
||||
clear_message
|
||||
echo "Start server..."
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model ${MODEL_DIR}/ERNIE-4.5-21B-A3B-Paddle \
|
||||
--port 8180 \
|
||||
--tensor-parallel-size 2 \
|
||||
--quantization wint8 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-seqs 8 \
|
||||
--block-size 16 \
|
||||
--graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
|
||||
|
||||
check_server_status
|
||||
|
||||
echo "Start inference..."
|
||||
cp ${CI_PATH}/test.jsonl ./
|
||||
python3 -u ${CI_PATH}/bench_gsm8k.py --port 8180 --num-questions 10 --num-shots 5 --parallel 8
|
||||
|
||||
exit_code=$?
|
||||
echo -e "\nexit_code is ${exit_code}"
|
||||
|
||||
echo -e "\nStop server..."
|
||||
stop_processes
|
||||
echo -e "\nStop server done."
|
||||
|
||||
if [ ${exit_code} -ne 0 ]; then
|
||||
print_error_message
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Read the accuracy field (not latency) from the benchmark result so the accuracy threshold check compares the right metric.
acc=`python3 -c "import json; [print(json.loads(line)['accuracy']) for line in open('result.jsonl')]"`
|
||||
latency=`python3 -c "import json; [print(json.loads(line)['latency']) for line in open('result.jsonl')]"`
|
||||
expected_lowerest_acc=0.8
|
||||
expected_largest_latency=60
|
||||
if awk -v a="$acc" -v b="$expected_lowerest_acc" 'BEGIN {exit !(a < b)}'; then
|
||||
echo -e "\nExit with Accucary error, current accuracy $acc less than $expected_lowerest_acc "
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# if awk -v a="$latency" -v b="$expected_largest_latency" 'BEGIN {exit !(a > b)}'; then
|
||||
# echo -e "\nExit with Latency Error, current latency $latency greater than $expected_largest_latency "
|
||||
# exit 1
|
||||
# fi
|
||||
echo -e "\nPASSED"
|
||||
|
||||
echo -e "\n============ Online: start to test ERNIE-4.5-VL-28B-A3B-Paddle ==========="
|
||||
clear_message
|
||||
echo "Start server..."
|
||||
|
||||
Reference in New Issue
Block a user