mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[Iluvatar] Fix cuda graph error for tp > 1 in ernie models (#7126)
This commit is contained in:
@@ -21,6 +21,7 @@ import paddle.distributed as dist
|
|||||||
from paddle.distributed import fleet
|
from paddle.distributed import fleet
|
||||||
|
|
||||||
import fastdeploy.envs as envs
|
import fastdeploy.envs as envs
|
||||||
|
from fastdeploy.platforms import current_platform
|
||||||
from fastdeploy.utils import get_logger, register_custom_python_op
|
from fastdeploy.utils import get_logger, register_custom_python_op
|
||||||
|
|
||||||
logger = get_logger("communication")
|
logger = get_logger("communication")
|
||||||
@@ -161,12 +162,21 @@ try:
|
|||||||
return _TP_AR.custom_all_reduce(input_)
|
return _TP_AR.custom_all_reduce(input_)
|
||||||
|
|
||||||
if paddle.in_dynamic_mode():
|
if paddle.in_dynamic_mode():
|
||||||
if group_ is not None:
|
if current_platform.is_iluvatar():
|
||||||
dist.all_reduce(input_, group=group_)
|
# use_calc_stream=False raises an event sync error when cuda graph is enabled and tp_size > 1
|
||||||
|
if group_ is not None:
|
||||||
|
stream.all_reduce(input_, op=ReduceOp.SUM, group=group_, sync_op=True, use_calc_stream=True)
|
||||||
|
else:
|
||||||
|
hcg = fleet.get_hybrid_communicate_group()
|
||||||
|
mp_group = hcg.get_model_parallel_group()
|
||||||
|
stream.all_reduce(input_, op=ReduceOp.SUM, group=mp_group, sync_op=True, use_calc_stream=True)
|
||||||
else:
|
else:
|
||||||
hcg = fleet.get_hybrid_communicate_group()
|
if group_ is not None:
|
||||||
mp_group = hcg.get_model_parallel_group()
|
dist.all_reduce(input_, group=group_)
|
||||||
dist.all_reduce(input_, group=mp_group)
|
else:
|
||||||
|
hcg = fleet.get_hybrid_communicate_group()
|
||||||
|
mp_group = hcg.get_model_parallel_group()
|
||||||
|
dist.all_reduce(input_, group=mp_group)
|
||||||
else:
|
else:
|
||||||
dist.all_reduce(input_)
|
dist.all_reduce(input_)
|
||||||
return input_
|
return input_
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ class DefaultModelLoader(BaseModelLoader):
|
|||||||
|
|
||||||
def clean_memory_fragments(self, state_dict: dict) -> None:
|
def clean_memory_fragments(self, state_dict: dict) -> None:
|
||||||
"""clean_memory_fragments"""
|
"""clean_memory_fragments"""
|
||||||
if current_platform.is_cuda() or current_platform.is_maca():
|
if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
|
||||||
if state_dict:
|
if state_dict:
|
||||||
for k, v in state_dict.items():
|
for k, v in state_dict.items():
|
||||||
if isinstance(v, paddle.Tensor):
|
if isinstance(v, paddle.Tensor):
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ class DefaultModelLoaderV1(BaseModelLoader):
|
|||||||
|
|
||||||
def clean_memory_fragments(self) -> None:
|
def clean_memory_fragments(self) -> None:
|
||||||
"""clean_memory_fragments"""
|
"""clean_memory_fragments"""
|
||||||
if current_platform.is_cuda() or current_platform.is_maca():
|
if current_platform.is_cuda() or current_platform.is_maca() or current_platform.is_iluvatar():
|
||||||
paddle.device.empty_cache()
|
paddle.device.empty_cache()
|
||||||
paddle.device.synchronize()
|
paddle.device.synchronize()
|
||||||
|
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
from functools import partial
|
from functools import partial
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
|
from paddleformers.utils.log import logger
|
||||||
|
|
||||||
from fastdeploy import envs
|
from fastdeploy import envs
|
||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
@@ -66,10 +67,17 @@ class IluvatarModelRunner(GPUModelRunner):
|
|||||||
not self.cache_config.enable_chunked_prefill
|
not self.cache_config.enable_chunked_prefill
|
||||||
), "Iluvatar does not support chunked prefill for VL model"
|
), "Iluvatar does not support chunked prefill for VL model"
|
||||||
|
|
||||||
if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
|
if self.model_config.model_type == "ernie4_5_moe_vl" and self.parallel_config.tensor_parallel_size > 1:
|
||||||
assert not self.use_cudagraph, "Iluvatar does not support cuda graph for weight_only_int4"
|
# ernie-vl does not support cuda graph for tp > 1
|
||||||
|
logger.warning("disable cudagraph since ernie-vl does not support cuda graph for tp > 1")
|
||||||
|
self.use_cudagraph = False
|
||||||
|
|
||||||
print(f"self.use_cudagraph={self.use_cudagraph}")
|
if hasattr(self.quant_config, "moe_quant_type") and self.quant_config.moe_quant_type == "wint4":
|
||||||
|
# Iluvatar does not support cuda graph for weight_only_int4 yet
|
||||||
|
logger.warning("disable cudagraph since iluvatar does not support cuda graph for weight_only_int4")
|
||||||
|
self.use_cudagraph = False
|
||||||
|
|
||||||
|
logger.info(f"self.use_cudagraph={self.use_cudagraph}")
|
||||||
# VL neox style = True
|
# VL neox style = True
|
||||||
emb_shape = self.share_inputs["rope_emb"].shape
|
emb_shape = self.share_inputs["rope_emb"].shape
|
||||||
if emb_shape[-1] == self.model_config.head_dim // 2:
|
if emb_shape[-1] == self.model_config.head_dim // 2:
|
||||||
|
|||||||
@@ -187,7 +187,7 @@ function check_server_status() {
|
|||||||
echo -e "\n"
|
echo -e "\n"
|
||||||
}
|
}
|
||||||
|
|
||||||
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle ==========="
|
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=1, enable_cudagraph) ==========="
|
||||||
clear_message
|
clear_message
|
||||||
echo "Start server..."
|
echo "Start server..."
|
||||||
python -m fastdeploy.entrypoints.openai.api_server \
|
python -m fastdeploy.entrypoints.openai.api_server \
|
||||||
@@ -233,6 +233,52 @@ fi
|
|||||||
# fi
|
# fi
|
||||||
echo -e "\nPASSED"
|
echo -e "\nPASSED"
|
||||||
|
|
||||||
|
echo "============ Online: start to test ERNIE-4.5-21B-A3B-Paddle (wint8, tp=2, enable_cudagraph) ==========="
|
||||||
|
clear_message
|
||||||
|
echo "Start server..."
|
||||||
|
python -m fastdeploy.entrypoints.openai.api_server \
|
||||||
|
--model ${MODEL_DIR}/ERNIE-4.5-21B-A3B-Paddle \
|
||||||
|
--port 8180 \
|
||||||
|
--tensor-parallel-size 2 \
|
||||||
|
--quantization wint8 \
|
||||||
|
--max-model-len 32768 \
|
||||||
|
--max-num-seqs 8 \
|
||||||
|
--block-size 16 \
|
||||||
|
--graph-optimization-config '{"use_cudagraph": true}' > server.log 2>&1 &
|
||||||
|
|
||||||
|
check_server_status
|
||||||
|
|
||||||
|
echo "Start inference..."
|
||||||
|
cp ${CI_PATH}/test.jsonl ./
|
||||||
|
python3 -u ${CI_PATH}/bench_gsm8k.py --port 8180 --num-questions 10 --num-shots 5 --parallel 8
|
||||||
|
|
||||||
|
exit_code=$?
|
||||||
|
echo -e "\nexit_code is ${exit_code}"
|
||||||
|
|
||||||
|
echo -e "\nStop server..."
|
||||||
|
stop_processes
|
||||||
|
echo -e "\nStop server done."
|
||||||
|
|
||||||
|
if [ ${exit_code} -ne 0 ]; then
|
||||||
|
print_error_message
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
acc=`python3 -c "import json; [print(json.loads(line)['accuracy']) for line in open('result.jsonl')]"`
|
||||||
|
latency=`python3 -c "import json; [print(json.loads(line)['latency']) for line in open('result.jsonl')]"`
|
||||||
|
expected_lowerest_acc=0.8
|
||||||
|
expected_largest_latency=60
|
||||||
|
if awk -v a="$acc" -v b="$expected_lowerest_acc" 'BEGIN {exit !(a < b)}'; then
|
||||||
|
echo -e "\nExit with Accuracy error, current accuracy $acc less than $expected_lowerest_acc "
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# if awk -v a="$latency" -v b="$expected_largest_latency" 'BEGIN {exit !(a > b)}'; then
|
||||||
|
# echo -e "\nExit with Latency Error, current latency $latency greater than $expected_largest_latency "
|
||||||
|
# exit 1
|
||||||
|
# fi
|
||||||
|
echo -e "\nPASSED"
|
||||||
|
|
||||||
echo -e "\n============ Online: start to test ERNIE-4.5-VL-28B-A3B-Paddle ==========="
|
echo -e "\n============ Online: start to test ERNIE-4.5-VL-28B-A3B-Paddle ==========="
|
||||||
clear_message
|
clear_message
|
||||||
echo "Start server..."
|
echo "Start server..."
|
||||||
|
|||||||
Reference in New Issue
Block a user