mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
xpu (#4969)
This commit is contained in:
@@ -353,6 +353,13 @@ def h2d_copy(dst, src, blocking=True):
|
|||||||
def v1_loader_support(fd_config):
|
def v1_loader_support(fd_config):
|
||||||
_v1_no_support_archs = ["Qwen2VLForConditionalGeneration"]
|
_v1_no_support_archs = ["Qwen2VLForConditionalGeneration"]
|
||||||
|
|
||||||
|
def _get_unsupported_quant():
|
||||||
|
if current_platform.is_cuda():
|
||||||
|
return {"w4a8", "w4afp8", "wint2"}
|
||||||
|
elif current_platform.is_xpu():
|
||||||
|
return {"w4a8", "w8a8"}
|
||||||
|
return set()
|
||||||
|
|
||||||
def _err_msg(msg: str) -> str:
|
def _err_msg(msg: str) -> str:
|
||||||
logger.info(msg + "; fallback to the v0 loader for model loading.")
|
logger.info(msg + "; fallback to the v0 loader for model loading.")
|
||||||
|
|
||||||
@@ -375,7 +382,7 @@ def v1_loader_support(fd_config):
|
|||||||
else:
|
else:
|
||||||
moe_quant_type = fd_config.quant_config.name()
|
moe_quant_type = fd_config.quant_config.name()
|
||||||
dense_quant_type = fd_config.quant_config.name()
|
dense_quant_type = fd_config.quant_config.name()
|
||||||
unsupported_quant = {"w4a8", "w4afp8", "wint2"}
|
unsupported_quant = _get_unsupported_quant()
|
||||||
|
|
||||||
if unsupported_quant & {moe_quant_type, dense_quant_type}:
|
if unsupported_quant & {moe_quant_type, dense_quant_type}:
|
||||||
_err_msg("v1 loader currently does not support w4a8/w4afp8/win2 quantization")
|
_err_msg("v1 loader currently does not support w4a8/w4afp8/win2 quantization")
|
||||||
|
|||||||
@@ -27,6 +27,7 @@ moviepy
|
|||||||
use-triton-in-paddle
|
use-triton-in-paddle
|
||||||
crcmod
|
crcmod
|
||||||
fastsafetensors==0.1.14
|
fastsafetensors==0.1.14
|
||||||
|
safetensors==0.7.0rc0
|
||||||
msgpack
|
msgpack
|
||||||
gunicorn
|
gunicorn
|
||||||
opentelemetry-api>=1.24.0
|
opentelemetry-api>=1.24.0
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ triton
|
|||||||
use-triton-in-paddle
|
use-triton-in-paddle
|
||||||
crcmod
|
crcmod
|
||||||
fastsafetensors==0.1.14
|
fastsafetensors==0.1.14
|
||||||
|
safetensors==0.7.0rc0
|
||||||
msgpack
|
msgpack
|
||||||
gunicorn
|
gunicorn
|
||||||
modelscope
|
modelscope
|
||||||
|
|||||||
@@ -81,8 +81,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--num-gpu-blocks-override 16384 \
|
--num-gpu-blocks-override 16384 \
|
||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-seqs 128 \
|
--max-num-seqs 128 \
|
||||||
--quantization wint4 \
|
--quantization wint4 > server.log 2>&1 &
|
||||||
--load-choices default > server.log 2>&1 &
|
|
||||||
|
|
||||||
sleep 60
|
sleep 60
|
||||||
# 探活
|
# 探活
|
||||||
@@ -157,8 +156,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--num-gpu-blocks-override 16384 \
|
--num-gpu-blocks-override 16384 \
|
||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-seqs 64 \
|
--max-num-seqs 64 \
|
||||||
--quantization "W4A8" \
|
--quantization "W4A8" > server.log 2>&1 &
|
||||||
--load-choices default > server.log 2>&1 &
|
|
||||||
|
|
||||||
sleep 60
|
sleep 60
|
||||||
# 探活
|
# 探活
|
||||||
@@ -236,8 +234,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--enable-mm \
|
--enable-mm \
|
||||||
--mm-processor-kwargs '{"video_max_frames": 30}' \
|
--mm-processor-kwargs '{"video_max_frames": 30}' \
|
||||||
--limit-mm-per-prompt '{"image": 10, "video": 3}' \
|
--limit-mm-per-prompt '{"image": 10, "video": 3}' \
|
||||||
--reasoning-parser ernie-45-vl \
|
--reasoning-parser ernie-45-vl > server.log 2>&1 &
|
||||||
--load-choices default > server.log 2>&1 &
|
|
||||||
|
|
||||||
sleep 60
|
sleep 60
|
||||||
# 探活
|
# 探活
|
||||||
|
|||||||
Reference in New Issue
Block a user