mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
MoE Default use triton's blockwise fp8 in TP Case (#3678)
This commit is contained in:
@@ -116,7 +116,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
|
||||
export FD_LOG_DIR="prefill_log"
|
||||
|
||||
quant_type=block_wise_fp8
|
||||
export FD_USE_DEEP_GEMM=0
|
||||
|
||||
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
|
||||
--max-model-len 131072 \
|
||||
@@ -136,7 +135,6 @@ export FLAGS_max_partition_size=2048
|
||||
export FD_LOG_DIR="decode_log"
|
||||
|
||||
quant_type=block_wise_fp8
|
||||
export FD_USE_DEEP_GEMM=0
|
||||
|
||||
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
|
||||
--max-model-len 131072 \
|
||||
|
||||
Reference in New Issue
Block a user