mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
support dsv3 use flashmla (#6593)
This commit is contained in:
@@ -69,3 +69,34 @@ python -m fastdeploy.entrypoints.openai.multi_api_server \
    --graph-optimization-config '{"use_cudagraph":true}' \
```

**示例3:** H800 上 16 卡部署 blockwise_fp8 模型 16K 上下文的服务

这个例子中支持使用 FlashMLA 算子做 MLA 的计算
```shell
# Path to the model weights; referenced below as "$MODEL_PATH".
MODEL_PATH=/models/DeepSeek-V3.2-Exp-BF16

# MLA attention path: disable chunked prefill, select the MLA backend,
# and enable FlashAttention v3 + FlashMLA kernels.
export FD_DISABLE_CHUNKED_PREFILL=1
export FD_ATTENTION_BACKEND="MLA_ATTN"
export FLAGS_flash_attn_version=3
export USE_FLASH_MLA=1

# One API server per local GPU (8 GPUs per node, 2 nodes => dp size 16).
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export FD_ENABLE_MULTI_API_SERVER=1
python -m fastdeploy.entrypoints.openai.multi_api_server \
    --ports "9811,9812,9813,9814,9815,9816,9817,9818" \
    --num-servers 8 \
    --args --model "$MODEL_PATH" \
    --ips "10.95.246.220,10.95.230.91" \
    --no-enable-prefix-caching \
    --quantization block_wise_fp8 \
    --disable-sequence-parallel-moe \
    --tensor-parallel-size 1 \
    --num-gpu-blocks-override 1024 \
    --data-parallel-size 16 \
    --max-model-len 16384 \
    --enable-expert-parallel \
    --max-num-seqs 20 \
    --graph-optimization-config '{"use_cudagraph":true}'
```
Reference in New Issue
Block a user