mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[FDConfig]Turn on the CUDAGraph + RL switch (#4508)
* Turn on the CUDAGraph + RL switch * reduce max_num_seqs and number of request
This commit is contained in:
@@ -38,7 +38,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --cache-queue-port ${FD_CACHE_QUEUE_PORT} \
     --quantization wint8 \
     --max-model-len 32768 \
-    --max-num-seqs 256 \
+    --max-num-seqs 1 \
     --gpu-memory-utilization 0.9 \
     --model "$MODEL_PATH" \
     --load-strategy ipc_snapshot \
||||
@@ -12,7 +12,7 @@ PORT="${FD_API_PORT}" # 这里需要配合启动脚本那个URL PORT
 BASE_URL="http://$HOST:$PORT"

 TOTAL_ROUNDS=30
-CHAT_REQUESTS_PER_ROUND=5
+CHAT_REQUESTS_PER_ROUND=1
 export CUDA_VISIBLE_DEVICES=0,1
 MAX_MEMORY_MB=10240 # 10GB
Reference in New Issue
Block a user