[FDConfig]Turn on the CUDAGraph + RL switch (#4508)

* Turn on the CUDAGraph + RL switch

* reduce max_num_seqs and number of requests
This commit is contained in:
RAM
2025-10-23 11:08:07 +08:00
committed by GitHub
parent 918e4e9850
commit 8a02ab43a8
3 changed files with 5 additions and 6 deletions
+1 -1
View File
@@ -38,7 +38,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--cache-queue-port ${FD_CACHE_QUEUE_PORT} \
--quantization wint8 \
--max-model-len 32768 \
-    --max-num-seqs 256 \
+    --max-num-seqs 1 \
--gpu-memory-utilization 0.9 \
--model "$MODEL_PATH" \
--load-strategy ipc_snapshot \
+1 -1
View File
@@ -12,7 +12,7 @@ PORT="${FD_API_PORT}" # 这里需要配合启动脚本那个URL PORT
BASE_URL="http://$HOST:$PORT"
TOTAL_ROUNDS=30
-CHAT_REQUESTS_PER_ROUND=5
+CHAT_REQUESTS_PER_ROUND=1
export CUDA_VISIBLE_DEVICES=0,1
MAX_MEMORY_MB=10240 # 10GB