[FDConfig]Turn on the CUDAGraph + RL switch (#4508)

* Turn on the CUDAGraph + RL switch

* reduce max_num_seqs and number of request
This commit is contained in:
RAM
2025-10-23 11:08:07 +08:00
committed by GitHub
parent 918e4e9850
commit 8a02ab43a8
3 changed files with 5 additions and 6 deletions
+3 -4
View File
@@ -1510,9 +1510,7 @@ class FDConfig:
             self.structured_outputs_config.guided_decoding_backend = "xgrammar"
         # Adjustment GraphOptConfig
-        if (self.scheduler_config.splitwise_role != "mixed") or (
-            self.load_config is not None and self.load_config.dynamic_load_weight is True
-        ):
+        if self.scheduler_config.splitwise_role != "mixed":
             self.graph_opt_config.use_cudagraph = False
             logger.info(
                 "CUDAGraph does not support to be started together with PD Disaggregation temporarily, but has been automatically closed!"
@@ -1630,11 +1628,12 @@ class FDConfig:
         self.scheduler_config.check()
         # Check graph optimization config
-        if self.graph_opt_config.graph_opt_level > 0 or self.graph_opt_config.use_cudagraph:
+        if self.graph_opt_config.graph_opt_level > 0:
             if self.load_config is not None:
                 assert (
                     self.load_config.dynamic_load_weight is False
                 ), "Static graph cannot be used in RL scene temporarily"
         if int(envs.ENABLE_V1_KVCACHE_SCHEDULER) == 1:
             assert (
                 int(envs.FD_DISABLED_RECOVER) == 0
+1 -1
View File
@@ -38,7 +38,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --cache-queue-port ${FD_CACHE_QUEUE_PORT} \
     --quantization wint8 \
     --max-model-len 32768 \
-    --max-num-seqs 256 \
+    --max-num-seqs 1 \
     --gpu-memory-utilization 0.9 \
     --model "$MODEL_PATH" \
     --load-strategy ipc_snapshot \
+1 -1
View File
@@ -12,7 +12,7 @@ PORT="${FD_API_PORT}" # 这里需要配合启动脚本那个URL PORT
 BASE_URL="http://$HOST:$PORT"
 TOTAL_ROUNDS=30
-CHAT_REQUESTS_PER_ROUND=5
+CHAT_REQUESTS_PER_ROUND=1
 export CUDA_VISIBLE_DEVICES=0,1
 MAX_MEMORY_MB=10240 # 10GB