mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[FDConfig]Turn on the CUDAGraph + RL switch (#4508)
* Turn on the CUDAGraph + RL switch * Reduce max_num_seqs and the number of requests
This commit is contained in:
@@ -1510,9 +1510,7 @@ class FDConfig:
|
|||||||
self.structured_outputs_config.guided_decoding_backend = "xgrammar"
|
self.structured_outputs_config.guided_decoding_backend = "xgrammar"
|
||||||
|
|
||||||
# Adjustment GraphOptConfig
|
# Adjustment GraphOptConfig
|
||||||
if (self.scheduler_config.splitwise_role != "mixed") or (
|
if self.scheduler_config.splitwise_role != "mixed":
|
||||||
self.load_config is not None and self.load_config.dynamic_load_weight is True
|
|
||||||
):
|
|
||||||
self.graph_opt_config.use_cudagraph = False
|
self.graph_opt_config.use_cudagraph = False
|
||||||
logger.info(
|
logger.info(
|
||||||
"CUDAGraph does not support to be started together with PD Disaggregation temporarily, but has been automatically closed!"
|
"CUDAGraph does not support to be started together with PD Disaggregation temporarily, but has been automatically closed!"
|
||||||
@@ -1630,11 +1628,12 @@ class FDConfig:
|
|||||||
self.scheduler_config.check()
|
self.scheduler_config.check()
|
||||||
|
|
||||||
# Check graph optimization config
|
# Check graph optimization config
|
||||||
if self.graph_opt_config.graph_opt_level > 0 or self.graph_opt_config.use_cudagraph:
|
if self.graph_opt_config.graph_opt_level > 0:
|
||||||
if self.load_config is not None:
|
if self.load_config is not None:
|
||||||
assert (
|
assert (
|
||||||
self.load_config.dynamic_load_weight is False
|
self.load_config.dynamic_load_weight is False
|
||||||
), "Static graph cannot be used in RL scene temporarily"
|
), "Static graph cannot be used in RL scene temporarily"
|
||||||
|
|
||||||
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER) == 1:
|
if int(envs.ENABLE_V1_KVCACHE_SCHEDULER) == 1:
|
||||||
assert (
|
assert (
|
||||||
int(envs.FD_DISABLED_RECOVER) == 0
|
int(envs.FD_DISABLED_RECOVER) == 0
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
|
|||||||
--cache-queue-port ${FD_CACHE_QUEUE_PORT} \
|
--cache-queue-port ${FD_CACHE_QUEUE_PORT} \
|
||||||
--quantization wint8 \
|
--quantization wint8 \
|
||||||
--max-model-len 32768 \
|
--max-model-len 32768 \
|
||||||
--max-num-seqs 256 \
|
--max-num-seqs 1 \
|
||||||
--gpu-memory-utilization 0.9 \
|
--gpu-memory-utilization 0.9 \
|
||||||
--model "$MODEL_PATH" \
|
--model "$MODEL_PATH" \
|
||||||
--load-strategy ipc_snapshot \
|
--load-strategy ipc_snapshot \
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ PORT="${FD_API_PORT}" # 这里需要配合启动脚本那个URL PORT
|
|||||||
BASE_URL="http://$HOST:$PORT"
|
BASE_URL="http://$HOST:$PORT"
|
||||||
|
|
||||||
TOTAL_ROUNDS=30
|
TOTAL_ROUNDS=30
|
||||||
CHAT_REQUESTS_PER_ROUND=5
|
CHAT_REQUESTS_PER_ROUND=1
|
||||||
export CUDA_VISIBLE_DEVICES=0,1
|
export CUDA_VISIBLE_DEVICES=0,1
|
||||||
MAX_MEMORY_MB=10240 # 10GB
|
MAX_MEMORY_MB=10240 # 10GB
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user