diff --git a/fastdeploy/config.py b/fastdeploy/config.py index b4bf00ceb1..d0af627470 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -2064,7 +2064,10 @@ class FDConfig: if self.scheduler_config.max_num_batched_tokens is None: if int(envs.ENABLE_V1_KVCACHE_SCHEDULER): - self.scheduler_config.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM + if int(envs.FD_DISABLE_CHUNKED_PREFILL): + self.scheduler_config.max_num_batched_tokens = self.model_config.max_model_len + else: + self.scheduler_config.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM else: if self.cache_config.enable_chunked_prefill: self.scheduler_config.max_num_batched_tokens = 2048 diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index 069fdfbf9d..1a275b59bd 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -1498,7 +1498,11 @@ class EngineArgs: if self.max_num_batched_tokens is None: if int(envs.ENABLE_V1_KVCACHE_SCHEDULER): - if current_platform.is_maca() or current_platform.is_iluvatar(): + if ( + int(envs.FD_DISABLE_CHUNKED_PREFILL) + or current_platform.is_maca() + or current_platform.is_iluvatar() + ): self.max_num_batched_tokens = self.max_model_len else: self.max_num_batched_tokens = 8192 # if set to max_model_len, it's easy to be OOM