From 894f4e312b97da19ac9a9a3e5ca47421859cccdf Mon Sep 17 00:00:00 2001 From: kevin Date: Fri, 26 Dec 2025 15:31:27 +0800 Subject: [PATCH] [FDConfig] disable chunked_mm_input in ernie5 (#5774) * disable chunked_mm_input in ernie5 * update code * update code * update test case * update testcase * update case --- fastdeploy/config.py | 20 +++++++++++++------ .../engine/sched/resource_manager_v1.py | 3 ++- fastdeploy/utils.py | 13 +++++------- tests/eplb/test_eplb_utils.py | 1 + tests/eplb/test_experts_manager.py | 1 + .../test_cuda_graph_dynamic_subgraph.py | 1 + .../test_cuda_graph_recapture.py | 1 + .../test_cuda_graph_spec_decode.py | 1 + .../test_graph_opt_backend.py | 2 ++ .../test_static_graph_cuda_graph_split.py | 1 + tests/layers/test_speculative_sampler.py | 1 + tests/utils.py | 1 + tests/utils/test_config.py | 5 +++++ tests/utils/test_download.py | 2 +- tests/v1/cache_manager/test_prefix_cache.py | 1 + tests/v1/cache_manager/test_revert_blocks.py | 1 + tests/v1/test_resource_manager_v1.py | 1 + 17 files changed, 40 insertions(+), 16 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 1563e2030a..9f6cda508d 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -133,6 +133,11 @@ class ErnieArchitectures: "Ernie4_5_VLMoeForProcessRewardModel", } + ERNIE5_MODELS = { + "Ernie5ForCausalLM", + "Ernie5MoeForCausalLM", + } + @classmethod def register_ernie_model_arch(cls, model_class): if model_class.name().startswith("Ernie") and model_class.name() not in cls.ARCHITECTURES: @@ -148,6 +153,11 @@ class ErnieArchitectures: """Check if the given architecture is an ERNIE architecture.""" return architecture in cls.ARCHITECTURES + @classmethod + def is_ernie5_arch(cls, architectures): + """Check if the given architecture is an ERNIE5 architecture.""" + return any(arch in architectures for arch in cls.ERNIE5_MODELS) + PRETRAINED_INIT_CONFIGURATION = { "top_p": 1.0, @@ -248,12 +258,6 @@ class ModelConfig: self._post_init() - def 
disable_mm_prefill_batch(self): - """ - check if the model architecture disable for mm prefill - """ - return self._architecture in ["Ernie5ForCausalLM", "Ernie5MoeForCausalLM"] - def _post_init(self): self.is_unified_ckpt = check_unified_ckpt(self.model) self.runner_type = self._get_runner_type(self.architectures, self.runner) @@ -1805,6 +1809,10 @@ class FDConfig: # It will hang when real batch_size < tp_size self.graph_opt_config.filter_capture_size(tp_size=self.parallel_config.tensor_parallel_size) + if ErnieArchitectures.is_ernie5_arch(self.model_config.architectures): + # ernie5 model not support chunked_mm_input + self.cache_config.disable_chunked_mm_input = True + self.postprocess_devices_and_ports() def postprocess_devices_and_ports(self): diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index ea94af68a6..b6da9a9922 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -32,6 +32,7 @@ from fastdeploy.cache_manager.multimodal_cache_manager import ( EncoderCacheManager, ProcessorCacheManager, ) +from fastdeploy.config import ErnieArchitectures from fastdeploy.engine.request import ( ImagePosition, Request, @@ -680,7 +681,7 @@ class ResourceManagerV1(ResourceManager): request = self.waiting[0] if ( - self.config.model_config.disable_mm_prefill_batch() + ErnieArchitectures.is_ernie5_arch(self.config.model_config.architectures) and self._is_mm_request(request) and self.exist_mm_prefill(scheduled_reqs) ) or (paddle.is_compiled_with_xpu() and self.exist_prefill(scheduled_reqs)): diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py index 715715fb0c..9352e6ab88 100644 --- a/fastdeploy/utils.py +++ b/fastdeploy/utils.py @@ -1078,7 +1078,6 @@ def check_download_links(bos_client, links, timeout=1): def init_bos_client(): from baidubce.auth.bce_credentials import BceCredentials from baidubce.bce_client_configuration import BceClientConfiguration - 
from baidubce.exception import BceHttpClientError, BceServerError from baidubce.services.bos.bos_client import BosClient cfg = BceClientConfiguration( @@ -1089,14 +1088,12 @@ def init_bos_client(): try: client = BosClient(cfg) client.list_buckets() - except BceServerError as e: - if e.status_code == 403: - raise Exception("BOS authentication failed: Invalid AK/SK") from e - raise Exception(f"BOS connection failed: {str(e)}") from e - except BceHttpClientError as e: - raise Exception(f"Invalid BOS endpoint configuration: {str(e)}") from e except Exception as e: - raise Exception(f"BOS client validation error: {str(e)}") from e + raise Exception( + "Create BOSClient Error, Please check your ENV [ ENCODE_FEATURE_BOS_AK, ENCODE_FEATURE_BOS_SK, ENCODE_FEATURE_ENDPOINT ] \n" + f"Current ENV AK: {envs.ENCODE_FEATURE_BOS_AK}, SK: {envs.ENCODE_FEATURE_BOS_SK}, Endpoint: {envs.ENCODE_FEATURE_ENDPOINT} \n" + f"{str(e)}" + ) return client diff --git a/tests/eplb/test_eplb_utils.py b/tests/eplb/test_eplb_utils.py index 675a2daee1..7ba49b8c82 100644 --- a/tests/eplb/test_eplb_utils.py +++ b/tests/eplb/test_eplb_utils.py @@ -175,6 +175,7 @@ class TestInitEplbSignals(unittest.TestCase): model_cfg.moe_num_experts = 64 model_cfg.moe_layer_start_index = 1 model_cfg.model = "/test/model" + model_cfg.architectures = ["test_model"] cache_cfg.bytes_per_layer_per_block = 1 parallel_cfg = ParallelConfig(args) diff --git a/tests/eplb/test_experts_manager.py b/tests/eplb/test_experts_manager.py index 01882f71d3..24e8dbd5aa 100644 --- a/tests/eplb/test_experts_manager.py +++ b/tests/eplb/test_experts_manager.py @@ -55,6 +55,7 @@ class TestRedundantExpertManager(unittest.TestCase): model_cfg.moe_num_experts = 64 model_cfg.moe_layer_start_index = 1 model_cfg.model = "/test/model" + model_cfg.architectures = ["test_model"] cache_cfg.bytes_per_layer_per_block = 1 parallel_cfg = ParallelConfig(args) diff --git a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py 
b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py index 522e647589..53507330ca 100644 --- a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py +++ b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py @@ -159,6 +159,7 @@ class TestCUDAGrpahSubgraph(unittest.TestCase): parallel_config = ParallelConfig(args={}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] # Initialize cuda graph capture list graph_opt_config._set_cudagraph_sizes(max_capture_size=scheduler_config.max_num_seqs) graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs) diff --git a/tests/graph_optimization/test_cuda_graph_recapture.py b/tests/graph_optimization/test_cuda_graph_recapture.py index 85516f8bd3..932a1966f9 100644 --- a/tests/graph_optimization/test_cuda_graph_recapture.py +++ b/tests/graph_optimization/test_cuda_graph_recapture.py @@ -112,6 +112,7 @@ class TestCUDAGrpahRecapture(unittest.TestCase): parallel_config = ParallelConfig(args={}) model_config = Mock() model_config.max_model_len = 5120 + model_config.architectures = ["test_model"] fd_config = FDConfig( graph_opt_config=graph_opt_config, scheduler_config=scheduler_config, diff --git a/tests/graph_optimization/test_cuda_graph_spec_decode.py b/tests/graph_optimization/test_cuda_graph_spec_decode.py index 10f4237a9d..f81d4b11cf 100644 --- a/tests/graph_optimization/test_cuda_graph_spec_decode.py +++ b/tests/graph_optimization/test_cuda_graph_spec_decode.py @@ -105,6 +105,7 @@ class TestCUDAGrpahSpecDecode(unittest.TestCase): parallel_config = ParallelConfig(args={}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] # Initialize cuda graph capture list graph_opt_config._set_cudagraph_sizes(max_capture_size=scheduler_config.max_num_seqs) graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs) diff --git 
a/tests/graph_optimization/test_graph_opt_backend.py b/tests/graph_optimization/test_graph_opt_backend.py index ff5d1fcd62..e4bac358e5 100644 --- a/tests/graph_optimization/test_graph_opt_backend.py +++ b/tests/graph_optimization/test_graph_opt_backend.py @@ -97,6 +97,7 @@ class TestGraphOptBackend(unittest.TestCase): baseline_parallel_config = ParallelConfig(args={}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] self.baseline_fd_config = FDConfig( graph_opt_config=baseline_graph_opt_config, scheduler_config=baseline_scheduler_config, @@ -144,6 +145,7 @@ class TestGraphOptBackend(unittest.TestCase): parallel_config = ParallelConfig(args={}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] # Create FD config return FDConfig( diff --git a/tests/graph_optimization/test_static_graph_cuda_graph_split.py b/tests/graph_optimization/test_static_graph_cuda_graph_split.py index 366b35d61d..9d2b419512 100644 --- a/tests/graph_optimization/test_static_graph_cuda_graph_split.py +++ b/tests/graph_optimization/test_static_graph_cuda_graph_split.py @@ -97,6 +97,7 @@ class TestStaticGraphCUDAGraphSplit(unittest.TestCase): parallel_config = ParallelConfig(args={}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] fd_config = FDConfig( graph_opt_config=graph_opt_config, scheduler_config=scheduler_config, diff --git a/tests/layers/test_speculative_sampler.py b/tests/layers/test_speculative_sampler.py index e145030710..c62baa74ec 100644 --- a/tests/layers/test_speculative_sampler.py +++ b/tests/layers/test_speculative_sampler.py @@ -83,6 +83,7 @@ def _create_default_sampling_metadata( def _create_fd_config(max_model_len): model_config: Mock = Mock() model_config.max_model_len = max_model_len + model_config.architectures = ["test_model"] speculative_config = SpeculativeConfig({}) graph_opt_config = GraphOptimizationConfig({}) 
scheduler_config = SchedulerConfig({}) diff --git a/tests/utils.py b/tests/utils.py index 80709b81b6..46c60b0c26 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -61,6 +61,7 @@ class FakeModelConfig: self.enable_mm = False self.max_model_len = 512 self.logprobs_mode = "raw_logprobs" + self.architectures = ["test_model"] def get_default_test_fd_config(): diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 326f1251fa..44a5f56e56 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -39,6 +39,7 @@ class TestConfig(unittest.TestCase): scheduler_config = SchedulerConfig({}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] fd_config = FDConfig( parallel_config=parallel_config, graph_opt_config=graph_opt_config, @@ -60,6 +61,7 @@ class TestConfig(unittest.TestCase): scheduler_config = SchedulerConfig({}) model_config = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] fd_config = FDConfig( parallel_config=parallel_config, graph_opt_config=graph_opt_config, @@ -81,6 +83,7 @@ class TestConfig(unittest.TestCase): scheduler_config = SchedulerConfig({}) model_config: Mock = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] fd_config = FDConfig( parallel_config=parallel_config, @@ -120,6 +123,7 @@ class TestConfig(unittest.TestCase): scheduler_config.splitwise_role = "prefill" model_config: Mock = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] fd_config = FDConfig( parallel_config=parallel_config, @@ -162,6 +166,7 @@ class TestConfig(unittest.TestCase): scheduler_config = SchedulerConfig({}) model_config: Mock = Mock() model_config.max_model_len = 512 + model_config.architectures = ["test_model"] fd_config = FDConfig( parallel_config=parallel_config, diff --git a/tests/utils/test_download.py b/tests/utils/test_download.py index 50b8e99c07..15369f4671 100644 --- 
a/tests/utils/test_download.py +++ b/tests/utils/test_download.py @@ -127,7 +127,7 @@ class TestInitBosClient(unittest.TestCase): with self.assertRaises(Exception) as context: init_bos_client() - self.assertIn("BOS client validation error", str(context.exception)) + self.assertIn("Create BOSClient Error, Please check your ENV", str(context.exception)) os.environ.clear() diff --git a/tests/v1/cache_manager/test_prefix_cache.py b/tests/v1/cache_manager/test_prefix_cache.py index 8107d5597b..1d4111f681 100644 --- a/tests/v1/cache_manager/test_prefix_cache.py +++ b/tests/v1/cache_manager/test_prefix_cache.py @@ -33,6 +33,7 @@ def make_prefix_cache_manager(max_num_seqs, enable_mm=False, num_gpu_blocks_over model_cfg = SimpleNamespace(enable_mm=enable_mm, max_model_len=4196) speculative_cfg = SimpleNamespace(method=None) model_cfg.print = print + model_cfg.architectures = ["test_model"] cache_cfg.bytes_per_layer_per_block = 1 parallel_cfg = ParallelConfig(args) scheduler_cfg = SchedulerConfig(args) diff --git a/tests/v1/cache_manager/test_revert_blocks.py b/tests/v1/cache_manager/test_revert_blocks.py index 5c23f4faea..8e3e864c66 100644 --- a/tests/v1/cache_manager/test_revert_blocks.py +++ b/tests/v1/cache_manager/test_revert_blocks.py @@ -35,6 +35,7 @@ def make_prefix_cache_manager(max_num_seqs, enable_mm=False, num_gpu_blocks_over model_cfg = SimpleNamespace(enable_mm=enable_mm, max_model_len=4196) speculative_cfg = SimpleNamespace(method=None) model_cfg.print = print + model_cfg.architectures = ["test_model"] cache_cfg.bytes_per_layer_per_block = 1 parallel_cfg = ParallelConfig(args) scheduler_cfg = SchedulerConfig(args) diff --git a/tests/v1/test_resource_manager_v1.py b/tests/v1/test_resource_manager_v1.py index 4825072c42..06f55865e8 100644 --- a/tests/v1/test_resource_manager_v1.py +++ b/tests/v1/test_resource_manager_v1.py @@ -44,6 +44,7 @@ class TestResourceManagerV1(unittest.TestCase): speculative_cfg = SimpleNamespace(method=None) model_cfg.print = print 
model_cfg.max_model_len = 5120 + model_cfg.architectures = ["test_model"] cache_cfg.bytes_per_layer_per_block = 1 parallel_cfg = ParallelConfig(args) scheduler_cfg = SchedulerConfig(args)