From 894f4e312b97da19ac9a9a3e5ca47421859cccdf Mon Sep 17 00:00:00 2001
From: kevin <chengyf112@gmail.com>
Date: Fri, 26 Dec 2025 15:31:27 +0800
Subject: [PATCH] [FDConfig] disable chunked_mm_input in ernie5 (#5774)

* disable chunked_mm_input in ernie5

* update code

* update code

* update test case

* update testcase

* update case
---
 fastdeploy/config.py                          | 20 +++++++++++++------
 .../engine/sched/resource_manager_v1.py       |  3 ++-
 fastdeploy/utils.py                           | 13 +++++-------
 tests/eplb/test_eplb_utils.py                 |  1 +
 tests/eplb/test_experts_manager.py            |  1 +
 .../test_cuda_graph_dynamic_subgraph.py       |  1 +
 .../test_cuda_graph_recapture.py              |  1 +
 .../test_cuda_graph_spec_decode.py            |  1 +
 .../test_graph_opt_backend.py                 |  2 ++
 .../test_static_graph_cuda_graph_split.py     |  1 +
 tests/layers/test_speculative_sampler.py      |  1 +
 tests/utils.py                                |  1 +
 tests/utils/test_config.py                    |  5 +++++
 tests/utils/test_download.py                  |  2 +-
 tests/v1/cache_manager/test_prefix_cache.py   |  1 +
 tests/v1/cache_manager/test_revert_blocks.py  |  1 +
 tests/v1/test_resource_manager_v1.py          |  1 +
 17 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 1563e2030a..9f6cda508d 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -133,6 +133,11 @@ class ErnieArchitectures:
         "Ernie4_5_VLMoeForProcessRewardModel",
     }
 
+    ERNIE5_MODELS = {
+        "Ernie5ForCausalLM",
+        "Ernie5MoeForCausalLM",
+    }
+
     @classmethod
     def register_ernie_model_arch(cls, model_class):
         if model_class.name().startswith("Ernie") and model_class.name() not in cls.ARCHITECTURES:
@@ -148,6 +153,11 @@ class ErnieArchitectures:
         """Check if the given architecture is an ERNIE architecture."""
         return architecture in cls.ARCHITECTURES
 
+    @classmethod
+    def is_ernie5_arch(cls, architectures):
+        """Check if any of the given architectures is an ERNIE5 architecture."""
+        return any(arch in architectures for arch in cls.ERNIE5_MODELS)
+
 
 PRETRAINED_INIT_CONFIGURATION = {
     "top_p": 1.0,
@@ -248,12 +258,6 @@ class ModelConfig:
 
         self._post_init()
 
-    def disable_mm_prefill_batch(self):
-        """
-        check if the model architecture disable for mm prefill
-        """
-        return self._architecture in ["Ernie5ForCausalLM", "Ernie5MoeForCausalLM"]
-
     def _post_init(self):
         self.is_unified_ckpt = check_unified_ckpt(self.model)
         self.runner_type = self._get_runner_type(self.architectures, self.runner)
@@ -1805,6 +1809,10 @@ class FDConfig:
                 # It will hang when real batch_size < tp_size
                 self.graph_opt_config.filter_capture_size(tp_size=self.parallel_config.tensor_parallel_size)
 
+        if ErnieArchitectures.is_ernie5_arch(self.model_config.architectures):
+            # ERNIE5 models do not support chunked_mm_input
+            self.cache_config.disable_chunked_mm_input = True
+
         self.postprocess_devices_and_ports()
 
     def postprocess_devices_and_ports(self):
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index ea94af68a6..b6da9a9922 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -32,6 +32,7 @@ from fastdeploy.cache_manager.multimodal_cache_manager import (
     EncoderCacheManager,
     ProcessorCacheManager,
 )
+from fastdeploy.config import ErnieArchitectures
 from fastdeploy.engine.request import (
     ImagePosition,
     Request,
@@ -680,7 +681,7 @@ class ResourceManagerV1(ResourceManager):
 
                     request = self.waiting[0]
                     if (
-                        self.config.model_config.disable_mm_prefill_batch()
+                        ErnieArchitectures.is_ernie5_arch(self.config.model_config.architectures)
                         and self._is_mm_request(request)
                         and self.exist_mm_prefill(scheduled_reqs)
                     ) or (paddle.is_compiled_with_xpu() and self.exist_prefill(scheduled_reqs)):
diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py
index 715715fb0c..9352e6ab88 100644
--- a/fastdeploy/utils.py
+++ b/fastdeploy/utils.py
@@ -1078,7 +1078,6 @@ def check_download_links(bos_client, links, timeout=1):
 def init_bos_client():
     from baidubce.auth.bce_credentials import BceCredentials
     from baidubce.bce_client_configuration import BceClientConfiguration
-    from baidubce.exception import BceHttpClientError, BceServerError
     from baidubce.services.bos.bos_client import BosClient
 
     cfg = BceClientConfiguration(
@@ -1089,14 +1088,12 @@ def init_bos_client():
     try:
         client = BosClient(cfg)
         client.list_buckets()
-    except BceServerError as e:
-        if e.status_code == 403:
-            raise Exception("BOS authentication failed: Invalid AK/SK") from e
-        raise Exception(f"BOS connection failed: {str(e)}") from e
-    except BceHttpClientError as e:
-        raise Exception(f"Invalid BOS endpoint configuration: {str(e)}") from e
     except Exception as e:
-        raise Exception(f"BOS client validation error: {str(e)}") from e
+        raise Exception(
+            "Create BOSClient Error, Please check your ENV [ ENCODE_FEATURE_BOS_AK, ENCODE_FEATURE_BOS_SK, ENCODE_FEATURE_ENDPOINT ] \n"
+            f"Current ENV AK: {envs.ENCODE_FEATURE_BOS_AK}, SK: {envs.ENCODE_FEATURE_BOS_SK}, Endpoint: {envs.ENCODE_FEATURE_ENDPOINT} \n"
+            f"{str(e)}"
+        )
     return client
 
 
diff --git a/tests/eplb/test_eplb_utils.py b/tests/eplb/test_eplb_utils.py
index 675a2daee1..7ba49b8c82 100644
--- a/tests/eplb/test_eplb_utils.py
+++ b/tests/eplb/test_eplb_utils.py
@@ -175,6 +175,7 @@ class TestInitEplbSignals(unittest.TestCase):
         model_cfg.moe_num_experts = 64
         model_cfg.moe_layer_start_index = 1
         model_cfg.model = "/test/model"
+        model_cfg.architectures = ["test_model"]
         cache_cfg.bytes_per_layer_per_block = 1
 
         parallel_cfg = ParallelConfig(args)
diff --git a/tests/eplb/test_experts_manager.py b/tests/eplb/test_experts_manager.py
index 01882f71d3..24e8dbd5aa 100644
--- a/tests/eplb/test_experts_manager.py
+++ b/tests/eplb/test_experts_manager.py
@@ -55,6 +55,7 @@ class TestRedundantExpertManager(unittest.TestCase):
         model_cfg.moe_num_experts = 64
         model_cfg.moe_layer_start_index = 1
         model_cfg.model = "/test/model"
+        model_cfg.architectures = ["test_model"]
         cache_cfg.bytes_per_layer_per_block = 1
 
         parallel_cfg = ParallelConfig(args)
diff --git a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py
index 522e647589..53507330ca 100644
--- a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py
+++ b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py
@@ -159,6 +159,7 @@ class TestCUDAGrpahSubgraph(unittest.TestCase):
         parallel_config = ParallelConfig(args={})
         model_config = Mock()
         model_config.max_model_len = 512
+        model_config.architectures = ["test_model"]
         # Initialize cuda graph capture list
         graph_opt_config._set_cudagraph_sizes(max_capture_size=scheduler_config.max_num_seqs)
         graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
diff --git a/tests/graph_optimization/test_cuda_graph_recapture.py b/tests/graph_optimization/test_cuda_graph_recapture.py
index 85516f8bd3..932a1966f9 100644
--- a/tests/graph_optimization/test_cuda_graph_recapture.py
+++ b/tests/graph_optimization/test_cuda_graph_recapture.py
@@ -112,6 +112,7 @@ class TestCUDAGrpahRecapture(unittest.TestCase):
         parallel_config = ParallelConfig(args={})
         model_config = Mock()
         model_config.max_model_len = 5120
+        model_config.architectures = ["test_model"]
         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,
             scheduler_config=scheduler_config,
diff --git a/tests/graph_optimization/test_cuda_graph_spec_decode.py b/tests/graph_optimization/test_cuda_graph_spec_decode.py
index 10f4237a9d..f81d4b11cf 100644
--- a/tests/graph_optimization/test_cuda_graph_spec_decode.py
+++ b/tests/graph_optimization/test_cuda_graph_spec_decode.py
@@ -105,6 +105,7 @@ class TestCUDAGrpahSpecDecode(unittest.TestCase):
         parallel_config = ParallelConfig(args={})
         model_config = Mock()
         model_config.max_model_len = 512
+        model_config.architectures = ["test_model"]
         # Initialize cuda graph capture list
         graph_opt_config._set_cudagraph_sizes(max_capture_size=scheduler_config.max_num_seqs)
         graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
diff --git a/tests/graph_optimization/test_graph_opt_backend.py b/tests/graph_optimization/test_graph_opt_backend.py
index ff5d1fcd62..e4bac358e5 100644
--- a/tests/graph_optimization/test_graph_opt_backend.py
+++ b/tests/graph_optimization/test_graph_opt_backend.py
@@ -97,6 +97,7 @@ class TestGraphOptBackend(unittest.TestCase):
         baseline_parallel_config = ParallelConfig(args={})
         model_config = Mock()
         model_config.max_model_len = 512
+        model_config.architectures = ["test_model"]
         self.baseline_fd_config = FDConfig(
             graph_opt_config=baseline_graph_opt_config,
             scheduler_config=baseline_scheduler_config,
@@ -144,6 +145,7 @@ class TestGraphOptBackend(unittest.TestCase):
         parallel_config = ParallelConfig(args={})
         model_config = Mock()
         model_config.max_model_len = 512
+        model_config.architectures = ["test_model"]
 
         # Create FD config
         return FDConfig(
diff --git a/tests/graph_optimization/test_static_graph_cuda_graph_split.py b/tests/graph_optimization/test_static_graph_cuda_graph_split.py
index 366b35d61d..9d2b419512 100644
--- a/tests/graph_optimization/test_static_graph_cuda_graph_split.py
+++ b/tests/graph_optimization/test_static_graph_cuda_graph_split.py
@@ -97,6 +97,7 @@ class TestStaticGraphCUDAGraphSplit(unittest.TestCase):
         parallel_config = ParallelConfig(args={})
         model_config = Mock()
         model_config.max_model_len = 512
+        model_config.architectures = ["test_model"]
         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,
             scheduler_config=scheduler_config,
diff --git a/tests/layers/test_speculative_sampler.py b/tests/layers/test_speculative_sampler.py
index e145030710..c62baa74ec 100644
--- a/tests/layers/test_speculative_sampler.py
+++ b/tests/layers/test_speculative_sampler.py
@@ -83,6 +83,7 @@ def _create_default_sampling_metadata(
 def _create_fd_config(max_model_len):
     model_config: Mock = Mock()
     model_config.max_model_len = max_model_len
+    model_config.architectures = ["test_model"]
     speculative_config = SpeculativeConfig({})
     graph_opt_config = GraphOptimizationConfig({})
     scheduler_config = SchedulerConfig({})
diff --git a/tests/utils.py b/tests/utils.py
index 80709b81b6..46c60b0c26 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -61,6 +61,7 @@ class FakeModelConfig:
         self.enable_mm = False
         self.max_model_len = 512
         self.logprobs_mode = "raw_logprobs"
+        self.architectures = ["test_model"]
 
 
 def get_default_test_fd_config():
diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py
index 326f1251fa..44a5f56e56 100644
--- a/tests/utils/test_config.py
+++ b/tests/utils/test_config.py
@@ -39,6 +39,7 @@ class TestConfig(unittest.TestCase):
         scheduler_config = SchedulerConfig({})
         model_config = Mock()
         model_config.max_model_len = 512
+        model_config.architectures = ["test_model"]
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
@@ -60,6 +61,7 @@ class TestConfig(unittest.TestCase):
         scheduler_config = SchedulerConfig({})
         model_config = Mock()
         model_config.max_model_len = 512
+        model_config.architectures = ["test_model"]
         fd_config = FDConfig(
             parallel_config=parallel_config,
             graph_opt_config=graph_opt_config,
@@ -81,6 +83,7 @@ class TestConfig(unittest.TestCase):
         scheduler_config = SchedulerConfig({})
         model_config: Mock = Mock()
         model_config.max_model_len = 512
+        model_config.architectures = ["test_model"]
 
         fd_config = FDConfig(
             parallel_config=parallel_config,
@@ -120,6 +123,7 @@ class TestConfig(unittest.TestCase):
         scheduler_config.splitwise_role = "prefill"
         model_config: Mock = Mock()
         model_config.max_model_len = 512
+        model_config.architectures = ["test_model"]
 
         fd_config = FDConfig(
             parallel_config=parallel_config,
@@ -162,6 +166,7 @@ class TestConfig(unittest.TestCase):
         scheduler_config = SchedulerConfig({})
         model_config: Mock = Mock()
         model_config.max_model_len = 512
+        model_config.architectures = ["test_model"]
 
         fd_config = FDConfig(
             parallel_config=parallel_config,
diff --git a/tests/utils/test_download.py b/tests/utils/test_download.py
index 50b8e99c07..15369f4671 100644
--- a/tests/utils/test_download.py
+++ b/tests/utils/test_download.py
@@ -127,7 +127,7 @@ class TestInitBosClient(unittest.TestCase):
 
         with self.assertRaises(Exception) as context:
             init_bos_client()
-        self.assertIn("BOS client validation error", str(context.exception))
+        self.assertIn("Create BOSClient Error, Please check your ENV", str(context.exception))
         os.environ.clear()
 
 
diff --git a/tests/v1/cache_manager/test_prefix_cache.py b/tests/v1/cache_manager/test_prefix_cache.py
index 8107d5597b..1d4111f681 100644
--- a/tests/v1/cache_manager/test_prefix_cache.py
+++ b/tests/v1/cache_manager/test_prefix_cache.py
@@ -33,6 +33,7 @@ def make_prefix_cache_manager(max_num_seqs, enable_mm=False, num_gpu_blocks_over
     model_cfg = SimpleNamespace(enable_mm=enable_mm, max_model_len=4196)
     speculative_cfg = SimpleNamespace(method=None)
     model_cfg.print = print
+    model_cfg.architectures = ["test_model"]
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
     scheduler_cfg = SchedulerConfig(args)
diff --git a/tests/v1/cache_manager/test_revert_blocks.py b/tests/v1/cache_manager/test_revert_blocks.py
index 5c23f4faea..8e3e864c66 100644
--- a/tests/v1/cache_manager/test_revert_blocks.py
+++ b/tests/v1/cache_manager/test_revert_blocks.py
@@ -35,6 +35,7 @@ def make_prefix_cache_manager(max_num_seqs, enable_mm=False, num_gpu_blocks_over
     model_cfg = SimpleNamespace(enable_mm=enable_mm, max_model_len=4196)
     speculative_cfg = SimpleNamespace(method=None)
     model_cfg.print = print
+    model_cfg.architectures = ["test_model"]
     cache_cfg.bytes_per_layer_per_block = 1
     parallel_cfg = ParallelConfig(args)
     scheduler_cfg = SchedulerConfig(args)
diff --git a/tests/v1/test_resource_manager_v1.py b/tests/v1/test_resource_manager_v1.py
index 4825072c42..06f55865e8 100644
--- a/tests/v1/test_resource_manager_v1.py
+++ b/tests/v1/test_resource_manager_v1.py
@@ -44,6 +44,7 @@ class TestResourceManagerV1(unittest.TestCase):
         speculative_cfg = SimpleNamespace(method=None)
         model_cfg.print = print
         model_cfg.max_model_len = 5120
+        model_cfg.architectures = ["test_model"]
         cache_cfg.bytes_per_layer_per_block = 1
         parallel_cfg = ParallelConfig(args)
         scheduler_cfg = SchedulerConfig(args)