[Feature] consider multimodal model when dummy run (#6045)

* add mm do profile

* update code

* update code

* update code

* update code

* update test case

* update code

* update code

* fix xpu bug

* update code

* add mm do profile

* update test case

* update code
This commit is contained in:
kevin
2026-02-09 17:49:55 +08:00
committed by GitHub
parent 783d56e28a
commit d60daca4a8
25 changed files with 166 additions and 19 deletions
@@ -15,6 +15,7 @@
"""
import traceback
from collections.abc import Mapping
import numpy as np
from paddleformers.generation import GenerationConfig
@@ -318,3 +319,16 @@ class Ernie4_5_VLProcessor(Ernie4_5Processor):
outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64)
outs["mm_num_token_func"] = self.ernie4_5_processor.mm_num_tokens
return outs
def get_mm_max_tokens_per_item(
    self,
    seq_len: int,
) -> Mapping[str, int]:
    """
    Return the per-modality maximum token counts.

    Delegates to the wrapped Ernie4.5 processor.

    Args:
        seq_len: Maximum model length.

    Returns:
        A mapping from modalities to their respective maximum token counts.
    """
    inner_processor = self.ernie4_5_processor
    return inner_processor.get_mm_max_tokens_per_item(seq_len)
@@ -20,6 +20,7 @@ import copy
import os
import pickle
from collections import defaultdict
from collections.abc import Mapping
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
@@ -32,7 +33,7 @@ from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.utils import IDS_TYPE_FLAG, MAX_IMAGE_DIMENSION
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
@@ -745,3 +746,45 @@ class DataProcessor(MMBaseDataProcessor):
req = pickle.dumps((mm_hashes, mm_items))
socket.send_multipart([b"", req])
data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
def get_image_size_with_most_features(self):
    """
    Compute the resized image size that yields the most visual features.

    Starts from the largest allowed square (MAX_IMAGE_DIMENSION on each
    side) and lets the image preprocessor's smart-resize pick the final
    dimensions within the configured pixel budget.

    Returns:
        Tuple ``(resized_height, resized_width)``.
    """
    resize_result = self.image_preprocessor.get_smarted_resize(
        height=MAX_IMAGE_DIMENSION,
        width=MAX_IMAGE_DIMENSION,
        min_pixels=self.image_min_pixels,
        max_pixels=self.image_max_pixels,
    )
    resized_height, resized_width = resize_result[0]
    return (resized_height, resized_width)
def get_max_image_tokens(
    self,
    seq_len: int,
) -> int:
    """
    Upper bound on the number of tokens a single image can occupy.

    Resizes the most-feature-rich image size to its patch grid, converts
    patches to tokens via the spatial conv reduction, and caps the result
    at the model's sequence length.

    Args:
        seq_len: Maximum model length; the result never exceeds this.

    Returns:
        Maximum token count for one image, at most ``seq_len``.
    """
    best_height, best_width = self.get_image_size_with_most_features()
    patch_grid = self.image_preprocessor.get_smarted_resize(
        height=best_height,
        width=best_width,
        min_pixels=self.image_min_pixels,
        max_pixels=self.image_max_pixels,
    )[1]
    patches_h, patches_w = patch_grid
    token_count = (patches_h * patches_w) // (self.spatial_conv_size**2)
    return min(token_count, seq_len)
def get_max_video_tokens(self, seq_len: int) -> int:
    """
    Upper bound on the number of tokens a single video can occupy.

    Same patch-grid computation as for images, but uses the video pixel
    budget and additionally divides by the temporal conv size since video
    tokens are reduced across frames as well.

    Args:
        seq_len: Maximum model length; the result never exceeds this.

    Returns:
        Maximum token count for one video, at most ``seq_len``.
    """
    best_height, best_width = self.get_image_size_with_most_features()
    patch_grid = self.image_preprocessor.get_smarted_resize(
        height=best_height,
        width=best_width,
        min_pixels=self.video_min_pixels,
        max_pixels=self.video_max_pixels,
    )[1]
    patches_h, patches_w = patch_grid
    reduction = self.spatial_conv_size**2 * self.temporal_conv_size
    token_count = (patches_h * patches_w) // reduction
    return min(token_count, seq_len)
def get_mm_max_tokens_per_item(
    self,
    seq_len: int,
) -> Mapping[str, int]:
    """
    Get maximum number of tokens per multimodal item.

    Args:
        seq_len: Maximum model length.

    Returns:
        A mapping from modalities ("image", "video") to their respective
        maximum token counts.
    """
    return {
        "image": self.get_max_image_tokens(seq_len),
        "video": self.get_max_video_tokens(seq_len),
    }