[Feature] consider multimodal model when dummy run (#6045)

* add mm do profile

* update code

* update code

* update code

* update code

* update test case

* update code

* update code

* fix xpu bug

* update code

* add mm do profile

* update test case

* update code
This commit is contained in:
kevin
2026-02-09 17:49:55 +08:00
committed by GitHub
parent 783d56e28a
commit d60daca4a8
25 changed files with 166 additions and 19 deletions
@@ -15,6 +15,7 @@
"""
import traceback
from collections.abc import Mapping
import numpy as np
from paddleformers.generation import GenerationConfig
@@ -318,3 +319,16 @@ class Ernie4_5_VLProcessor(Ernie4_5Processor):
outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64)
outs["mm_num_token_func"] = self.ernie4_5_processor.mm_num_tokens
return outs
def get_mm_max_tokens_per_item(
    self,
    seq_len: int,
) -> Mapping[str, int]:
    """
    Return the per-modality maximum token counts.

    Delegates to the wrapped Ernie4.5 processor.

    Args:
        seq_len: Maximum model length.

    Returns:
        A mapping from modalities to their respective maximum token counts.
    """
    inner_processor = self.ernie4_5_processor
    return inner_processor.get_mm_max_tokens_per_item(seq_len)
@@ -20,6 +20,7 @@ import copy
import os
import pickle
from collections import defaultdict
from collections.abc import Mapping
from typing import Any, Dict, List, Optional, Tuple, Union
import numpy as np
@@ -32,7 +33,7 @@ from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.utils import IDS_TYPE_FLAG, MAX_IMAGE_DIMENSION
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
@@ -745,3 +746,45 @@ class DataProcessor(MMBaseDataProcessor):
req = pickle.dumps((mm_hashes, mm_items))
socket.send_multipart([b"", req])
data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
def get_image_size_with_most_features(self):
    """
    Compute the resized image size that yields the most visual features.

    Starts from the largest allowed square (MAX_IMAGE_DIMENSION on each
    side) and lets the image preprocessor's smart-resize pick the final
    dimensions within the configured pixel budget.

    Returns:
        Tuple ``(resized_height, resized_width)``.
    """
    resize_result = self.image_preprocessor.get_smarted_resize(
        height=MAX_IMAGE_DIMENSION,
        width=MAX_IMAGE_DIMENSION,
        min_pixels=self.image_min_pixels,
        max_pixels=self.image_max_pixels,
    )
    resized_height, resized_width = resize_result[0]
    return (resized_height, resized_width)
def get_max_image_tokens(
    self,
    seq_len: int,
) -> int:
    """
    Upper bound on the number of tokens a single image can occupy.

    Resizes the most-feature-rich image size to its patch grid, converts
    patches to tokens via the spatial conv reduction, and caps the result
    at the model's sequence length.

    Args:
        seq_len: Maximum model length; the result never exceeds this.

    Returns:
        Maximum token count for one image, at most ``seq_len``.
    """
    best_height, best_width = self.get_image_size_with_most_features()
    patch_grid = self.image_preprocessor.get_smarted_resize(
        height=best_height,
        width=best_width,
        min_pixels=self.image_min_pixels,
        max_pixels=self.image_max_pixels,
    )[1]
    patches_h, patches_w = patch_grid
    token_count = (patches_h * patches_w) // (self.spatial_conv_size**2)
    return min(token_count, seq_len)
def get_max_video_tokens(self, seq_len: int) -> int:
    """
    Upper bound on the number of tokens a single video can occupy.

    Same patch-grid computation as for images, but uses the video pixel
    budget and additionally divides by the temporal conv size since video
    tokens are reduced across frames as well.

    Args:
        seq_len: Maximum model length; the result never exceeds this.

    Returns:
        Maximum token count for one video, at most ``seq_len``.
    """
    best_height, best_width = self.get_image_size_with_most_features()
    patch_grid = self.image_preprocessor.get_smarted_resize(
        height=best_height,
        width=best_width,
        min_pixels=self.video_min_pixels,
        max_pixels=self.video_max_pixels,
    )[1]
    patches_h, patches_w = patch_grid
    reduction = self.spatial_conv_size**2 * self.temporal_conv_size
    token_count = (patches_h * patches_w) // reduction
    return min(token_count, seq_len)
def get_mm_max_tokens_per_item(
    self,
    seq_len: int,
) -> Mapping[str, int]:
    """
    Get maximum number of tokens per multimodal item.

    Args:
        seq_len: Maximum model length.

    Returns:
        A mapping from modalities ("image", "video") to their respective
        maximum token counts.
    """
    return {
        "image": self.get_max_image_tokens(seq_len),
        "video": self.get_max_video_tokens(seq_len),
    }