mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[DataProcessor] Move image_processor to unified directory and add MultiModalProcessor (#7109)
* first commit * step 9~10 * update multimodal * update multimodal * fix load tokenizer * add unit test * fix unit test & AdaptiveImageProcessor * Delete unused code
This commit is contained in:
@@ -14,7 +14,13 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from .get_image_preprocessor import get_image_preprocessor
|
||||
from .image_preprocessor_adaptive import AdaptiveImageProcessor
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.adaptive_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||
AdaptiveImageProcessor,
|
||||
get_image_preprocessor,
|
||||
)
|
||||
|
||||
__all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"]
|
||||
|
||||
+6
-17
@@ -14,21 +14,10 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""get image preprocessor"""
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.adaptive_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
from .image_preprocessor_adaptive import AdaptiveImageProcessor
|
||||
|
||||
|
||||
def get_image_preprocessor(args):
    """Build the image preprocessor selected by *args*.

    Returns:
        None when ``args.vision_model_name_or_path`` is unset; otherwise an
        ``AdaptiveImageProcessor`` loaded from that pretrained path.
    """
    model_path = args.vision_model_name_or_path
    if model_path is None:
        return None

    data_processor_logger.info("use AdaptiveImageProcessor")
    return AdaptiveImageProcessor.from_pretrained(model_path)
|
||||
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||
get_image_preprocessor,
|
||||
)
|
||||
|
||||
+7
-493
@@ -14,498 +14,12 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""image preprocessor adaptive"""
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.adaptive_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
convert_to_rgb,
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||
AdaptiveImageProcessor,
|
||||
make_batched_images,
|
||||
make_batched_videos,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
is_valid_image,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image
|
||||
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||
|
||||
IMAGE_FACTOR = 28
|
||||
MIN_PIXELS = 4 * 28 * 28
|
||||
MAX_PIXELS = 16384 * 28 * 28
|
||||
MAX_RATIO = 200
|
||||
|
||||
|
||||
VideoInput = Union[
|
||||
List["PIL.Image.Image"],
|
||||
"np.ndarray",
|
||||
"paddle.Tensor",
|
||||
List["np.ndarray"],
|
||||
List["paddle.Tensor"],
|
||||
List[List["PIL.Image.Image"]],
|
||||
List[List["np.ndarrray"]],
|
||||
List[List["paddle.Tensor"]],
|
||||
]
|
||||
|
||||
|
||||
__all__ = [
|
||||
"AdaptiveImageProcessor",
|
||||
]
|
||||
|
||||
|
||||
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Accepts images in list or nested list format, and makes a list of images for preprocessing.

    images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
        The input image.

    Returns:
        list: A list of images.
    """
    is_sequence = isinstance(images, (list, tuple))

    # Nested list of images: flatten one level.
    if is_sequence and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        flattened = []
        for inner in images:
            flattened.extend(inner)
        return flattened

    # Flat list of images: already batched.
    if is_sequence and is_valid_image(images[0]):
        return images

    # Single image: wrap it.
    if is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
|
||||
|
||||
|
||||
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
|
||||
def make_batched_videos(videos) -> List[VideoInput]:
|
||||
"""dummy"""
|
||||
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
|
||||
return videos
|
||||
|
||||
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
|
||||
if isinstance(videos[0], Image.Image):
|
||||
return [videos]
|
||||
elif len(videos[0].shape) == 4:
|
||||
return [list(video) for video in videos]
|
||||
|
||||
elif is_valid_image(videos) and len(videos.shape) == 4:
|
||||
return [list(videos)]
|
||||
|
||||
raise ValueError(f"Could not make batched video from {videos}")
|
||||
|
||||
|
||||
class AdaptiveImageProcessor(BaseImageProcessor):
    r"""
    Constructs a adaptive image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
            in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `56 * 56`):
            The min pixels of the image to resize the image.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The max pixels of the image to resize the image.
        patch_size (`int`, *optional*, defaults to 14):
            The spacial patch size of the vision encoder.
        temporal_conv_size (`int`, *optional*, defaults to 2):
            The temporal conv size in resampler.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    """

    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 56 * 56,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_conv_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Initialize the processor; unset mean/std fall back to the OpenAI CLIP constants."""
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_conv_size = temporal_conv_size
        self.merge_size = merge_size
        # `size` mirrors the pixel bounds; `set_pixels` keeps it in sync.
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb

    def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
        """Update the min/max pixel bounds (and the mirrored `size` dict) in place.

        Args:
            min_pixels: New lower bound on total pixels; must be a non-negative int.
            max_pixels: New upper bound on total pixels; must be a positive int.
            msg: Prefix for the log line identifying the caller.
        """
        if min_pixels is not None:
            assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
            self.min_pixels = min_pixels
            self.size["min_pixels"] = int(min_pixels)
        if max_pixels is not None:
            assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
            self.max_pixels = max_pixels
            self.size["max_pixels"] = int(max_pixels)

    def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
        """Compute the smart-resized size and its patch grid for an input size.

        Returns:
            ((resized_height, resized_width), (grid_h, grid_w)) where the grid is
            the resized size divided by `patch_size`.  Falls back to the
            processor's own pixel bounds when min/max_pixels are not given.
        """
        actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        resized_height, resized_width = smart_resize(
            height,
            width,
            # Resized dims are multiples of patch_size * merge_size.
            factor=self.patch_size * self.merge_size,
            min_pixels=actual_min_pixels,
            max_pixels=actual_max_pixels,
        )
        return (resized_height, resized_width), (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = True,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
                If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`List[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            (flatten_patches, (grid_t, grid_h, grid_w)): flattened patch matrix of
            shape [grid_t * grid_h * grid_w, C * patch_size * patch_size] and the
            time/height/width patch-grid counts.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        # NOTE: the size of the first frame is used for all frames; assumes every
        # frame in a clip shares one resolution — TODO confirm for video inputs.
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []

        if predetermined_grid_thw is not None:
            assert len(predetermined_grid_thw) == len(
                images
            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"

        for img_idx, image in enumerate(images):
            if do_resize:
                if predetermined_grid_thw is not None:
                    # Caller fixed the patch grid: convert grid cells back to pixels.
                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
                    resized_height *= self.patch_size
                    resized_width *= self.patch_size
                else:
                    resized_height, resized_width = smart_resize(
                        height,
                        width,
                        factor=self.patch_size * self.merge_size,
                        min_pixels=self.min_pixels,
                        max_pixels=self.max_pixels,
                    )
                # TODO: the uint8 cast must be done manually here, otherwise the
                # image is divided by 255 a second time and the result is wrong.
                image = image.astype("uint8")
                # Build the PIL image with fromarray directly instead of relying
                # on paddleformers' internal conversion.
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale:
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]

            processed_images.append(image)
        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            # Bring channels forward so the reshape below sees [time, C, H, W].
            patches = patches.transpose([0, 3, 1, 2])

        channel = patches.shape[1]  # [time, C, H, W]
        grid_t = patches.shape[0]
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Split H and W into (merge blocks, merge_size, patch_size) each.
        patches = patches.reshape(
            [
                grid_t,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])

        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.patch_size * self.patch_size,
            ]
        )  # [grid_t * grid_h * grid_w, C * psz * psz]

        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = True,
        size: Optional[Union[int, List[int]]] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `BatchFeature` with `pixel_values`/`image_grid_thw` for images or
            `pixel_values_videos`/`video_grid_thw` for videos.
        """
        # Explicit arguments win; otherwise fall back to the processor defaults.
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if images is not None:
            images = make_batched_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for img_idx, image in enumerate(images):
                # Each image carries its own (optional) predetermined grid entry.
                if predetermined_grid_thw is not None:
                    predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
                else:
                    predetermined_grid_thw_one = None
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw_one,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data = {
                "pixel_values": pixel_values,
                "image_grid_thw": vision_grid_thws,
            }

        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            # NOTE: this loop rebinds the `images` name — each `images` here is
            # one clip's frame list.
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)

            # NOTE(review): when both images and videos are given, this rebinds
            # `data` and only the video features are returned; when neither is
            # given, `data` is unbound and the return raises NameError.
            data = {
                "pixel_values_videos": pixel_values,
                "video_grid_thw": vision_grid_thws,
            }

        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
@@ -11,3 +11,17 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||
AdaptiveImageProcessor,
|
||||
get_image_preprocessor,
|
||||
)
|
||||
from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401
|
||||
ImageProcessor as PaddleOCRImageProcessor,
|
||||
)
|
||||
from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
|
||||
ImageProcessor as Qwen3ImageProcessor,
|
||||
)
|
||||
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
|
||||
ImageProcessor as QwenImageProcessor,
|
||||
)
|
||||
|
||||
@@ -0,0 +1,524 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""image preprocessor adaptive"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
convert_to_rgb,
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
is_valid_image,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image
|
||||
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
# Normalization constants matching OpenAI CLIP preprocessing.
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

# Patch-grid defaults; IMAGE_FACTOR = 28 is presumably patch_size (14) *
# merge_size (2) — TODO confirm against the vision encoder config.
IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200


# Accepted video containers: a single clip (list of frames or a 4-D
# array/tensor) or a batch of clips.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]


__all__ = [
    "AdaptiveImageProcessor",
    "get_image_preprocessor",
    "make_batched_images",
    "make_batched_videos",
]
|
||||
|
||||
|
||||
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Accepts images in list or nested list format, and makes a list of images for preprocessing.

    images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
        The input image.

    Returns:
        list: A list of images.
    """
    # Nested list of images: flatten one level into a single flat list.
    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        return [img for img_list in images for img in img_list]

    # Flat list of images: already in the desired shape.
    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
        return images

    # Single image: wrap in a one-element list.
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
|
||||
|
||||
|
||||
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
|
||||
def make_batched_videos(videos) -> List[VideoInput]:
|
||||
"""dummy"""
|
||||
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
|
||||
return videos
|
||||
|
||||
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
|
||||
if isinstance(videos[0], Image.Image):
|
||||
return [videos]
|
||||
elif len(videos[0].shape) == 4:
|
||||
return [list(video) for video in videos]
|
||||
|
||||
elif is_valid_image(videos) and len(videos.shape) == 4:
|
||||
return [list(videos)]
|
||||
|
||||
raise ValueError(f"Could not make batched video from {videos}")
|
||||
|
||||
|
||||
class AdaptiveImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a adaptive image processor that dynamically resizes images based on the original images.
|
||||
|
||||
Args:
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to resize the image's (height, width) dimensions.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||
Resampling filter to use when resizing the image.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
|
||||
Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
|
||||
Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
|
||||
in the image.
|
||||
do_convert_rgb (`bool`, *optional*, defaults to `True`):
|
||||
Whether to convert the image to RGB.
|
||||
min_pixels (`int`, *optional*, defaults to `56 * 56`):
|
||||
The min pixels of the image to resize the image.
|
||||
max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
|
||||
The max pixels of the image to resize the image.
|
||||
patch_size (`int`, *optional*, defaults to 14):
|
||||
The spacial patch size of the vision encoder.
|
||||
temporal_conv_size (`int`, *optional*, defaults to 2):
|
||||
The temporal conv size in resampler.
|
||||
merge_size (`int`, *optional*, defaults to 2):
|
||||
The merge size of the vision encoder to llm encoder.
|
||||
"""
|
||||
|
||||
model_input_names = [
|
||||
"pixel_values",
|
||||
"image_grid_thw",
|
||||
"pixel_values_videos",
|
||||
"video_grid_thw",
|
||||
]
|
||||
|
||||
def __init__(
    self,
    do_resize: bool = True,
    resample: PILImageResampling = PILImageResampling.BICUBIC,
    do_rescale: bool = True,
    rescale_factor: float = 1 / 255,
    do_normalize: bool = True,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_convert_rgb: bool = True,
    min_pixels: int = 56 * 56,
    max_pixels: int = 28 * 28 * 1280,
    patch_size: int = 14,
    temporal_conv_size: int = 2,
    merge_size: int = 2,
    **kwargs,
) -> None:
    """Initialize the adaptive image processor.

    See the class docstring for the semantics of each parameter; extra
    keyword arguments are forwarded to the base image-processor class.
    """
    super().__init__(**kwargs)
    self.do_resize = do_resize
    self.resample = resample
    self.do_rescale = do_rescale
    self.rescale_factor = rescale_factor
    self.do_normalize = do_normalize
    # OpenAI CLIP statistics are the defaults when no explicit mean/std is given.
    self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
    self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
    self.min_pixels = min_pixels
    self.max_pixels = max_pixels
    self.patch_size = patch_size
    self.temporal_conv_size = temporal_conv_size
    self.merge_size = merge_size
    # `size` mirrors the pixel bounds in the dict form; kept in sync by `set_pixels`.
    self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
    self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
    """Update the pixel-count bounds used when adaptively resizing images.

    Both ``self.min_pixels``/``self.max_pixels`` and the mirrored entries in
    ``self.size`` are updated so the two stay consistent.

    Args:
        min_pixels (int, optional): New lower bound on total pixels; must be >= 0
            (0 effectively disables the lower bound). Unchanged when None.
        max_pixels (int, optional): New upper bound on total pixels; must be > 0.
            Unchanged when None.
        msg (str): Prefix for the log message, identifying the caller.

    Raises:
        AssertionError: If a provided bound is not an int or is out of range.
    """
    if min_pixels is not None:
        # The condition accepts 0, so the message must say "non-negative",
        # not "positive" (the old message contradicted the check).
        assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be a non-negative int"
        data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
        self.min_pixels = min_pixels
        self.size["min_pixels"] = int(min_pixels)
    if max_pixels is not None:
        assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
        data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
        self.max_pixels = max_pixels
        self.size["max_pixels"] = int(max_pixels)
|
||||
|
||||
def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
    """Compute the adaptively-resized image size and its patch-grid shape.

    Args:
        height (int): Original image height in pixels.
        width (int): Original image width in pixels.
        min_pixels (int, optional): Per-call override for ``self.min_pixels``.
        max_pixels (int, optional): Per-call override for ``self.max_pixels``.

    Returns:
        tuple: ``((resized_height, resized_width), (grid_h, grid_w))`` — the
        smart-resized pixel dimensions and the corresponding patch-grid
        dimensions (resized size divided by ``self.patch_size``).
    """
    actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
    actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
    # smart_resize keeps both sides multiples of patch_size * merge_size while
    # constraining total pixels to [min_pixels, max_pixels].
    resized_height, resized_width = smart_resize(
        height,
        width,
        factor=self.patch_size * self.merge_size,
        min_pixels=actual_min_pixels,
        max_pixels=actual_max_pixels,
    )
    return (resized_height, resized_width), (
        resized_height // self.patch_size,
        resized_width // self.patch_size,
    )
|
||||
|
||||
def _preprocess(
    self,
    images: Union[ImageInput, VideoInput],
    do_resize: bool = True,
    resample: PILImageResampling = None,
    do_rescale: bool = True,
    rescale_factor: float = 1 / 255,
    do_normalize: bool = True,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_convert_rgb: bool = False,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    predetermined_grid_thw=None,
):
    """
    Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

    Resizes (adaptively or per `predetermined_grid_thw`), rescales, normalizes,
    then flattens the batch into merge-ordered patches.

    Args:
        images (`ImageInput`):
            Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
            If pixel values range from 0 to 1, set `do_rescale=False`.
        do_resize (`bool`, *optional*, defaults to `self.do_resize`):
            Whether to resize the image.
        resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
            Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
        do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
            Mean to use if normalizing the image.
            Can be a float or a list of floats corresponding to the number of channels in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
            Standard deviation to use if normalizing the image.
            Can be a float or a list of floats corresponding to the number of channels in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
            Whether to convert the image to RGB.
        data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
            The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - Unset: Use the channel dimension format of the input image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        predetermined_grid_thw (*optional*):
            Per-image (h, w) patch-grid sizes; when given, overrides `smart_resize`.

    Returns:
        tuple: `(flatten_patches, (grid_t, grid_h, grid_w))`.
    """
    images = make_list_of_images(images)

    if do_convert_rgb:
        images = [convert_to_rgb(image) for image in images]

    # All transformations expect numpy arrays.
    images = [to_numpy_array(image) for image in images]

    if is_scaled_image(images[0]) and do_rescale:
        data_processor_logger.warning(
            "It looks like you are trying to rescale already rescaled images. If the input"
            " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
        )
    if input_data_format is None:
        # We assume that all images have the same channel dimension format.
        input_data_format = infer_channel_dimension_format(images[0])

    height, width = get_image_size(images[0], channel_dim=input_data_format)
    resized_height, resized_width = height, width
    processed_images = []

    if predetermined_grid_thw is not None:
        assert len(predetermined_grid_thw) == len(
            images
        ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"

    for img_idx, image in enumerate(images):
        if do_resize:
            if predetermined_grid_thw is not None:
                # Honor the caller-provided (h, w) patch grid for this frame.
                (resized_height, resized_width) = predetermined_grid_thw[img_idx]
                resized_height *= self.patch_size
                resized_width *= self.patch_size
            else:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
            image = image.astype("uint8")  # TODO: must cast manually, otherwise an extra /255 corrupts the result
            # Build the PIL image with fromarray directly instead of relying on paddleformers.
            image = Image.fromarray(image)
            image = resize(
                image,
                size=(resized_height, resized_width),
                resample=resample,
                data_format=input_data_format,
            )
        if do_rescale:
            image = rescale(image, scale=rescale_factor, data_format=input_data_format)

        if do_normalize:
            image = normalize(
                image=image,
                mean=image_mean,
                std=image_std,
                data_format=input_data_format,
            )

        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]

        processed_images.append(image)
    patches = np.array(processed_images)
    # Force channels-first layout before patchification.
    if data_format == ChannelDimension.LAST:
        patches = patches.transpose([0, 3, 1, 2])

    channel = patches.shape[1]  # [time, C, H, W]
    grid_t = patches.shape[0]
    grid_h, grid_w = (
        resized_height // self.patch_size,
        resized_width // self.patch_size,
    )
    patches = patches.reshape(
        [
            grid_t,
            channel,
            grid_h // self.merge_size,
            self.merge_size,
            self.patch_size,
            grid_w // self.merge_size,
            self.merge_size,
            self.patch_size,
        ]
    )
    # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
    patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])

    flatten_patches = patches.reshape(
        [
            grid_t * grid_h * grid_w,
            channel * self.patch_size * self.patch_size,
        ]
    )  # [grid_t * grid_h * grid_w, C * psz * psz]

    return flatten_patches, (grid_t, grid_h, grid_w)
|
||||
|
||||
def preprocess(
    self,
    images: ImageInput,
    videos: VideoInput = None,
    do_resize: bool = True,
    size: Optional[Union[int, List[int]]] = None,
    resample: PILImageResampling = None,
    do_rescale: bool = True,
    rescale_factor: float = 1 / 255,
    do_normalize: bool = True,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_convert_rgb: bool = False,
    return_tensors: Optional[Union[str, TensorType]] = None,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    predetermined_grid_thw=None,
):
    """
    Preprocess images and/or videos into flattened patch tensors.

    Args:
        images (`ImageInput`):
            Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
            passing in images with pixel values between 0 and 1, set `do_rescale=False`.
        videos (`VideoInput`):
            Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
            passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
        do_resize (`bool`, *optional*, defaults to `self.do_resize`):
            Whether to resize the image.
        size (`Dict[str, int]`, *optional*, defaults to `self.size`):
            Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
            the longest edge resized to keep the input aspect ratio.
        resample (`int`, *optional*, defaults to `self.resample`):
            Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
            has an effect if `do_resize` is set to `True`.
        do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
            Rescale factor to rescale the image by if `do_rescale` is set to `True`.
        do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
            Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
        image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
            Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
            `True`.
        do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
            Whether to convert the image to RGB.
        return_tensors (`str` or `TensorType`, *optional*):
            The type of tensors to return. Can be one of:
            - Unset: Return a list of `np.ndarray`.
            - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`.
            - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
        data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
            The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - Unset: Use the channel dimension format of the input image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image. If unset, the channel dimension format is inferred
            from the input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        predetermined_grid_thw (*optional*):
            Per-image (h, w) patch-grid sizes forwarded to `_preprocess`.

    Returns:
        `BatchFeature` with keys among `pixel_values`, `image_grid_thw`,
        `pixel_values_videos`, `video_grid_thw`.
    """
    # Fall back to instance-level configuration for any unset option.
    do_resize = do_resize if do_resize is not None else self.do_resize
    size = size if size is not None else self.size
    resample = resample if resample is not None else self.resample
    do_rescale = do_rescale if do_rescale is not None else self.do_rescale
    rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
    do_normalize = do_normalize if do_normalize is not None else self.do_normalize
    image_mean = image_mean if image_mean is not None else self.image_mean
    image_std = image_std if image_std is not None else self.image_std
    do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

    if images is not None:
        images = make_batched_images(images)
    if videos is not None:
        videos = make_batched_videos(videos)

    if images is not None and not valid_images(images):
        raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

    data = {}

    if images is not None:
        pixel_values, vision_grid_thws = [], []
        for img_idx, image in enumerate(images):
            # Slice out this image's predetermined grid, if any (a 1-element list,
            # since _preprocess validates len(grid) == len(images)).
            if predetermined_grid_thw is not None:
                predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
            else:
                predetermined_grid_thw_one = None
            patches, image_grid_thw = self._preprocess(
                image,
                do_resize=do_resize,
                resample=resample,
                do_rescale=do_rescale,
                rescale_factor=rescale_factor,
                do_normalize=do_normalize,
                image_mean=image_mean,
                image_std=image_std,
                data_format=data_format,
                do_convert_rgb=do_convert_rgb,
                input_data_format=input_data_format,
                predetermined_grid_thw=predetermined_grid_thw_one,
            )
            pixel_values.extend(patches)
            vision_grid_thws.append(image_grid_thw)
        pixel_values = np.array(pixel_values)
        vision_grid_thws = np.array(vision_grid_thws)
        data["pixel_values"] = pixel_values
        data["image_grid_thw"] = vision_grid_thws

    if videos is not None:
        pixel_values, vision_grid_thws = [], []
        # NOTE(review): the loop variable shadows the `images` parameter;
        # each video is a list of frames processed in one _preprocess call.
        for images in videos:
            patches, video_grid_thw = self._preprocess(
                images,
                do_resize=do_resize,
                resample=resample,
                do_rescale=do_rescale,
                rescale_factor=rescale_factor,
                do_normalize=do_normalize,
                image_mean=image_mean,
                image_std=image_std,
                data_format=data_format,
                do_convert_rgb=do_convert_rgb,
                input_data_format=input_data_format,
                predetermined_grid_thw=predetermined_grid_thw,
            )
            pixel_values.extend(patches)
            vision_grid_thws.append(video_grid_thw)
        pixel_values = np.array(pixel_values)
        vision_grid_thws = np.array(vision_grid_thws)
        data["pixel_values_videos"] = pixel_values
        data["video_grid_thw"] = vision_grid_thws

    return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
|
||||
def get_image_preprocessor(args):
    """Build an image preprocessor from parsed arguments.

    Returns an ``AdaptiveImageProcessor`` loaded from
    ``args.vision_model_name_or_path``, or ``None`` when no vision model
    path is configured.
    """
    model_path = args.vision_model_name_or_path
    if model_path is None:
        return None

    data_processor_logger.info("use AdaptiveImageProcessor")
    return AdaptiveImageProcessor.from_pretrained(model_path)
|
||||
@@ -0,0 +1,225 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""Image processor class for PaddleOCR-VL."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ImageInput,
|
||||
is_valid_image,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
)
|
||||
|
||||
from fastdeploy.input.image_processors.common import (
|
||||
smart_resize_paddleocr as smart_resize,
|
||||
)
|
||||
|
||||
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||
|
||||
|
||||
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image(s).

    Returns:
        List[ImageInput]: A flat list of images.

    Raises:
        ValueError: If `images` is empty or not a recognized image / list-of-images
            structure. (Previously an empty list raised `IndexError` from
            `images[0]` instead of this explicit error.)
    """
    if isinstance(images, (list, tuple)) and images:
        # Nested list of images: flatten one level.
        if isinstance(images[0], (list, tuple)) and images[0] and is_valid_image(images[0][0]):
            return [img for img_list in images for img in img_list]

        # Already a flat list of images.
        if is_valid_image(images[0]):
            return images

    elif is_valid_image(images):
        # A single image: wrap it in a list.
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
|
||||
|
||||
|
||||
def adjust_size(size, patch_size):
    """Round *size* down to an even number of *patch_size* patches."""
    patch_count = size // patch_size
    # Drop the trailing patch when the count is odd so the result stays even.
    even_count = patch_count - (patch_count % 2)
    return even_count * patch_size
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
    """Image processor for PaddleOCR-VL.

    Resizes images adaptively (via `smart_resize`), rescales/normalizes them,
    and flattens each image into per-patch tensors plus a [t, h, w] grid
    descriptor for the vision encoder.
    """

    # NOTE(review): model_input_names lists image_grid_thw / video keys, but
    # `preprocess` below emits `grid_thw` and rejects videos — confirm intended.
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: int = 3,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Initialize the processor.

        Args:
            do_resize: Whether to adaptively resize inputs.
            resample: PIL resampling filter code (3 == BICUBIC).
            do_rescale: Whether to multiply pixel values by `rescale_factor`.
            rescale_factor: Scale factor applied to raw pixel values.
            image_mean / image_std: Per-channel normalization statistics;
                default to the OpenAI CLIP values.
            do_convert_rgb: Whether to convert inputs to RGB.
            min_pixels / max_pixels: Bounds on total pixels after resizing.
            patch_size: Spatial patch size of the vision encoder.
            temporal_patch_size: Temporal patch size (only 1 is supported;
                asserted in `_preprocess`).
            merge_size: Merge factor between vision and LLM token grids.
            **kwargs: Accepted for config compatibility.
                NOTE(review): not forwarded to the base class — confirm intended.
        """
        super().__init__()
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
        self.do_convert_rgb = do_convert_rgb

    @classmethod
    def from_pretrained(cls, pretrained_model_dir):
        """Build a processor from `preprocessor_config.json` in the given directory."""
        pretrained_model_dir = Path(pretrained_model_dir)
        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
        with open(image_processor_config_path, "r", encoding="utf-8") as f:
            image_processor_config = json.load(f)
        return cls(**image_processor_config)

    def _preprocess(
        self,
        images,
        do_resize: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
    ):
        """Resize/rescale/normalize `images` and flatten them into patches.

        Returns:
            tuple: `(flatten_patches, np.array([grid_t, grid_h, grid_w]))` where
            flatten_patches has shape [grid_t*grid_h*grid_w, C, patch, patch].
        """
        images = make_list_of_images(images)

        # Inputs are handled as PIL images here (convert/size/resize are PIL APIs).
        if do_convert_rgb:
            images = [image.convert("RGB") for image in images]

        # All images in the batch share the first image's dimensions.
        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []

        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                # PIL takes (width, height) order.
                image = image.resize((resized_width, resized_height), resample=self.resample)

            image = to_numpy_array(image)

            if do_rescale:
                image = (image * rescale_factor).astype(np.float32)

            if do_normalize:
                image = image.astype(np.float32)
                image -= np.array(image_mean, dtype=np.float32)
                image /= np.array(image_std, dtype=np.float32)

            processed_images.append(image)

        patches = np.array(processed_images)
        # HWC -> CHW per frame: [time, C, H, W].
        patches = patches.transpose(0, 3, 1, 2)
        # With temporal_patch_size == 1 (asserted below) this tile is a no-op.
        if patches.shape[0] == 1:
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        # -> [grid_t, grid_h, grid_w, C, temporal, patch, patch]
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        assert self.temporal_patch_size == 1
        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images,
        videos=None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors=None,
    ):
        """Preprocess `images` into a BatchFeature with `pixel_values` and `grid_thw`.

        Any option left as None falls back to the instance-level configuration.

        Raises:
            NotImplementedError: If `videos` is provided.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if videos is not None:
            raise NotImplementedError("Videos are not yet supported")

        patches, image_grid_thw = self._preprocess(
            images,
            do_resize=do_resize,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_convert_rgb=do_convert_rgb,
        )
        pixel_values = np.array(patches)
        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
@@ -0,0 +1,333 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
IMAGE_MEAN = [0.5, 0.5, 0.5]
|
||||
IMAGE_STD = [0.5, 0.5, 0.5]
|
||||
|
||||
MIN_PIXELS = 65536
|
||||
MAX_PIXELS = 16777216
|
||||
|
||||
|
||||
VideoInput = Union[
|
||||
List["PIL.Image.Image"],
|
||||
"np.ndarray",
|
||||
"paddle.Tensor",
|
||||
List["np.ndarray"],
|
||||
List["paddle.Tensor"],
|
||||
List[List["PIL.Image.Image"]],
|
||||
List[List["np.ndarray"]],
|
||||
List[List["paddle.Tensor"]],
|
||||
]
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||
|
||||
This processor handles image resizing, rescaling, normalization and format conversion.
|
||||
It dynamically adjusts image dimensions based on original size and specified constraints.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    patch_size: int = 16,
    merge_size: int = 2,
    temporal_patch_size: int = 2,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
    image_mean: Union[float, List[float]] = IMAGE_MEAN,
    image_std: Union[float, List[float]] = IMAGE_STD,
    rescale_factor: float = 1 / 255,
    do_rescale: bool = True,
    do_normalize: bool = True,
    resample: PILImageResampling = PILImageResampling.BICUBIC,
    **kwargs,
) -> None:
    """
    Initialize image processor with configuration parameters.

    Args:
        patch_size (int): Spatial patch size for vision encoder
        merge_size (int): Merge size between vision and LLM encoders
        temporal_patch_size (int): Temporal patch size for video processing
        min_pixels (int): Minimum allowed pixels in resized image
        max_pixels (int): Maximum allowed pixels in resized image
        image_mean (float/list): Mean values for normalization per channel
        image_std (float/list): Std values for normalization per channel
        rescale_factor (float): Scaling factor for pixel values (default 1/255)
        do_rescale (bool): Whether to rescale images
        do_normalize (bool): Whether to normalize images
        resample: Resampling method for image resizing
        **kwargs: Additional base class arguments (forwarded to super().__init__)
    """
    super().__init__(**kwargs)
    # Vision-encoder patch geometry.
    self.patch_size = patch_size
    self.merge_size = merge_size
    self.temporal_patch_size = temporal_patch_size

    # Bounds on total pixels after adaptive resizing.
    self.min_pixels = min_pixels
    self.max_pixels = max_pixels

    # Rescaling / normalization configuration.
    self.image_mean = image_mean
    self.image_std = image_std
    self.rescale_factor = rescale_factor
    self.do_rescale = do_rescale
    self.do_normalize = do_normalize

    self.resample = resample
|
||||
|
||||
def _preprocess(
    self,
    images: Union[ImageInput, VideoInput],
    min_pixels: int,
    max_pixels: int,
    image_mean: Optional[Union[float, List[float]]],
    image_std: Optional[Union[float, List[float]]],
    rescale_factor: float,
    do_rescale: bool,
    do_normalize: bool,
    resample: PILImageResampling,
    data_format: Optional[ChannelDimension],
    input_data_format: Optional[Union[str, ChannelDimension]],
):
    """
    Internal method for the image preprocessing pipeline.

    Resizes each frame to a patch-aligned resolution, applies (optional)
    rescale/normalize, pads the temporal dimension to a multiple of
    ``temporal_patch_size``, and flattens the result into encoder patches.

    Args:
        images: Input image or batch of images (video frames)
        min_pixels: Minimum allowed pixels in output
        max_pixels: Maximum allowed pixels in output
        image_mean: Normalization mean values
        image_std: Normalization std values
        rescale_factor: Pixel value scaling factor
        do_rescale: Whether to rescale pixel values
        do_normalize: Whether to normalize pixel values
        resample: Resampling method
        data_format: Output channel format
        input_data_format: Input channel format (inferred if None)

    Returns:
        tuple: (flatten_patches, grid_dimensions)
            - flatten_patches: Flattened image patches,
              shape [t*h*w, C*temporal_patch_size*patch_size*patch_size]
            - grid_dimensions: Grid dimensions [t, h, w]
    """
    images = make_list_of_images(images)

    # All transformations below expect numpy arrays.
    images = [to_numpy_array(image) for image in images]

    if is_scaled_image(images[0]) and do_rescale:
        data_processor_logger.warning(
            "It looks like you are trying to rescale already rescaled images. If the input"
            " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
        )
    if input_data_format is None:
        # We assume that all images share the same channel dimension format.
        input_data_format = infer_channel_dimension_format(images[0])

    # Use the first frame's size to pick one patch-aligned target resolution
    # for the whole batch.
    height, width = get_image_size(images[0], channel_dim=input_data_format)
    resized_height, resized_width = smart_resize(
        height,
        width,
        factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )

    processed_images = []
    for image in images:
        if height != resized_height or width != resized_width:
            # Convert to uint8 before resizing to avoid double scaling
            image = image.astype("uint8")
            # Convert to PIL Image and resize
            image = Image.fromarray(image)
            image = resize(
                image,
                size=(resized_height, resized_width),
                resample=resample,
                data_format=input_data_format,
            )

        if do_rescale and do_normalize:
            # Fold rescaling into normalization: (x*s - m)/d == (x - m/s)/(d/s).
            # mean/std are adjusted once; do_rescale is cleared so subsequent
            # frames reuse the already-adjusted statistics and skip rescaling.
            image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
            image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
            do_rescale = False  # Skip separate rescale step

        # Rescale and normalize are mutually exclusive at this point: the
        # combined branch above disables do_rescale when both were requested.
        if do_rescale:
            image = image.astype(np.float32)
            image = rescale(image, scale=rescale_factor, data_format=input_data_format)

        if do_normalize:
            image = image.astype(np.float32)
            image = normalize(
                image=image,
                mean=image_mean,
                std=image_std,
                data_format=input_data_format,
            )

        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
        processed_images.append(image)

    # Stack processed frames into a single array.
    patches = np.array(processed_images)

    # Pad the temporal dimension by repeating the last frame until the frame
    # count is a multiple of temporal_patch_size.
    if patches.shape[0] % self.temporal_patch_size != 0:
        repeats = np.repeat(
            patches[-1][np.newaxis],
            self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
            axis=0,
        )
        patches = np.concatenate([patches, repeats], axis=0)

    # Convert to channels-first layout if the caller asked for channels-last.
    if data_format == ChannelDimension.LAST:
        patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]

    grid_t, channel = patches.shape[:2]
    grid_t = grid_t // self.temporal_patch_size

    grid_h, grid_w = (
        resized_height // self.patch_size,
        resized_width // self.patch_size,
    )
    # Reshape into hierarchical patch structure
    patches = patches.reshape(
        [
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h // self.merge_size,
            self.merge_size,
            self.patch_size,
            grid_w // self.merge_size,
            self.merge_size,
            self.patch_size,
        ]
    )
    # Reorder dimensions for better memory access pattern
    # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
    patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])

    flatten_patches = patches.reshape(
        [
            grid_t * grid_h * grid_w,
            channel * self.temporal_patch_size * self.patch_size * self.patch_size,
        ]
    )

    return flatten_patches, np.array([grid_t, grid_h, grid_w])
def preprocess(
    self,
    images: Union[ImageInput, VideoInput],
    min_pixels: Optional[int] = None,
    max_pixels: Optional[int] = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    rescale_factor: Optional[float] = None,
    do_rescale: Optional[bool] = None,
    do_normalize: Optional[bool] = None,
    resample: Optional[PILImageResampling] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
):
    """
    Main preprocessing entry point for images/videos.

    Any argument left as ``None`` falls back to the value configured on
    the processor instance.

    Args:
        images: Input image/video data
        min_pixels: Override for minimum pixels
        max_pixels: Override for maximum pixels
        image_mean: Override for normalization mean
        image_std: Override for normalization std
        rescale_factor: Override for rescaling factor
        do_rescale: Override for rescaling flag
        do_normalize: Override for normalization flag
        resample: Override for resampling method
        return_tensors: Desired output tensor format
        data_format: Output channel dimension format
        input_data_format: Input channel dimension format

    Returns:
        BatchFeature: Processed features containing:
            - pixel_values: Preprocessed pixel data
            - grid_thw: Grid dimensions [temporal, height, width]

    Raises:
        ValueError: For invalid image types or dimensions
    """
    # Resolve per-call overrides against instance-level defaults.
    if min_pixels is None:
        min_pixels = self.min_pixels
    if max_pixels is None:
        max_pixels = self.max_pixels
    if image_mean is None:
        image_mean = self.image_mean
    if image_std is None:
        image_std = self.image_std
    if rescale_factor is None:
        rescale_factor = self.rescale_factor
    if do_rescale is None:
        do_rescale = self.do_rescale
    if do_normalize is None:
        do_normalize = self.do_normalize
    if resample is None:
        resample = self.resample

    if images is not None and not valid_images(images):
        raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, paddle.Tensor.")

    pixel_values, grid_thw = self._preprocess(
        images,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
        image_mean=image_mean,
        image_std=image_std,
        rescale_factor=rescale_factor,
        do_rescale=do_rescale,
        do_normalize=do_normalize,
        resample=resample,
        data_format=data_format,
        input_data_format=input_data_format,
    )
    return BatchFeature(
        data={"pixel_values": pixel_values, "grid_thw": grid_thw},
        tensor_type=return_tensors,
    )
|
||||
@@ -0,0 +1,332 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
# Per-channel normalization statistics from OpenAI CLIP preprocessing.
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

# Bounds on total pixel count after adaptive resizing.
# 28 = default patch_size (14) * default merge_size (2).
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28


# Accepted video input containers: PIL frames, arrays/tensors,
# or (nested) lists of either.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.

    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints.
    """

    def __init__(
        self,
        patch_size: int = 14,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
        image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.

        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        # Patch geometry used when cutting images into encoder patches.
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size

        # Pixel-count bounds enforced by the adaptive resize step.
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels

        # Rescale / normalization configuration.
        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize

        self.resample = resample

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for the image preprocessing pipeline.

        Resizes each frame to a patch-aligned resolution, applies (optional)
        rescale/normalize, pads the temporal dimension to a multiple of
        ``temporal_patch_size``, and flattens the result into encoder patches.

        Args:
            images: Input image or batch of images (video frames)
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format (inferred if None)

        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches,
                  shape [t*h*w, C*temporal_patch_size*patch_size*patch_size]
                - grid_dimensions: Grid dimensions [t, h, w]
        """
        images = make_list_of_images(images)

        # All transformations below expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images share the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        # Use the first frame's size to pick one patch-aligned target
        # resolution for the whole batch.
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )

        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )

            if do_rescale and do_normalize:
                # Fold rescaling into normalization: (x*s - m)/d == (x - m/s)/(d/s).
                # mean/std are adjusted once; do_rescale is cleared so subsequent
                # frames reuse the already-adjusted statistics and skip rescaling.
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step

            # Rescale and normalize are mutually exclusive at this point: the
            # combined branch above disables do_rescale when both were requested.
            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)

        # Stack processed frames into a single array.
        patches = np.array(processed_images)

        # Pad the temporal dimension by repeating the last frame until the
        # frame count is a multiple of temporal_patch_size.
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)

        # Convert to channels-first layout if the caller asked for channels-last.
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]

        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size

        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])

        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )

        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.

        Any argument left as ``None`` falls back to the value configured on
        the processor instance.

        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format

        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]

        Raises:
            ValueError: For invalid image types or dimensions
        """
        # Resolve per-call overrides against instance-level defaults.
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
@@ -0,0 +1,453 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""Unified multimodal processor for all VL model types.
|
||||
|
||||
Consolidates the four separate VL processor wrappers (QwenVLProcessor,
|
||||
Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a
|
||||
single class that dispatches per ``model_type``.
|
||||
"""
|
||||
|
||||
from collections.abc import Mapping
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from fastdeploy.input.base_processor import BaseTextProcessor
|
||||
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
# Canonical model-type identifiers handled by MultiModalProcessor.
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"

_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL}

# Expected value types for user-supplied mm_processor_kwargs on
# Qwen-family (and PaddleOCR) models; used for validation only.
_QWEN_EXPECTED_KWARGS = {
    "video_max_frames": int,
    "video_min_frames": int,
}

# Expected value types for user-supplied mm_processor_kwargs on
# ERNIE 4.5 VL models; used for validation only.
_ERNIE_EXPECTED_KWARGS = {
    "spatial_conv_size": int,
    "temporal_conv_size": int,
    "image_min_pixels": int,
    "image_max_pixels": int,
    "video_min_pixels": int,
    "video_max_pixels": int,
    "video_target_frames": int,
    "video_frames_sample": str,
    "video_max_frames": int,
    "video_min_frames": int,
    "video_fps": int,
}

# Default per-prompt multimodal item limits; user-provided limits are
# merged on top of these.
_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}

# NOTE(review): epsilon presumably used for float comparisons of sampling
# parameters — its usage is not visible in this chunk; confirm before relying on it.
_SAMPLING_EPS = 1e-5
|
||||
|
||||
|
||||
class MultiModalProcessor(BaseTextProcessor):
|
||||
"""Unified multimodal processor for all supported VL model types.
|
||||
|
||||
Dispatches image-processor creation, config initialisation, and
|
||||
encoding logic based on ``model_type``.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    model_name_or_path: str,
    model_type: str,
    config=None,
    limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
    mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    reasoning_parser_obj=None,
    tool_parser_obj=None,
    enable_processor_cache: bool = False,
):
    """Build a processor for the given checkpoint and VL model type.

    Args:
        model_name_or_path: Checkpoint directory or model identifier.
        model_type: One of the supported VL model-type constants.
        config: Optional model config (used e.g. for vision_config hints).
        limit_mm_per_prompt: Optional per-modality item limits.
        mm_processor_kwargs: Optional kwargs forwarded to the inner processor.
        reasoning_parser_obj: Optional reasoning parser.
        tool_parser_obj: Optional tool-call parser.
        enable_processor_cache: Whether the inner processor caches results.

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    # Reject unknown model types up front, before any state is touched.
    if model_type not in _SUPPORTED_MODEL_TYPES:
        raise ValueError(f"Unsupported model_type '{model_type}'. Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}.")
    self.model_type = model_type
    self.config = config
    self.enable_processor_cache = enable_processor_cache

    # ERNIE checkpoints need their dedicated tokenizer implementation.
    tok_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto"

    super().__init__(
        model_name_or_path,
        tokenizer_type=tok_type,
        reasoning_parser_obj=reasoning_parser_obj,
        tool_parser_obj=tool_parser_obj,
    )

    data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")

    parsed_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
    self._init_mm_processor(parsed_kwargs)
    self._init_mm_config()
    self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
|
||||
|
||||
def _load_tokenizer(self):
    """Load the tokenizer matching this model type.

    ERNIE 4.5 models use their dedicated sentencepiece tokenizer whose
    vocab file name varies across checkpoints; everything else goes
    through paddleformers' AutoTokenizer.
    """
    if self.tokenizer_type != "ernie4_5":
        from paddleformers.transformers import AutoTokenizer

        return AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)

    import os

    from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer

    # Probe for whichever vocab file this checkpoint actually ships and
    # point the tokenizer class at it before loading.
    candidates = ["tokenizer.model", "spm.model", "ernie_token_100k.model"]
    found = next(
        (name for name in candidates if os.path.exists(os.path.join(self.model_name_or_path, name))),
        None,
    )
    if found is not None:
        Ernie4_5Tokenizer.resource_files_names["vocab_file"] = found
    return Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
|
||||
|
||||
def _init_mm_processor(self, processor_kwargs: dict):
    """Instantiate the model-type-specific DataProcessor as ``self.processor``."""
    if self.model_type in (QWEN_VL, PADDLEOCR_VL):
        # Qwen-VL and PaddleOCR-VL construct their processors identically;
        # only the import path differs. Both read a tokens-per-second hint
        # from the vision config (default 2).
        if self.model_type == QWEN_VL:
            from fastdeploy.input.qwen_vl_processor.process import DataProcessor
        else:
            from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor

        tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
        self.processor = DataProcessor(
            model_path=self.model_name_or_path,
            enable_processor_cache=self.enable_processor_cache,
            tokens_per_second=tokens_per_second,
            tokenizer=self.tokenizer,
            **processor_kwargs,
        )
    elif self.model_type == QWEN3_VL:
        from fastdeploy.input.qwen3_vl_processor.process import DataProcessor

        self.processor = DataProcessor(
            model_path=self.model_name_or_path,
            enable_processor_cache=self.enable_processor_cache,
            tokenizer=self.tokenizer,
            **processor_kwargs,
        )
    elif self.model_type == ERNIE4_5_VL:
        from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor

        self.processor = DataProcessor(
            tokenizer_name=self.model_name_or_path,
            image_preprocessor_name=self.model_name_or_path,
            enable_processor_cache=self.enable_processor_cache,
            **processor_kwargs,
        )
        # Only the ERNIE processor carries model modules that must be
        # switched to eval mode.
        self.processor.eval()
|
||||
|
||||
def _init_mm_config(self):
    """Copy model-type-specific multimodal attributes off the inner processor."""
    proc = self.processor
    if self.model_type in (QWEN_VL, QWEN3_VL):
        # Qwen processors name the placeholder id "image_token_id".
        self.image_patch_id = proc.image_token_id
    elif self.model_type == PADDLEOCR_VL:
        self.image_patch_id = proc.image_patch_id
    elif self.model_type == ERNIE4_5_VL:
        self.image_patch_id = proc.image_patch_id
        # ERNIE additionally exposes its spatial conv size.
        self.spatial_conv_size = proc.spatial_conv_size
|
||||
|
||||
def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
    """Validate user-supplied multimodal processor kwargs.

    Returns the kwargs unchanged when valid. Best-effort: on any
    validation failure a warning is logged and ``{}`` is returned
    rather than raising to the caller.
    """
    if not kwargs:
        return {}

    try:
        if not isinstance(kwargs, dict):
            raise ValueError("mm-processor-kwargs must be a dictionary")

        data_processor_logger.info(f"Processing kwargs: {kwargs}")

        # Pick the expected-type table for this model family.
        expected = _ERNIE_EXPECTED_KWARGS if self.model_type == ERNIE4_5_VL else _QWEN_EXPECTED_KWARGS

        for key, value in kwargs.items():
            wanted = expected.get(key)
            if wanted is not None and not isinstance(value, wanted):
                raise ValueError(f"Invalid type for {key}: expected {wanted.__name__}, got {type(value).__name__}")
        return kwargs

    except Exception as e:
        data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
        return {}
|
||||
|
||||
def _parse_limits(self, limits: Optional[dict]) -> dict:
    """Merge user-provided per-modality limits over the defaults.

    Best-effort: invalid input logs a warning and falls back to the
    default limits instead of raising.
    """
    if not limits:
        return dict(_DEFAULT_MM_LIMITS)

    try:
        if not isinstance(limits, dict):
            raise ValueError("limit-mm-per-prompt must be a dictionary")
        data_processor_logger.info(f"_parse_limits:{limits}")
        merged = dict(_DEFAULT_MM_LIMITS)
        merged.update(limits)
        return merged
    except Exception as e:
        data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
        return dict(_DEFAULT_MM_LIMITS)
|
||||
|
||||
def _check_mm_limits(self, item):
    """Raise ValueError when a prompt exceeds the per-modality item limits.

    ``item`` is either a modality->items mapping, or a chat-message list
    whose image/video content parts are collected first.
    """
    if isinstance(item, dict):
        # Already a modality -> items mapping.
        mm_data = item
    else:
        # Chat-message list: gather image/video parts from every message.
        mm_data = {"image": [], "video": []}
        for message in item:
            content = message.get("content")
            if not isinstance(content, list):
                continue
            for part in content:
                kind = part.get("type")
                if kind in ("image_url", "image"):
                    mm_data["image"].append(part)
                elif kind in ("video_url", "video"):
                    mm_data["video"].append(part)

    for modality, data in mm_data.items():
        if modality not in self.limit_mm_per_prompt:
            continue
        limit = self.limit_mm_per_prompt[modality]
        if len(data) > limit:
            raise ValueError(f"Too many {modality} items in prompt, got {len(data)} but limit is {limit}")
|
||||
|
||||
def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
    """Return per-modality max token counts; only ERNIE exposes this."""
    if self.model_type != ERNIE4_5_VL:
        return None
    return self.processor.get_mm_max_tokens_per_item(seq_len)
|
||||
|
||||
def process_request_dict(self, request, max_model_len=None):
    """Process a request dictionary into model inputs.

    Unified template-method flow for all VL model types. Per-model
    differences are handled by small conditional branches rather than
    duplicating the entire pipeline.

    Args:
        request: Mutable request dict; updated in place and returned.
        max_model_len: Optional context-length cap used to truncate the
            prompt and to budget ``max_tokens``. When None, no length
            budgeting is performed.

    Returns:
        The same request dict, augmented with tokenized inputs.
    """
    request = self._apply_default_parameters(request)

    if not request.get("eos_token_ids"):
        request["eos_token_ids"] = self.eos_token_ids

    self._process_stop_tokens(request)

    if self.model_type != PADDLEOCR_VL:
        self._process_bad_words(request)

    if self.model_type == ERNIE4_5_VL:
        logits_processors_args = self._prepare_think_stop_sentence(
            request.get("logits_processors_args") or {}, max_model_len
        )
        request["logits_processors_args"] = logits_processors_args

    outputs = self._tokenize_request(request)

    self._process_post_tokens(request, outputs)

    if self.model_type in (QWEN_VL, QWEN3_VL):
        request["enable_thinking"] = False

    outputs = self.pack_outputs(outputs)

    if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"):
        pass  # preserve existing prompt_token_ids
    else:
        request["prompt_token_ids"] = outputs["input_ids"].tolist()
    request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
    request["multimodal_inputs"] = outputs

    if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
        request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]

    if self.model_type == ERNIE4_5_VL:
        logits_processors_args = self._update_thinking_prompt_state(
            request["prompt_token_ids"], request.get("logits_processors_args") or {}
        )
        request["logits_processors_args"] = logits_processors_args

    # Budget max_tokens against the remaining context window. Guard against
    # max_model_len=None (the parameter default), which previously raised a
    # TypeError on the subtraction below.
    if max_model_len is not None:
        max_tokens = max_model_len - len(request["prompt_token_ids"])
        if request.get("max_tokens") is None:
            request["max_tokens"] = max(1, max_tokens)
        else:
            request["max_tokens"] = min(max_tokens, request["max_tokens"])

    if (
        self.model_type == ERNIE4_5_VL
        and request.get("reasoning_max_tokens") is None
        and request.get("max_tokens") is not None
    ):
        # Default the thinking budget to 80% of the generation budget.
        request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)

    if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL):
        # A near-zero top_p effectively requests greedy decoding; clamp it
        # and force top_k=1 for numerical stability.
        if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
            request["top_p"] = _SAMPLING_EPS
            request["top_k"] = 1

    if self.model_type != QWEN3_VL and self.reasoning_parser:
        self._apply_reasoning_parser(request)

    if self.model_type == ERNIE4_5_VL:
        if (
            request.get("response_max_tokens") is not None
            and request.get("enable_thinking") is False
            and request.get("max_tokens") is not None
        ):
            request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])

    data_processor_logger.info(f"Processed request {request}")
    return request
|
||||
|
||||
def _process_stop_tokens(self, request):
    """Handle stop-token processing according to the model type."""
    if self.model_type != QWEN3_VL:
        # Generic path shared by all non-qwen3 models.
        process_stop_token_ids(request, self.update_stop_seq)
        return

    stop_sequences = request.get("stop", [])
    if not stop_sequences:
        return
    stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
    request["stop_token_ids"] = stop_seqs
    request["stop_seqs_len"] = stop_seqs_len
|
||||
|
||||
def _process_bad_words(self, request):
    """Convert any bad_words strings in the request into token ids."""
    words = request.get("bad_words")
    if not words:
        return
    # Merge with any token ids the caller already supplied.
    request["bad_words_token_ids"] = self.update_bad_words(words, request.get("bad_words_token_ids"))
|
||||
|
||||
def _tokenize_request(self, request):
    """Core tokenization dispatch, in priority order:
    prompt_token_ids > prompt > messages. Raises ValueError when none is present.
    """
    # ERNIE defaults to thinking mode; the qwen family does not.
    default_thinking = self.model_type == ERNIE4_5_VL

    if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL):
        messages = request.get("messages")
        if messages:
            self._check_mm_limits(messages)
        request.setdefault("enable_thinking", default_thinking)
        return self.processor.prompt_token_ids2outputs(request)

    if request.get("prompt"):
        mm_data = request.get("multimodal_data") or {}
        self._check_mm_limits(mm_data)
        if self.model_type == ERNIE4_5_VL:
            request["prompt_tokens"] = request.get("prompt")
        request.setdefault("enable_thinking", default_thinking)
        return self.processor.text2ids(
            request["prompt"],
            mm_data.get("image", None),
            mm_data.get("video", None),
        )

    if request.get("messages"):
        messages = request["messages"]
        self._check_mm_limits(messages)
        chat_template_kwargs = request.get("chat_template_kwargs")
        if chat_template_kwargs:
            if not isinstance(chat_template_kwargs, dict):
                raise ValueError("Invalid input: chat_template_kwargs must be a dict")
            # Template kwargs only fill fields the request does not already set.
            for key, value in chat_template_kwargs.items():
                if request.get(key) is None:
                    request[key] = value
        request.setdefault("enable_thinking", default_thinking)
        return self.processor.request2ids(request)

    raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
|
||||
|
||||
def _process_post_tokens(self, request, outputs):
    """Append any pre-generated/completion tokens after tokenization."""
    if self.model_type == PADDLEOCR_VL:
        # PaddleOCR carries its continuation tokens inside request metadata.
        metadata = request.get("metadata") or {}
        if metadata.get("generated_token_ids"):
            self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"])
        return

    completion_token_ids = request.get("completion_token_ids")
    if completion_token_ids:
        self.append_completion_tokens(outputs, completion_token_ids)
|
||||
|
||||
def _apply_reasoning_parser(self, request):
    """Run the reasoning parser and record per-request model status."""
    model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
    request_id = request["request_id"]
    parts = request_id.split("_")
    if len(parts) > 1:
        # Composite id "<real_id>_<index>": fan the status out to every
        # sampled candidate of this request.
        real_req_id, index = parts[0], int(parts[1])
        n = request.get("n", 1)
        for idx in range(index * n, (index + 1) * n):
            self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
    else:
        self.model_status_dict[request_id] = model_status
    request["enable_thinking"] = model_status == "think_start"
|
||||
|
||||
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
    """Append completion tokens to existing multimodal outputs."""
    # ERNIE uses its own 3-axis position-id layout; everything else shares
    # the qwen-style appender.
    appender = (
        self._append_completion_tokens_ernie
        if self.model_type == ERNIE4_5_VL
        else self._append_completion_tokens_qwen
    )
    appender(multimodal_inputs, completion_token_ids)
|
||||
|
||||
def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids):
    """Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl outputs."""
    count = len(completion_token_ids)
    multimodal_inputs["input_ids"].extend(completion_token_ids)
    multimodal_inputs["token_type_ids"].extend([0] * count)

    # Extend position ids for the newly appended text tokens and advance
    # the running cursor.
    new_positions = self.processor._compute_text_positions(multimodal_inputs["cur_position"], count)
    multimodal_inputs["position_ids"].append(new_positions)
    multimodal_inputs["cur_position"] += count
|
||||
|
||||
def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids):
    """Append completion tokens for ernie4_5_vl outputs (3-axis position ids)."""
    count = len(completion_token_ids)
    multimodal_inputs["input_ids"].extend(completion_token_ids)
    multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * count)

    # Each text token repeats its linear position across all 3 axes.
    base = multimodal_inputs["cur_position"]
    multimodal_inputs["position_ids"].extend([base + offset] * 3 for offset in range(count))
    multimodal_inputs["cur_position"] += count
|
||||
|
||||
def pack_outputs(self, outputs):
    """Convert intermediate processing outputs to the final packed format."""
    if outputs["images"]:
        outputs["images"] = np.vstack(outputs["images"])
        outputs["grid_thw"] = np.vstack(outputs["grid_thw"])
        outputs["image_type_ids"] = np.array(outputs["image_type_ids"])
    else:
        # No visual inputs in this request.
        outputs["images"] = None
        outputs["grid_thw"] = None
        outputs["image_type_ids"] = None

    outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
    outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
    outputs["mm_num_token_func"] = self.processor.mm_num_tokens

    if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
        # Qwen-style processors emit per-segment position ids; concatenate
        # along the sequence axis, then transpose to sequence-major layout.
        position_ids = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
        outputs["position_ids"] = position_ids.transpose(1, 0)
        outputs["image_patch_id"] = self.processor.image_token_id
        outputs["video_patch_id"] = self.processor.video_token_id
    else:
        outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
        outputs["image_patch_id"] = self.image_patch_id

    return outputs
|
||||
@@ -14,216 +14,12 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""Image processor class for Keye."""
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.paddleocr_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
# TODO: Support videos
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ImageInput,
|
||||
is_valid_image,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401
|
||||
ImageProcessor,
|
||||
make_batched_images,
|
||||
smart_resize,
|
||||
)
|
||||
|
||||
from fastdeploy.input.image_processors.common import (
|
||||
smart_resize_paddleocr as smart_resize,
|
||||
)
|
||||
|
||||
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||
|
||||
|
||||
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Accepts images in list or nested list format, and makes a flat list of
    images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A list of images.
    """
    is_sequence = isinstance(images, (list, tuple))

    # Nested list/tuple of images -> flatten one level.
    if is_sequence and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        return [img for inner in images for img in inner]

    # Already a flat list/tuple of images.
    if is_sequence and is_valid_image(images[0]):
        return images

    # A single image -> wrap in a list.
    if is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
|
||||
|
||||
|
||||
def adjust_size(size, patch_size):
    """Round *size* down to an even number of patches, expressed in pixels.

    Returns the largest multiple of ``patch_size`` not exceeding ``size``
    whose patch count is even (vision encoders with 2x merge need this).
    """
    num_patches = size // patch_size
    # Drop a trailing odd patch so the count is always even.
    return (num_patches - num_patches % 2) * patch_size
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
    """Image processor for PaddleOCR-VL style vision inputs.

    Resizes images to patch-aligned dimensions, optionally rescales and
    normalizes pixel values, and flattens the result into vision-encoder
    patches together with the [t, h, w] patch-grid shape.
    """

    # Output keys this processor may emit in its BatchFeature.
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: int = 3,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Initialize the processor.

        Args:
            do_resize: Whether to resize images to patch-aligned dimensions.
            resample: PIL resampling filter id used when resizing.
            do_rescale: Whether to multiply pixel values by ``rescale_factor``.
            rescale_factor: Scaling factor applied to raw pixel values.
            do_normalize: Whether to normalize with ``image_mean``/``image_std``.
            image_mean: Per-channel mean (defaults to the OpenAI CLIP mean).
            image_std: Per-channel std (defaults to the OpenAI CLIP std).
            do_convert_rgb: Whether to convert inputs to RGB first.
            min_pixels: Lower bound on resized image area.
            max_pixels: Upper bound on resized image area.
            patch_size: Spatial patch size of the vision encoder.
            temporal_patch_size: Temporal patch size (must be 1; asserted in
                ``_preprocess``).
            merge_size: Patch-merge factor between vision encoder and LLM.
            **kwargs: Extra config keys; ignored (not forwarded to the base).
        """
        super().__init__()
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
        self.do_convert_rgb = do_convert_rgb

    @classmethod
    def from_pretrained(cls, pretrained_model_dir):
        """Build a processor from a directory containing preprocessor_config.json."""
        pretrained_model_dir = Path(pretrained_model_dir)
        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
        with open(image_processor_config_path, "r", encoding="utf-8") as f:
            image_processor_config = json.load(f)
        return cls(**image_processor_config)

    def _preprocess(
        self,
        images,
        do_resize: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
    ):
        """Resize/rescale/normalize PIL images and flatten into patches.

        Returns:
            tuple: ``(flatten_patches, grid)`` where ``flatten_patches`` has
            shape ``[t*h*w, C, patch, patch]`` and ``grid`` is ``[t, h, w]``.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [image.convert("RGB") for image in images]

        # NOTE(review): all images in the batch are assumed to share the
        # dimensions of the first one — confirm with callers.
        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []

        for image in images:
            if do_resize:
                # Pick patch-aligned dimensions within the pixel budget.
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                image = image.resize((resized_width, resized_height), resample=self.resample)

            image = to_numpy_array(image)

            if do_rescale:
                image = (image * rescale_factor).astype(np.float32)

            if do_normalize:
                image = image.astype(np.float32)
                image -= np.array(image_mean, dtype=np.float32)
                image /= np.array(image_std, dtype=np.float32)

            processed_images.append(image)

        patches = np.array(processed_images)
        # [N, H, W, C] -> [N, C, H, W]
        patches = patches.transpose(0, 3, 1, 2)
        if patches.shape[0] == 1:
            # Duplicate a single frame to fill the temporal patch dimension.
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

        # Split spatial dims into a patch grid.
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        # Flattening below folds the temporal axis away, so it must be 1.
        assert self.temporal_patch_size == 1
        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images,
        videos=None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors=None,
    ):
        """Preprocess images into model-ready pixel values.

        Per-call arguments override the instance defaults. Videos are not
        supported and raise NotImplementedError.

        Returns:
            BatchFeature with ``pixel_values`` and ``grid_thw``.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if videos is not None:
            raise NotImplementedError("Videos are not yet supported")

        patches, image_grid_thw = self._preprocess(
            images,
            do_resize=do_resize,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_convert_rgb=do_convert_rgb,
        )
        pixel_values = np.array(patches)
        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
@@ -91,54 +91,34 @@ class InputPreprocessor:
|
||||
tool_parser_obj=tool_parser_obj,
|
||||
)
|
||||
else:
|
||||
from fastdeploy.input.multimodal_processor import (
|
||||
ERNIE4_5_VL,
|
||||
PADDLEOCR_VL,
|
||||
QWEN3_VL,
|
||||
QWEN_VL,
|
||||
MultiModalProcessor,
|
||||
)
|
||||
|
||||
if ErnieArchitectures.contains_ernie_arch(architecture):
|
||||
from fastdeploy.input.ernie4_5_vl_processor import (
|
||||
Ernie4_5_VLProcessor,
|
||||
)
|
||||
|
||||
self.processor = Ernie4_5_VLProcessor(
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
tool_parser_obj=tool_parser_obj,
|
||||
enable_processor_cache=self.enable_processor_cache,
|
||||
)
|
||||
model_type = ERNIE4_5_VL
|
||||
elif "PaddleOCRVL" in architecture:
|
||||
from fastdeploy.input.paddleocr_vl_processor import (
|
||||
PaddleOCRVLProcessor,
|
||||
)
|
||||
|
||||
self.processor = PaddleOCRVLProcessor(
|
||||
config=self.model_config,
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
)
|
||||
model_type = PADDLEOCR_VL
|
||||
elif "Qwen2_5_VL" in architecture:
|
||||
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
|
||||
|
||||
self.processor = QwenVLProcessor(
|
||||
config=self.model_config,
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
enable_processor_cache=self.enable_processor_cache,
|
||||
)
|
||||
model_type = QWEN_VL
|
||||
elif "Qwen3VL" in architecture:
|
||||
from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor
|
||||
|
||||
self.processor = Qwen3VLProcessor(
|
||||
config=self.model_config,
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
enable_processor_cache=self.enable_processor_cache,
|
||||
)
|
||||
model_type = QWEN3_VL
|
||||
else:
|
||||
raise ValueError(f"Unsupported model processor architecture: {architecture}. ")
|
||||
|
||||
self.processor = MultiModalProcessor(
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
model_type=model_type,
|
||||
config=self.model_config,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
tool_parser_obj=tool_parser_obj,
|
||||
enable_processor_cache=self.enable_processor_cache,
|
||||
)
|
||||
|
||||
return self.processor
|
||||
|
||||
@@ -14,320 +14,10 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.qwen3_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
|
||||
ImageProcessor,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
IMAGE_MEAN = [0.5, 0.5, 0.5]
|
||||
IMAGE_STD = [0.5, 0.5, 0.5]
|
||||
|
||||
MIN_PIXELS = 65536
|
||||
MAX_PIXELS = 16777216
|
||||
|
||||
|
||||
VideoInput = Union[
|
||||
List["PIL.Image.Image"],
|
||||
"np.ndarray",
|
||||
"paddle.Tensor",
|
||||
List["np.ndarray"],
|
||||
List["paddle.Tensor"],
|
||||
List[List["PIL.Image.Image"]],
|
||||
List[List["np.ndarray"]],
|
||||
List[List["paddle.Tensor"]],
|
||||
]
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.

    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints.
    """

    def __init__(
        self,
        patch_size: int = 16,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = IMAGE_MEAN,
        image_std: Union[float, List[float]] = IMAGE_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.

        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size

        self.min_pixels = min_pixels
        self.max_pixels = max_pixels

        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize

        self.resample = resample

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for image preprocessing pipeline.

        Args:
            images: Input image or batch of images
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format

        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches
                - grid_dimensions: Grid dimensions [t, h, w]
        """
        images = make_list_of_images(images)

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        # Get original dimensions and calculate optimal resize dimensions
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )

        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )

            if do_rescale and do_normalize:
                # Adjust mean and std for combined rescale+normalize
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step

            # mutual exclusion and upper branch
            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)

        # Convert processed images to numpy array
        patches = np.array(processed_images)

        # Pad temporal dimension if needed
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)

        # Convert to channels-first format if needed
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]

        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size

        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])

        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )

        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.

        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format

        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]

        Raises:
            ValueError: For invalid image types or dimensions
        """
        # Per-call arguments override the instance defaults.
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
@@ -14,319 +14,10 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.qwen_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
|
||||
ImageProcessor,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||
|
||||
MIN_PIXELS = 4 * 28 * 28
|
||||
MAX_PIXELS = 16384 * 28 * 28
|
||||
|
||||
|
||||
VideoInput = Union[
|
||||
List["PIL.Image.Image"],
|
||||
"np.ndarray",
|
||||
"paddle.Tensor",
|
||||
List["np.ndarray"],
|
||||
List["paddle.Tensor"],
|
||||
List[List["PIL.Image.Image"]],
|
||||
List[List["np.ndarray"]],
|
||||
List[List["paddle.Tensor"]],
|
||||
]
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||
|
||||
This processor handles image resizing, rescaling, normalization and format conversion.
|
||||
It dynamically adjusts image dimensions based on original size and specified constraints.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: int = 14,
|
||||
merge_size: int = 2,
|
||||
temporal_patch_size: int = 2,
|
||||
min_pixels: int = MIN_PIXELS,
|
||||
max_pixels: int = MAX_PIXELS,
|
||||
image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
|
||||
image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
|
||||
rescale_factor: float = 1 / 255,
|
||||
do_rescale: bool = True,
|
||||
do_normalize: bool = True,
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""
|
||||
Initialize image processor with configuration parameters.
|
||||
|
||||
Args:
|
||||
patch_size (int): Spatial patch size for vision encoder
|
||||
merge_size (int): Merge size between vision and LLM encoders
|
||||
temporal_patch_size (int): Temporal patch size for video processing
|
||||
min_pixels (int): Minimum allowed pixels in resized image
|
||||
max_pixels (int): Maximum allowed pixels in resized image
|
||||
image_mean (float/list): Mean values for normalization per channel
|
||||
image_std (float/list): Std values for normalization per channel
|
||||
rescale_factor (float): Scaling factor for pixel values (default 1/255)
|
||||
do_rescale (bool): Whether to rescale images
|
||||
do_normalize (bool): Whether to normalize images
|
||||
resample: Resampling method for image resizing
|
||||
**kwargs: Additional base class arguments
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.patch_size = patch_size
|
||||
self.merge_size = merge_size
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
|
||||
self.min_pixels = min_pixels
|
||||
self.max_pixels = max_pixels
|
||||
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_rescale = do_rescale
|
||||
self.do_normalize = do_normalize
|
||||
|
||||
self.resample = resample
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
images: Union[ImageInput, VideoInput],
|
||||
min_pixels: int,
|
||||
max_pixels: int,
|
||||
image_mean: Optional[Union[float, List[float]]],
|
||||
image_std: Optional[Union[float, List[float]]],
|
||||
rescale_factor: float,
|
||||
do_rescale: bool,
|
||||
do_normalize: bool,
|
||||
resample: PILImageResampling,
|
||||
data_format: Optional[ChannelDimension],
|
||||
input_data_format: Optional[Union[str, ChannelDimension]],
|
||||
):
|
||||
"""
|
||||
Internal method for image preprocessing pipeline.
|
||||
|
||||
Args:
|
||||
images: Input image or batch of images
|
||||
min_pixels: Minimum allowed pixels in output
|
||||
max_pixels: Maximum allowed pixels in output
|
||||
image_mean: Normalization mean values
|
||||
image_std: Normalization std values
|
||||
rescale_factor: Pixel value scaling factor
|
||||
do_rescale: Whether to rescale pixel values
|
||||
do_normalize: Whether to normalize pixel values
|
||||
resample: Resampling method
|
||||
data_format: Output channel format
|
||||
input_data_format: Input channel format
|
||||
|
||||
Returns:
|
||||
tuple: (flatten_patches, grid_dimensions)
|
||||
- flatten_patches: Flattened image patches
|
||||
- grid_dimensions: Grid dimensions [t, h, w]
|
||||
"""
|
||||
images = make_list_of_images(images)
|
||||
|
||||
# All transformations expect numpy arrays.
|
||||
images = [to_numpy_array(image) for image in images]
|
||||
|
||||
if is_scaled_image(images[0]) and do_rescale:
|
||||
data_processor_logger.warning(
|
||||
"It looks like you are trying to rescale already rescaled images. If the input"
|
||||
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
||||
)
|
||||
if input_data_format is None:
|
||||
# We assume that all images have the same channel dimension format.
|
||||
input_data_format = infer_channel_dimension_format(images[0])
|
||||
|
||||
# Get original dimensions and calculate optimal resize dimensions
|
||||
height, width = get_image_size(images[0], channel_dim=input_data_format)
|
||||
resized_height, resized_width = smart_resize(
|
||||
height,
|
||||
width,
|
||||
factor=self.patch_size * self.merge_size, # Combine patch and merge factors
|
||||
min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
)
|
||||
|
||||
processed_images = []
|
||||
for image in images:
|
||||
if height != resized_height or width != resized_width:
|
||||
# Convert to uint8 before resizing to avoid double scaling
|
||||
image = image.astype("uint8")
|
||||
# Convert to PIL Image and resize
|
||||
image = Image.fromarray(image)
|
||||
image = resize(
|
||||
image,
|
||||
size=(resized_height, resized_width),
|
||||
resample=resample,
|
||||
data_format=input_data_format,
|
||||
)
|
||||
|
||||
if do_rescale and do_normalize:
|
||||
# Adjust mean and std for combined rescale+normalize
|
||||
image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
|
||||
image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
|
||||
do_rescale = False # Skip separate rescale step
|
||||
|
||||
if do_rescale:
|
||||
image = image.astype(np.float32)
|
||||
image = rescale(image, scale=rescale_factor, data_format=input_data_format)
|
||||
|
||||
if do_normalize:
|
||||
image = image.astype(np.float32)
|
||||
image = normalize(
|
||||
image=image,
|
||||
mean=image_mean,
|
||||
std=image_std,
|
||||
data_format=input_data_format,
|
||||
)
|
||||
|
||||
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
|
||||
processed_images.append(image)
|
||||
|
||||
# Convert processed images to numpy array
|
||||
patches = np.array(processed_images)
|
||||
|
||||
# Pad temporal dimension if needed
|
||||
if patches.shape[0] % self.temporal_patch_size != 0:
|
||||
repeats = np.repeat(
|
||||
patches[-1][np.newaxis],
|
||||
self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
|
||||
axis=0,
|
||||
)
|
||||
patches = np.concatenate([patches, repeats], axis=0)
|
||||
|
||||
# Convert to channels-first format if needed
|
||||
if data_format == ChannelDimension.LAST:
|
||||
patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W]
|
||||
|
||||
grid_t, channel = patches.shape[:2]
|
||||
grid_t = grid_t // self.temporal_patch_size
|
||||
|
||||
grid_h, grid_w = (
|
||||
resized_height // self.patch_size,
|
||||
resized_width // self.patch_size,
|
||||
)
|
||||
# Reshape into hierarchical patch structure
|
||||
patches = patches.reshape(
|
||||
[
|
||||
grid_t,
|
||||
self.temporal_patch_size,
|
||||
channel,
|
||||
grid_h // self.merge_size,
|
||||
self.merge_size,
|
||||
self.patch_size,
|
||||
grid_w // self.merge_size,
|
||||
self.merge_size,
|
||||
self.patch_size,
|
||||
]
|
||||
)
|
||||
# Reorder dimensions for better memory access pattern
|
||||
# [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
|
||||
patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
|
||||
|
||||
flatten_patches = patches.reshape(
|
||||
[
|
||||
grid_t * grid_h * grid_w,
|
||||
channel * self.temporal_patch_size * self.patch_size * self.patch_size,
|
||||
]
|
||||
)
|
||||
|
||||
return flatten_patches, np.array([grid_t, grid_h, grid_w])
|
||||
|
||||
def preprocess(
|
||||
self,
|
||||
images: Union[ImageInput, VideoInput],
|
||||
min_pixels: Optional[int] = None,
|
||||
max_pixels: Optional[int] = None,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
rescale_factor: Optional[float] = None,
|
||||
do_rescale: Optional[bool] = None,
|
||||
do_normalize: Optional[bool] = None,
|
||||
resample: Optional[PILImageResampling] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
|
||||
):
|
||||
"""
|
||||
Main preprocessing method for images/videos.
|
||||
|
||||
Args:
|
||||
images: Input image/video data
|
||||
min_pixels: Override for minimum pixels
|
||||
max_pixels: Override for maximum pixels
|
||||
image_mean: Override for normalization mean
|
||||
image_std: Override for normalization std
|
||||
rescale_factor: Override for rescaling factor
|
||||
do_rescale: Override for rescaling flag
|
||||
do_normalize: Override for normalization flag
|
||||
resample: Override for resampling method
|
||||
return_tensors: Desired output tensor format
|
||||
data_format: Output channel dimension format
|
||||
input_data_format: Input channel dimension format
|
||||
|
||||
Returns:
|
||||
BatchFeature: Processed features containing:
|
||||
- pixel_values: Preprocessed pixel data
|
||||
- grid_thw: Grid dimensions [temporal, height, width]
|
||||
|
||||
Raises:
|
||||
ValueError: For invalid image types or dimensions
|
||||
"""
|
||||
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
|
||||
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
|
||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
||||
image_std = image_std if image_std is not None else self.image_std
|
||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
||||
resample = resample if resample is not None else self.resample
|
||||
|
||||
if images is not None and not valid_images(images):
|
||||
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
|
||||
|
||||
pixel_values, grid_thw = self._preprocess(
|
||||
images,
|
||||
min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
rescale_factor=rescale_factor,
|
||||
do_rescale=do_rescale,
|
||||
do_normalize=do_normalize,
|
||||
resample=resample,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
@@ -340,9 +340,7 @@ class TestImagePreprocessorAdaptive(unittest.TestCase):
|
||||
# Create a scaled image (values between 0-1)
|
||||
img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5
|
||||
# Use patch to capture warning
|
||||
with patch(
|
||||
"fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.data_processor_logger"
|
||||
) as mock_logger:
|
||||
with patch("fastdeploy.input.image_processors.adaptive_processor.data_processor_logger") as mock_logger:
|
||||
# Directly call _preprocess, pass scaled image
|
||||
self.processor._preprocess(
|
||||
[img_array], # Pass scaled numpy array
|
||||
@@ -356,9 +354,7 @@ class TestImagePreprocessorAdaptive(unittest.TestCase):
|
||||
"""Test invalid image check in preprocess (line 464)"""
|
||||
# Test invalid image type - need to ensure valid_images returns False
|
||||
# Use patch to make valid_images return False, but make_batched_images succeeds
|
||||
with patch(
|
||||
"fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.valid_images"
|
||||
) as mock_valid:
|
||||
with patch("fastdeploy.input.image_processors.adaptive_processor.valid_images") as mock_valid:
|
||||
mock_valid.return_value = False
|
||||
valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False
|
||||
with self.assertRaises(ValueError) as context:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user