[DataProcessor] Move image_processor to unified directory and add MultiModalProcessor (#7109)

* first commit

* step 9~10

* update multimodal

* update multimodal

* fix load tokenizer

* add unit test

* fix unit test & AdaptiveImageProcessor

* Delete unused code
This commit is contained in:
luukunn
2026-04-08 10:16:27 +08:00
committed by GitHub
parent d693d4be14
commit 8496ec71a6
15 changed files with 3037 additions and 1401 deletions
@@ -14,7 +14,13 @@
# limitations under the License. # limitations under the License.
""" """
from .get_image_preprocessor import get_image_preprocessor # Backward compatibility: this module has been migrated to
from .image_preprocessor_adaptive import AdaptiveImageProcessor # fastdeploy.input.image_processors.adaptive_processor
# This file will be removed in a future version.
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
AdaptiveImageProcessor,
get_image_preprocessor,
)
__all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"] __all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"]
@@ -14,21 +14,10 @@
# limitations under the License. # limitations under the License.
""" """
"""get image preprocessor""" # Backward compatibility: this module has been migrated to
# fastdeploy.input.image_processors.adaptive_processor
# This file will be removed in a future version.
from fastdeploy.utils import data_processor_logger from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
get_image_preprocessor,
from .image_preprocessor_adaptive import AdaptiveImageProcessor )
def get_image_preprocessor(args):
"""
get_image_preprocessor from args
"""
if args.vision_model_name_or_path is None:
return None
data_processor_logger.info("use AdaptiveImageProcessor")
image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path)
return image_preprocess
@@ -14,498 +14,12 @@
# limitations under the License. # limitations under the License.
""" """
"""image preprocessor adaptive""" # Backward compatibility: this module has been migrated to
# fastdeploy.input.image_processors.adaptive_processor
# This file will be removed in a future version.
from typing import List, Optional, Union from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
AdaptiveImageProcessor,
import numpy as np make_batched_images,
import paddle make_batched_videos,
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
convert_to_rgb,
normalize,
rescale,
resize,
to_channel_dimension_format,
) )
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_valid_image,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200
# Accepted video input layouts: a list of PIL frames, a 4-D array/tensor,
# lists of arrays/tensors, or batches (lists of lists) of any of these.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    # Fix: this forward reference was misspelled "np.ndarrray" (three r's),
    # which would never resolve if the annotation were evaluated.
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]
__all__ = [
"AdaptiveImageProcessor",
]
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Normalize image input into a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            A single image, a flat list of images, or a nested list of images.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If the input cannot be interpreted as batched images.
    """
    is_sequence = isinstance(images, (list, tuple))
    if is_sequence and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        # Nested list of images: flatten one level.
        flattened = []
        for inner in images:
            flattened.extend(inner)
        return flattened
    if is_sequence and is_valid_image(images[0]):
        # Already a flat list of images.
        return images
    if is_valid_image(images):
        # A single image: wrap it in a one-element list.
        return [images]
    raise ValueError(f"Could not make batched images from {images}")
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Normalize video input into a list of videos (each a list of frames).

    Accepts an already-batched nested list, a flat list of PIL frames or 4-D
    arrays, or a single 4-D array.

    Raises:
        ValueError: If the input cannot be interpreted as batched videos.
    """
    if isinstance(videos, (list, tuple)):
        if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
            # Already a list of videos, each a list of frames.
            return videos
        if is_valid_image(videos[0]):
            if isinstance(videos[0], Image.Image):
                # A flat list of PIL frames: treat it as a single video.
                return [videos]
            if len(videos[0].shape) == 4:
                # A list of 4-D arrays: each array is one video.
                return [list(video) for video in videos]
        # Note: a flat list whose first element is a valid non-PIL image with a
        # non-4-D shape falls through to the error, as in the original.
    elif is_valid_image(videos) and len(videos.shape) == 4:
        # A single 4-D array: one video.
        return [list(videos)]
    raise ValueError(f"Could not make batched video from {videos}")
class AdaptiveImageProcessor(BaseImageProcessor):
    r"""
    Constructs a adaptive image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
            in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `56 * 56`):
            The min pixels of the image to resize the image.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The max pixels of the image to resize the image.
        patch_size (`int`, *optional*, defaults to 14):
            The spatial patch size of the vision encoder.
        temporal_conv_size (`int`, *optional*, defaults to 2):
            The temporal conv size in resampler.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    """

    # Keys this processor produces in its BatchFeature output.
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 56 * 56,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_conv_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Store resize/rescale/normalize settings; falls back to CLIP mean/std when unset."""
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        # Default to OpenAI CLIP statistics when no explicit mean/std is given.
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_conv_size = temporal_conv_size
        self.merge_size = merge_size
        # `size` mirrors the pixel bounds and is kept in sync by set_pixels().
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb

    def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
        """
        Update the min/max pixel budget used by smart resizing.

        Args:
            min_pixels (int, optional): New lower bound on resized image area.
            max_pixels (int, optional): New upper bound on resized image area.
            msg (str): Prefix for the log line, identifying the caller.
        """
        if min_pixels is not None:
            # NOTE(review): the check permits 0 while the message says "positive" — confirm intent.
            assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
            self.min_pixels = min_pixels
            self.size["min_pixels"] = int(min_pixels)
        if max_pixels is not None:
            assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
            self.max_pixels = max_pixels
            self.size["max_pixels"] = int(max_pixels)

    def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
        """
        Compute the smart-resized (height, width) plus the resulting patch grid.

        Returns:
            tuple: ``((resized_height, resized_width), (grid_h, grid_w))`` where
            the grid dimensions are the resized size divided by ``patch_size``.
        """
        actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,
            min_pixels=actual_min_pixels,
            max_pixels=actual_max_pixels,
        )
        return (resized_height, resized_width), (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = True,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
                If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`List[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            predetermined_grid_thw (optional):
                When given, one (grid_h, grid_w) entry per image; overrides smart resizing.

        Returns:
            tuple: ``(flatten_patches, (grid_t, grid_h, grid_w))``.
        """
        images = make_list_of_images(images)
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]
        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []
        if predetermined_grid_thw is not None:
            assert len(predetermined_grid_thw) == len(
                images
            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
        for img_idx, image in enumerate(images):
            if do_resize:
                if predetermined_grid_thw is not None:
                    # Caller fixed the patch grid; convert it back to pixels.
                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
                    resized_height *= self.patch_size
                    resized_width *= self.patch_size
                else:
                    resized_height, resized_width = smart_resize(
                        height,
                        width,
                        factor=self.patch_size * self.merge_size,
                        min_pixels=self.min_pixels,
                        max_pixels=self.max_pixels,
                    )
                # TODO: the cast must be applied manually, otherwise the values
                # get divided by 255 one extra time and the result is wrong.
                image = image.astype("uint8")
                # Build the PIL image directly with fromarray rather than
                # relying on the conversion inside paddleformers.
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale:
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
            if do_normalize:
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)
        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])
        channel = patches.shape[1]  # [time, C, H, W]
        grid_t = patches.shape[0]
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Split each image into merge_size x merge_size groups of patch_size patches.
        patches = patches.reshape(
            [
                grid_t,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.patch_size * self.patch_size,
            ]
        )  # [grid_t * grid_h * grid_w, C * psz * psz]
        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = True,
        size: Optional[Union[int, List[int]]] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Preprocess images and/or videos into flattened patches plus grid metadata.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        # Resolve each option against the instance defaults.
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        if images is not None:
            images = make_batched_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)
        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for img_idx, image in enumerate(images):
                if predetermined_grid_thw is not None:
                    # _preprocess expects one grid entry per image in its list.
                    predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
                else:
                    predetermined_grid_thw_one = None
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw_one,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data = {
                "pixel_values": pixel_values,
                "image_grid_thw": vision_grid_thws,
            }
        # NOTE(review): when both images and videos are passed, this second
        # assignment replaces the image outputs in `data` — confirm intended.
        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data = {
                "pixel_values_videos": pixel_values,
                "video_grid_thw": vision_grid_thws,
            }
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -11,3 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
AdaptiveImageProcessor,
get_image_preprocessor,
)
from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401
ImageProcessor as PaddleOCRImageProcessor,
)
from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
ImageProcessor as Qwen3ImageProcessor,
)
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
ImageProcessor as QwenImageProcessor,
)
@@ -0,0 +1,524 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""image preprocessor adaptive"""
from typing import List, Optional, Union
import numpy as np
import paddle
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
convert_to_rgb,
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_valid_image,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
from fastdeploy.utils import data_processor_logger
# Per-channel normalization statistics from OpenAI CLIP preprocessing.
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
# Pixel-budget constants; 28 = patch_size (14) * merge_size (2) — TODO confirm.
IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200
# Accepted video input layouts: frame lists, 4-D arrays/tensors, or batches thereof.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]
# Public API of this module.
__all__ = [
    "AdaptiveImageProcessor",
    "get_image_preprocessor",
    "make_batched_images",
    "make_batched_videos",
]
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Normalize image input into a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            A single image, a flat list of images, or a nested list of images.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If the input cannot be interpreted as batched images.
    """
    is_sequence = isinstance(images, (list, tuple))
    if is_sequence and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        # Nested list of images: flatten one level.
        flattened = []
        for inner in images:
            flattened.extend(inner)
        return flattened
    if is_sequence and is_valid_image(images[0]):
        # Already a flat list of images.
        return images
    if is_valid_image(images):
        # A single image: wrap it in a one-element list.
        return [images]
    raise ValueError(f"Could not make batched images from {images}")
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Normalize video input into a list of videos (each a list of frames).

    Accepts an already-batched nested list, a flat list of PIL frames or 4-D
    arrays, or a single 4-D array.

    Raises:
        ValueError: If the input cannot be interpreted as batched videos.
    """
    if isinstance(videos, (list, tuple)):
        if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
            # Already a list of videos, each a list of frames.
            return videos
        if is_valid_image(videos[0]):
            if isinstance(videos[0], Image.Image):
                # A flat list of PIL frames: treat it as a single video.
                return [videos]
            if len(videos[0].shape) == 4:
                # A list of 4-D arrays: each array is one video.
                return [list(video) for video in videos]
        # Note: a flat list whose first element is a valid non-PIL image with a
        # non-4-D shape falls through to the error, as in the original.
    elif is_valid_image(videos) and len(videos.shape) == 4:
        # A single 4-D array: one video.
        return [list(videos)]
    raise ValueError(f"Could not make batched video from {videos}")
class AdaptiveImageProcessor(BaseImageProcessor):
r"""
Constructs a adaptive image processor that dynamically resizes images based on the original images.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
in the image.
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
min_pixels (`int`, *optional*, defaults to `56 * 56`):
The min pixels of the image to resize the image.
max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
The max pixels of the image to resize the image.
patch_size (`int`, *optional*, defaults to 14):
The spacial patch size of the vision encoder.
temporal_conv_size (`int`, *optional*, defaults to 2):
The temporal conv size in resampler.
merge_size (`int`, *optional*, defaults to 2):
The merge size of the vision encoder to llm encoder.
"""
model_input_names = [
"pixel_values",
"image_grid_thw",
"pixel_values_videos",
"video_grid_thw",
]
def __init__(
self,
do_resize: bool = True,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: float = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = True,
min_pixels: int = 56 * 56,
max_pixels: int = 28 * 28 * 1280,
patch_size: int = 14,
temporal_conv_size: int = 2,
merge_size: int = 2,
**kwargs,
) -> None:
"""init"""
super().__init__(**kwargs)
self.do_resize = do_resize
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.patch_size = patch_size
self.temporal_conv_size = temporal_conv_size
self.merge_size = merge_size
self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
self.do_convert_rgb = do_convert_rgb
def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
"""设定pixels"""
if min_pixels is not None:
assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
self.min_pixels = min_pixels
self.size["min_pixels"] = int(min_pixels)
if max_pixels is not None:
assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
self.max_pixels = max_pixels
self.size["max_pixels"] = int(max_pixels)
def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
"""dummy"""
actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
resized_height, resized_width = smart_resize(
height,
width,
factor=self.patch_size * self.merge_size,
min_pixels=actual_min_pixels,
max_pixels=actual_max_pixels,
)
return (resized_height, resized_width), (
resized_height // self.patch_size,
resized_width // self.patch_size,
)
    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = True,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
                If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`List[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            predetermined_grid_thw (*optional*):
                Per-image `(grid_h, grid_w)` overrides. When given, the resize
                target is `grid * patch_size` instead of the `smart_resize` result.

        Returns:
            tuple: `(flatten_patches, (grid_t, grid_h, grid_w))` — patches flattened
            to `[grid_t * grid_h * grid_w, C * patch_size * patch_size]` plus the
            temporal/spatial grid dimensions.
        """
        images = make_list_of_images(images)
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]
        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []
        if predetermined_grid_thw is not None:
            # One (grid_h, grid_w) override must be provided per image/frame.
            assert len(predetermined_grid_thw) == len(
                images
            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
        for img_idx, image in enumerate(images):
            if do_resize:
                if predetermined_grid_thw is not None:
                    # Caller fixed the patch grid; the pixel target is grid * patch_size.
                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
                    resized_height *= self.patch_size
                    resized_width *= self.patch_size
                else:
                    resized_height, resized_width = smart_resize(
                        height,
                        width,
                        factor=self.patch_size * self.merge_size,
                        min_pixels=self.min_pixels,
                        max_pixels=self.max_pixels,
                    )
                # TODO: cast to uint8 manually here; otherwise the data is divided
                # by 255 twice and the result is wrong.
                image = image.astype("uint8")
                # Use PIL.Image.fromarray directly rather than the paddleformers helper.
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale:
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
            if do_normalize:
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)
        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            # Bring to channels-first so the reshape below sees [time, C, H, W].
            patches = patches.transpose([0, 3, 1, 2])
        channel = patches.shape[1]  # [time, C, H, W]
        grid_t = patches.shape[0]
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Split H and W into (blocks-of-merge_size, merge_size, patch_size).
        patches = patches.reshape(
            [
                grid_t,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.patch_size * self.patch_size,
            ]
        )  # [grid_t * grid_h * grid_w, C * psz * psz]
        return flatten_patches, (grid_t, grid_h, grid_w)
    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = True,
        size: Optional[Union[int, List[int]]] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            predetermined_grid_thw (*optional*):
                Per-image `(grid_h, grid_w)` overrides forwarded to `_preprocess`.

        Returns:
            `BatchFeature` with `pixel_values`/`image_grid_thw` for images and/or
            `pixel_values_videos`/`video_grid_thw` for videos.
        """
        # NOTE: the `is not None` fallbacks below are only effective when a
        # caller explicitly passes None; the signature defaults are non-None.
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        if images is not None:
            images = make_batched_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)
        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
        data = {}
        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for img_idx, image in enumerate(images):
                # Each image is processed independently with its own grid override.
                if predetermined_grid_thw is not None:
                    predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
                else:
                    predetermined_grid_thw_one = None
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw_one,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data["pixel_values"] = pixel_values
            data["image_grid_thw"] = vision_grid_thws
        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            # Each video is a list of frames; frames share one grid result.
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data["pixel_values_videos"] = pixel_values
            data["video_grid_thw"] = vision_grid_thws
        return BatchFeature(data=data, tensor_type=return_tensors)
def get_image_preprocessor(args):
    """Build an image preprocessor from parsed arguments.

    Returns ``None`` when ``args.vision_model_name_or_path`` is unset;
    otherwise loads an ``AdaptiveImageProcessor`` from that path.
    """
    model_path = args.vision_model_name_or_path
    if model_path is None:
        return None
    data_processor_logger.info("use AdaptiveImageProcessor")
    return AdaptiveImageProcessor.from_pretrained(model_path)
@@ -0,0 +1,225 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""Image processor class for PaddleOCR-VL."""
import json
from pathlib import Path
from typing import Dict, List, Optional, Union
import numpy as np
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_utils import (
ImageInput,
is_valid_image,
make_list_of_images,
to_numpy_array,
)
from fastdeploy.input.image_processors.common import (
smart_resize_paddleocr as smart_resize,
)
# Default per-channel normalization statistics (OpenAI CLIP values).
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.
    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.
    Returns:
        List[ImageInput]: A flat list of images.
    Raises:
        ValueError: If `images` is empty or not a recognized image container.
    """
    # An empty list/tuple previously crashed with IndexError on images[0];
    # reject it explicitly with the same ValueError as other bad inputs.
    if isinstance(images, (list, tuple)) and not images:
        raise ValueError(f"Could not make batched images from {images}")
    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        # Nested list of images: flatten one level.
        return [img for img_list in images for img in img_list]
    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
        # Already a flat list of images.
        return images
    elif is_valid_image(images):
        # Single image: wrap in a list.
        return [images]
    raise ValueError(f"Could not make batched images from {images}")
def adjust_size(size, patch_size):
    """Round *size* down to an even number of whole patches of *patch_size*."""
    # Drop any trailing partial patch, then drop one more patch when the
    # count is odd, so the result is always an even multiple of patch_size.
    even_patch_count = (size // patch_size) // 2 * 2
    return even_patch_count * patch_size
class ImageProcessor(BaseImageProcessor):
    """Image processor for PaddleOCR-VL.

    Resizes images adaptively within [min_pixels, max_pixels], normalizes them,
    and flattens them into per-patch tensors plus a (t, h, w) grid descriptor.
    """
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]
    def __init__(
        self,
        do_resize: bool = True,
        resample: int = 3,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Initialize processor configuration.

        Args:
            do_resize: Whether to resize images within the pixel bounds.
            resample: PIL resampling filter code (3 == BICUBIC).
            do_rescale: Whether to multiply pixel values by `rescale_factor`.
            rescale_factor: Scaling factor applied when rescaling (default 1/255).
            do_normalize: Whether to subtract `image_mean` and divide by `image_std`.
            image_mean: Per-channel mean; defaults to the OpenAI CLIP mean.
            image_std: Per-channel std; defaults to the OpenAI CLIP std.
            do_convert_rgb: Whether to convert inputs to RGB first.
            min_pixels: Minimum total pixels after resizing.
            max_pixels: Maximum total pixels after resizing.
            patch_size: Spatial patch size.
            temporal_patch_size: Temporal patch size (only 1 is supported; see
                the assert in `_preprocess`).
            merge_size: Spatial merge factor between vision and LLM encoders.
            **kwargs: Accepted for config compatibility.

        NOTE(review): `**kwargs` is accepted but not forwarded to
        `super().__init__()`; unknown config keys are silently dropped.
        """
        super().__init__()
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
        self.do_convert_rgb = do_convert_rgb
    @classmethod
    def from_pretrained(cls, pretrained_model_dir):
        """Build a processor from `<model_dir>/preprocessor_config.json`."""
        pretrained_model_dir = Path(pretrained_model_dir)
        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
        with open(image_processor_config_path, "r", encoding="utf-8") as f:
            image_processor_config = json.load(f)
        return cls(**image_processor_config)
    def _preprocess(
        self,
        images,
        do_resize: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
    ):
        """Resize/rescale/normalize images and flatten into patches.

        Returns a tuple `(flatten_patches, grid_thw)` where `flatten_patches`
        has shape [grid_t * grid_h * grid_w, C, patch_size, patch_size] and
        `grid_thw` is `np.array([grid_t, grid_h, grid_w])`.

        NOTE(review): inputs are accessed via `.size` / `.convert` / `.resize`,
        so PIL images are expected here — confirm against callers.
        """
        images = make_list_of_images(images)
        if do_convert_rgb:
            images = [image.convert("RGB") for image in images]
        # PIL reports (width, height); all images are assumed to share one size.
        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                # PIL resize takes (width, height) order.
                image = image.resize((resized_width, resized_height), resample=self.resample)
            image = to_numpy_array(image)
            if do_rescale:
                image = (image * rescale_factor).astype(np.float32)
            if do_normalize:
                image = image.astype(np.float32)
                image -= np.array(image_mean, dtype=np.float32)
                image /= np.array(image_std, dtype=np.float32)
            processed_images.append(image)
        patches = np.array(processed_images)
        # [N, H, W, C] -> [N, C, H, W]
        patches = patches.transpose(0, 3, 1, 2)
        if patches.shape[0] == 1:
            # Repeat the single frame to fill the temporal dimension.
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Split spatial dims into (grid, patch_size) pairs.
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        # -> [grid_t, grid_h, grid_w, C, temporal, psz, psz]
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        # The final reshape below is only valid when temporal_patch_size == 1.
        assert self.temporal_patch_size == 1
        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
        return flatten_patches, np.array([grid_t, grid_h, grid_w])
    def preprocess(
        self,
        images,
        videos=None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors=None,
    ):
        """Preprocess images into a `BatchFeature` with keys
        `pixel_values` and `grid_thw`. Explicit arguments override the
        processor-level defaults; videos are not supported.

        Raises:
            NotImplementedError: If `videos` is not None.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        if videos is not None:
            raise NotImplementedError("Videos are not yet supported")
        patches, image_grid_thw = self._preprocess(
            images,
            do_resize=do_resize,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_convert_rgb=do_convert_rgb,
        )
        pixel_values = np.array(patches)
        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -0,0 +1,333 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import List, Optional, Union
import numpy as np
import paddle
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
# Default per-channel normalization statistics (zero-centered, 0.5 mean/std).
IMAGE_MEAN = [0.5, 0.5, 0.5]
IMAGE_STD = [0.5, 0.5, 0.5]
# Bounds on total pixel count after adaptive resizing (256^2 .. 4096^2).
MIN_PIXELS = 65536
MAX_PIXELS = 16777216
# Accepted video containers: a sequence (or batch of sequences) of PIL
# images, numpy arrays, or paddle tensors.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.
    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints.
    """
    def __init__(
        self,
        patch_size: int = 16,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = IMAGE_MEAN,
        image_std: Union[float, List[float]] = IMAGE_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.
        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.resample = resample
    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for image preprocessing pipeline.
        Args:
            images: Input image or batch of images
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format
        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches
                - grid_dimensions: Grid dimensions [t, h, w]
        """
        images = make_list_of_images(images)
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]
        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])
        # Get original dimensions and calculate optimal resize dimensions
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )
        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale and do_normalize:
                # Adjust mean and std for combined rescale+normalize:
                # (x - mean/rf) / (std/rf) == (x*rf - mean) / std, so one
                # normalize pass does the work of both steps.
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step
                # mutual exclusion and upper branch
            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)
        # Convert processed images to numpy array
        patches = np.array(processed_images)
        # Pad temporal dimension if needed
        if patches.shape[0] % self.temporal_patch_size != 0:
            # Repeat the last frame until frame count is a multiple of
            # temporal_patch_size.
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)
        # Convert to channels-first format if needed
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]
        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )
        return flatten_patches, np.array([grid_t, grid_h, grid_w])
    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.
        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format
        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]
        Raises:
            ValueError: For invalid image types or dimensions
        """
        # Explicit arguments override the processor-level defaults.
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample
        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -0,0 +1,332 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import List, Optional, Union
import numpy as np
import paddle
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
# OpenAI CLIP per-channel normalization statistics.
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
# Pixel-count bounds expressed in 28x28 vision-token units.
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
# Accepted video containers: a sequence (or batch of sequences) of PIL
# images, numpy arrays, or paddle tensors.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.

    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints,
    then emits flattened spatio-temporal patches plus the [t, h, w] patch-grid dimensions.
    """

    def __init__(
        self,
        patch_size: int = 14,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
        image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.

        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.resample = resample

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for image preprocessing pipeline.

        Args:
            images: Input image or batch of images
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format

        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches
                - grid_dimensions: Grid dimensions [t, h, w]
        """
        images = make_list_of_images(images)
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]
        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])
        # Get original dimensions and calculate optimal resize dimensions.
        # NOTE: the target size is derived from the FIRST image only; every
        # image in the batch is resized to the same (resized_height, resized_width).
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )
        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale and do_normalize:
                # Fold the rescale step into normalization: (x - 255*mean)/(255*std)
                # equals (x/255 - mean)/std. This branch runs once (do_rescale is
                # cleared below), so mean/std are only scaled a single time.
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step
            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)
        # Convert processed images to numpy array
        patches = np.array(processed_images)
        # Pad temporal dimension if needed: repeat the last frame until the
        # frame count is divisible by temporal_patch_size.
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)
        # Convert to channels-first format if needed
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]
        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )
        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.

        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format

        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]

        Raises:
            ValueError: For invalid image types or dimensions
        """
        # Fall back to the instance-level defaults for any override left as None.
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample
        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
+453
View File
@@ -0,0 +1,453 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""Unified multimodal processor for all VL model types.
Consolidates the four separate VL processor wrappers (QwenVLProcessor,
Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a
single class that dispatches per ``model_type``.
"""
from collections.abc import Mapping
from typing import Any, Dict, Optional
import numpy as np
from fastdeploy.input.base_processor import BaseTextProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
from fastdeploy.utils import data_processor_logger
# Canonical model-type identifiers that MultiModalProcessor dispatches on.
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"
_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL}
# Expected types for user-supplied ``mm_processor_kwargs``; keys not listed
# here are passed through unvalidated.
_QWEN_EXPECTED_KWARGS = {
    "video_max_frames": int,
    "video_min_frames": int,
}
_ERNIE_EXPECTED_KWARGS = {
    "spatial_conv_size": int,
    "temporal_conv_size": int,
    "image_min_pixels": int,
    "image_max_pixels": int,
    "video_min_pixels": int,
    "video_max_pixels": int,
    "video_target_frames": int,
    "video_frames_sample": str,
    "video_max_frames": int,
    "video_min_frames": int,
    "video_fps": int,
}
# Default per-prompt limits on multimodal items, merged with user overrides.
_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}
# top_p values below this threshold are clamped (greedy-like sampling guard).
_SAMPLING_EPS = 1e-5
class MultiModalProcessor(BaseTextProcessor):
    """Unified multimodal processor for all supported VL model types.

    Dispatches image-processor creation, config initialisation, and
    encoding logic based on ``model_type``.
    """

    def __init__(
        self,
        model_name_or_path: str,
        model_type: str,
        config=None,
        limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
        reasoning_parser_obj=None,
        tool_parser_obj=None,
        enable_processor_cache: bool = False,
    ):
        """Build the tokenizer, internal DataProcessor, and mm config.

        Raises:
            ValueError: If ``model_type`` is not one of the supported types.
        """
        if model_type not in _SUPPORTED_MODEL_TYPES:
            raise ValueError(
                f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}."
            )
        self.model_type = model_type
        self.config = config
        self.enable_processor_cache = enable_processor_cache
        # ERNIE uses its own tokenizer implementation; everything else goes
        # through AutoTokenizer (see _load_tokenizer).
        tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto"
        super().__init__(
            model_name_or_path,
            tokenizer_type=tokenizer_type,
            reasoning_parser_obj=reasoning_parser_obj,
            tool_parser_obj=tool_parser_obj,
        )
        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
        processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
        self._init_mm_processor(processor_kwargs)
        self._init_mm_config()
        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)

    def _load_tokenizer(self):
        """Load the appropriate tokenizer based on model_type."""
        if self.tokenizer_type == "ernie4_5":
            import os

            from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer

            # The vocab file name varies between ERNIE checkpoints; probe for
            # the first one that exists and point the tokenizer class at it.
            vocab_file_names = ["tokenizer.model", "spm.model", "ernie_token_100k.model"]
            for name in vocab_file_names:
                if os.path.exists(os.path.join(self.model_name_or_path, name)):
                    Ernie4_5Tokenizer.resource_files_names["vocab_file"] = name
                    break
            tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
        else:
            from paddleformers.transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
        return tokenizer

    def _init_mm_processor(self, processor_kwargs: dict):
        """Create the model-type-specific internal DataProcessor."""
        if self.model_type == QWEN_VL:
            from fastdeploy.input.qwen_vl_processor.process import DataProcessor

            # Default tokens_per_second of 2 when vision_config is absent.
            tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
            self.processor = DataProcessor(
                model_path=self.model_name_or_path,
                enable_processor_cache=self.enable_processor_cache,
                tokens_per_second=tokens_per_second,
                tokenizer=self.tokenizer,
                **processor_kwargs,
            )
        elif self.model_type == QWEN3_VL:
            from fastdeploy.input.qwen3_vl_processor.process import DataProcessor

            self.processor = DataProcessor(
                model_path=self.model_name_or_path,
                enable_processor_cache=self.enable_processor_cache,
                tokenizer=self.tokenizer,
                **processor_kwargs,
            )
        elif self.model_type == PADDLEOCR_VL:
            from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor

            tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
            self.processor = DataProcessor(
                model_path=self.model_name_or_path,
                enable_processor_cache=self.enable_processor_cache,
                tokens_per_second=tokens_per_second,
                tokenizer=self.tokenizer,
                **processor_kwargs,
            )
        elif self.model_type == ERNIE4_5_VL:
            from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor

            self.processor = DataProcessor(
                tokenizer_name=self.model_name_or_path,
                image_preprocessor_name=self.model_name_or_path,
                enable_processor_cache=self.enable_processor_cache,
                **processor_kwargs,
            )
        self.processor.eval()

    def _init_mm_config(self):
        """Set model-type-specific multimodal configuration attributes."""
        if self.model_type in (QWEN_VL, QWEN3_VL):
            self.image_patch_id = self.processor.image_token_id
        elif self.model_type == PADDLEOCR_VL:
            self.image_patch_id = self.processor.image_patch_id
        elif self.model_type == ERNIE4_5_VL:
            self.image_patch_id = self.processor.image_patch_id
            self.spatial_conv_size = self.processor.spatial_conv_size

    def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
        """Parse and validate multimodal processor kwargs.

        Invalid input is logged and replaced with ``{}`` rather than raised,
        so a bad CLI/config value does not abort processor construction.
        """
        if not kwargs:
            return {}
        try:
            if not isinstance(kwargs, dict):
                raise ValueError("mm-processor-kwargs must be a dictionary")
            data_processor_logger.info(f"Processing kwargs: {kwargs}")
            if self.model_type == ERNIE4_5_VL:
                expected_types = _ERNIE_EXPECTED_KWARGS
            else:
                expected_types = _QWEN_EXPECTED_KWARGS
            for key, value in kwargs.items():
                if key in expected_types and not isinstance(value, expected_types[key]):
                    raise ValueError(
                        f"Invalid type for {key}: expected "
                        f"{expected_types[key].__name__}, got {type(value).__name__}"
                    )
            return kwargs
        except Exception as e:
            data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
            return {}

    def _parse_limits(self, limits: Optional[dict]) -> dict:
        """Parse multimodal input limits, merging with defaults."""
        if not limits:
            return dict(_DEFAULT_MM_LIMITS)
        try:
            if not isinstance(limits, dict):
                raise ValueError("limit-mm-per-prompt must be a dictionary")
            data_processor_logger.info(f"_parse_limits:{limits}")
            # User-provided limits override defaults per modality.
            return {**_DEFAULT_MM_LIMITS, **limits}
        except Exception as e:
            data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
            return dict(_DEFAULT_MM_LIMITS)

    def _check_mm_limits(self, item):
        """Validate multimodal inputs against configured limits.

        ``item`` is either a multimodal-data dict (modality -> items) or a
        chat ``messages`` list, from which image/video parts are counted.

        Raises:
            ValueError: If any modality exceeds its configured limit.
        """
        if isinstance(item, dict):
            mm_data = item
        else:
            mm_data = {"image": [], "video": []}
            for message in item:
                if isinstance(message.get("content"), list):
                    for part in message["content"]:
                        part_type = part.get("type")
                        if part_type in ("image_url", "image"):
                            mm_data["image"].append(part)
                        elif part_type in ("video_url", "video"):
                            mm_data["video"].append(part)
        for modality, data in mm_data.items():
            if modality in self.limit_mm_per_prompt:
                limit = self.limit_mm_per_prompt[modality]
                if len(data) > limit:
                    raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
        """Return per-modality max token counts, if available."""
        # Only the ERNIE processor exposes this; other model types return None.
        if self.model_type == ERNIE4_5_VL:
            return self.processor.get_mm_max_tokens_per_item(seq_len)
        return None

    def process_request_dict(self, request, max_model_len=None):
        """Process a request dictionary into model inputs.

        Unified template-method flow for all VL model types. Per-model
        differences are handled by small conditional branches rather than
        duplicating the entire pipeline.
        """
        request = self._apply_default_parameters(request)
        if not request.get("eos_token_ids"):
            request["eos_token_ids"] = self.eos_token_ids
        self._process_stop_tokens(request)
        # PaddleOCR-VL historically does not process bad_words.
        if self.model_type != PADDLEOCR_VL:
            self._process_bad_words(request)
        if self.model_type == ERNIE4_5_VL:
            logits_processors_args = self._prepare_think_stop_sentence(
                request.get("logits_processors_args") or {}, max_model_len
            )
            request["logits_processors_args"] = logits_processors_args
        outputs = self._tokenize_request(request)
        self._process_post_tokens(request, outputs)
        if self.model_type in (QWEN_VL, QWEN3_VL):
            request["enable_thinking"] = False
        outputs = self.pack_outputs(outputs)
        # qwen3_vl / ernie4_5_vl may arrive with prompt_token_ids already set;
        # preserve them instead of overwriting with the tokenized output.
        if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"):
            pass  # preserve existing prompt_token_ids
        else:
            request["prompt_token_ids"] = outputs["input_ids"].tolist()
        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
        request["multimodal_inputs"] = outputs
        if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
            # Truncate, leaving at least one slot for generation.
            request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
        if self.model_type == ERNIE4_5_VL:
            logits_processors_args = self._update_thinking_prompt_state(
                request["prompt_token_ids"], request.get("logits_processors_args") or {}
            )
            request["logits_processors_args"] = logits_processors_args
        # Cap max_tokens to the remaining context budget.
        max_tokens = max_model_len - len(request["prompt_token_ids"])
        if request.get("max_tokens") is None:
            request["max_tokens"] = max(1, max_tokens)
        else:
            request["max_tokens"] = min(max_tokens, request["max_tokens"])
        if self.model_type == ERNIE4_5_VL and request.get("reasoning_max_tokens") is None:
            # Default reasoning budget: 80% of max_tokens, at least 1.
            request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
        if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL):
            # Clamp near-zero top_p and force greedy-like top_k.
            if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
                request["top_p"] = _SAMPLING_EPS
                request["top_k"] = 1
        if self.model_type != QWEN3_VL and self.reasoning_parser:
            self._apply_reasoning_parser(request)
        if self.model_type == ERNIE4_5_VL:
            if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
                request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
        data_processor_logger.info(f"Processed request {request}")
        return request

    def _process_stop_tokens(self, request):
        """Handle stop token processing based on model type."""
        if self.model_type == QWEN3_VL:
            stop_sequences = request.get("stop", [])
            if stop_sequences:
                stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
                request["stop_token_ids"] = stop_seqs
                request["stop_seqs_len"] = stop_seqs_len
        else:
            process_stop_token_ids(request, self.update_stop_seq)

    def _process_bad_words(self, request):
        """Process bad_words into token ids."""
        bad_words = request.get("bad_words")
        bad_words_token_ids = request.get("bad_words_token_ids")
        if bad_words:
            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
            request["bad_words_token_ids"] = bad_words_token_ids

    def _tokenize_request(self, request):
        """Core tokenization dispatch: prompt_token_ids > prompt > messages.

        Raises:
            ValueError: If the request contains none of the accepted inputs
                or has malformed ``chat_template_kwargs``.
        """
        # ERNIE defaults to thinking mode; the Qwen family defaults off.
        default_thinking = True if self.model_type == ERNIE4_5_VL else False
        if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL):
            messages = request.get("messages")
            if messages:
                self._check_mm_limits(messages)
            request.setdefault("enable_thinking", default_thinking)
            return self.processor.prompt_token_ids2outputs(request)
        elif request.get("prompt"):
            multimodal_data = request.get("multimodal_data") or {}
            self._check_mm_limits(multimodal_data)
            images = multimodal_data.get("image", None)
            videos = multimodal_data.get("video", None)
            if self.model_type == ERNIE4_5_VL:
                request["prompt_tokens"] = request.get("prompt")
            request.setdefault("enable_thinking", default_thinking)
            return self.processor.text2ids(request["prompt"], images, videos)
        elif request.get("messages"):
            messages = request["messages"]
            self._check_mm_limits(messages)
            chat_template_kwargs = request.get("chat_template_kwargs")
            if chat_template_kwargs:
                if isinstance(chat_template_kwargs, dict):
                    # Merge template kwargs into the request without
                    # overwriting values the caller already set.
                    for k, v in chat_template_kwargs.items():
                        if k not in request or request[k] is None:
                            request[k] = v
                else:
                    raise ValueError("Invalid input: chat_template_kwargs must be a dict")
            request.setdefault("enable_thinking", default_thinking)
            return self.processor.request2ids(request)
        else:
            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")

    def _process_post_tokens(self, request, outputs):
        """Handle post-tokenization token appending."""
        if self.model_type == PADDLEOCR_VL:
            # PaddleOCR-VL carries resumed tokens under metadata.
            metadata = request.get("metadata")
            if metadata and metadata.get("generated_token_ids"):
                self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"])
        else:
            if request.get("completion_token_ids"):
                self.append_completion_tokens(outputs, request["completion_token_ids"])

    def _apply_reasoning_parser(self, request):
        """Apply reasoning parser and update model status dict."""
        model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
        # request_id may encode "<real_id>_<index>" for n-sampled requests;
        # fan the status out to each of the n sub-requests.
        parts = request["request_id"].split("_")
        if len(parts) > 1:
            real_req_id = parts[0]
            index = int(parts[1])
            n = request.get("n", 1)
            for idx in range(index * n, (index + 1) * n):
                self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
        else:
            self.model_status_dict[request["request_id"]] = model_status
        request["enable_thinking"] = model_status == "think_start"

    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
        """Append completion tokens to existing multimodal outputs."""
        if self.model_type == ERNIE4_5_VL:
            self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids)
        else:
            self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids)

    def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids):
        """Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl."""
        num_tokens = len(completion_token_ids)
        multimodal_inputs["input_ids"].extend(completion_token_ids)
        multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
        pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
        multimodal_inputs["position_ids"].append(pos_ids)
        multimodal_inputs["cur_position"] += num_tokens

    def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids):
        """Append completion tokens for ernie4_5_vl."""
        num_tokens = len(completion_token_ids)
        multimodal_inputs["input_ids"].extend(completion_token_ids)
        multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
        # ERNIE uses 3-D rotary positions; text tokens repeat the same index
        # across all three axes.
        start = multimodal_inputs["cur_position"]
        for i in range(num_tokens):
            multimodal_inputs["position_ids"].append([start + i] * 3)
        multimodal_inputs["cur_position"] += num_tokens

    def pack_outputs(self, outputs):
        """Convert intermediate processing outputs to final format."""
        if not outputs["images"]:
            # Text-only request: null out all image fields.
            outputs["images"] = None
            outputs["grid_thw"] = None
            outputs["image_type_ids"] = None
        else:
            outputs["images"] = np.vstack(outputs["images"])
            outputs["grid_thw"] = np.vstack(outputs["grid_thw"])
            outputs["image_type_ids"] = np.array(outputs["image_type_ids"])
        outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
        outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
        outputs["mm_num_token_func"] = self.processor.mm_num_tokens
        if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
            outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
            outputs["image_patch_id"] = self.processor.image_token_id
            outputs["video_patch_id"] = self.processor.video_token_id
            outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
        else:
            outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
            outputs["image_patch_id"] = self.image_patch_id
        return outputs
@@ -14,216 +14,12 @@
# limitations under the License. # limitations under the License.
""" """
"""Image processor class for Keye.""" # Backward compatibility: this module has been migrated to
# fastdeploy.input.image_processors.paddleocr_processor
# This file will be removed in a future version.
# TODO: Support videos from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401
ImageProcessor,
import json make_batched_images,
from pathlib import Path smart_resize,
from typing import Dict, List, Optional, Union
import numpy as np
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_utils import (
ImageInput,
is_valid_image,
make_list_of_images,
to_numpy_array,
) )
from fastdeploy.input.image_processors.common import (
smart_resize_paddleocr as smart_resize,
)
# OpenAI CLIP per-channel (RGB) normalization statistics.
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Accepts images in list or nested list format, and makes a list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If ``images`` is empty or not a recognized image container.
    """
    # Guard emptiness explicitly: the previous implementation indexed
    # images[0] (and images[0][0]) unconditionally, so an empty outer or
    # inner list raised an opaque IndexError instead of the intended
    # ValueError below.
    if isinstance(images, (list, tuple)) and images:
        first = images[0]
        if isinstance(first, (list, tuple)) and first and is_valid_image(first[0]):
            # Nested list of lists -> flatten to a single list of images.
            return [img for img_list in images for img in img_list]
        if is_valid_image(first):
            # Already a flat list of images.
            return images
    elif is_valid_image(images):
        # A single image -> wrap in a list.
        return [images]
    raise ValueError(f"Could not make batched images from {images}")
def adjust_size(size, patch_size):
    """Round *size* down to a length spanning an even number of patches.

    Returns the largest multiple of ``patch_size`` that fits in ``size``
    and corresponds to an even patch count.
    """
    even_patch_count = (size // patch_size) // 2 * 2
    return even_patch_count * patch_size
class ImageProcessor(BaseImageProcessor):
    """Image processor that converts images into patchified pixel values
    plus their [t, h, w] patch-grid dimensions (video input not supported).
    """

    # Names of the model-input tensors this processor can produce.
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: int = 3,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Store preprocessing configuration.

        Args:
            do_resize: Whether to adaptively resize to the pixel bounds.
            resample: PIL resampling filter code (3 = BICUBIC).
            do_rescale: Whether to multiply pixel values by ``rescale_factor``.
            rescale_factor: Pixel scaling factor (default 1/255).
            do_normalize: Whether to subtract mean / divide by std per channel.
            image_mean: Normalization mean (defaults to OpenAI CLIP mean).
            image_std: Normalization std (defaults to OpenAI CLIP std).
            do_convert_rgb: Whether to convert inputs to RGB first.
            min_pixels / max_pixels: Pixel-count bounds for adaptive resize.
            patch_size: Spatial patch size.
            temporal_patch_size: Temporal patch size (must be 1; see _preprocess).
            merge_size: Patch-merge factor between vision encoder and LLM.
        """
        super().__init__()
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
        self.do_convert_rgb = do_convert_rgb

    @classmethod
    def from_pretrained(cls, pretrained_model_dir):
        """Build a processor from ``preprocessor_config.json`` in the given directory."""
        pretrained_model_dir = Path(pretrained_model_dir)
        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
        with open(image_processor_config_path, "r", encoding="utf-8") as f:
            image_processor_config = json.load(f)
        return cls(**image_processor_config)

    def _preprocess(
        self,
        images,
        do_resize: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
    ):
        """Resize/rescale/normalize images and patchify them.

        Returns:
            tuple: (flatten_patches, grid_dimensions [t, h, w])
        """
        images = make_list_of_images(images)
        if do_convert_rgb:
            images = [image.convert("RGB") for image in images]
        # NOTE(review): images are accessed via PIL's .size / .resize here,
        # so inputs are assumed to be PIL images — confirm against callers.
        # The target size is derived from the FIRST image only.
        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                image = image.resize((resized_width, resized_height), resample=self.resample)
            image = to_numpy_array(image)
            if do_rescale:
                image = (image * rescale_factor).astype(np.float32)
            if do_normalize:
                image = image.astype(np.float32)
                image -= np.array(image_mean, dtype=np.float32)
                image /= np.array(image_std, dtype=np.float32)
            processed_images.append(image)
        patches = np.array(processed_images)
        patches = patches.transpose(0, 3, 1, 2)  # [N, H, W, C] -> [N, C, H, W]
        if patches.shape[0] == 1:
            # Replicate a single frame along the temporal axis.
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        # The final reshape below assumes no temporal patching.
        assert self.temporal_patch_size == 1
        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images,
        videos=None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors=None,
    ):
        """Preprocess images into model inputs.

        Any parameter left as None falls back to the instance default.

        Returns:
            BatchFeature with "pixel_values" and "grid_thw".

        Raises:
            NotImplementedError: If ``videos`` is provided.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        if videos is not None:
            raise NotImplementedError("Videos are not yet supported")
        patches, image_grid_thw = self._preprocess(
            images,
            do_resize=do_resize,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_convert_rgb=do_convert_rgb,
        )
        pixel_values = np.array(patches)
        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
+23 -43
View File
@@ -91,54 +91,34 @@ class InputPreprocessor:
tool_parser_obj=tool_parser_obj, tool_parser_obj=tool_parser_obj,
) )
else: else:
from fastdeploy.input.multimodal_processor import (
ERNIE4_5_VL,
PADDLEOCR_VL,
QWEN3_VL,
QWEN_VL,
MultiModalProcessor,
)
if ErnieArchitectures.contains_ernie_arch(architecture): if ErnieArchitectures.contains_ernie_arch(architecture):
from fastdeploy.input.ernie4_5_vl_processor import ( model_type = ERNIE4_5_VL
Ernie4_5_VLProcessor,
)
self.processor = Ernie4_5_VLProcessor(
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj,
tool_parser_obj=tool_parser_obj,
enable_processor_cache=self.enable_processor_cache,
)
elif "PaddleOCRVL" in architecture: elif "PaddleOCRVL" in architecture:
from fastdeploy.input.paddleocr_vl_processor import ( model_type = PADDLEOCR_VL
PaddleOCRVLProcessor,
)
self.processor = PaddleOCRVLProcessor(
config=self.model_config,
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj,
)
elif "Qwen2_5_VL" in architecture: elif "Qwen2_5_VL" in architecture:
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor model_type = QWEN_VL
self.processor = QwenVLProcessor(
config=self.model_config,
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj,
enable_processor_cache=self.enable_processor_cache,
)
elif "Qwen3VL" in architecture: elif "Qwen3VL" in architecture:
from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor model_type = QWEN3_VL
self.processor = Qwen3VLProcessor(
config=self.model_config,
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj,
enable_processor_cache=self.enable_processor_cache,
)
else: else:
raise ValueError(f"Unsupported model processor architecture: {architecture}. ") raise ValueError(f"Unsupported model processor architecture: {architecture}. ")
self.processor = MultiModalProcessor(
model_name_or_path=self.model_name_or_path,
model_type=model_type,
config=self.model_config,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj,
tool_parser_obj=tool_parser_obj,
enable_processor_cache=self.enable_processor_cache,
)
return self.processor return self.processor
@@ -14,320 +14,10 @@
# limitations under the License. # limitations under the License.
""" """
from typing import List, Optional, Union # Backward compatibility: this module has been migrated to
# fastdeploy.input.image_processors.qwen3_processor
# This file will be removed in a future version.
import numpy as np from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
import paddle ImageProcessor,
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
normalize,
rescale,
resize,
to_channel_dimension_format,
) )
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
IMAGE_MEAN = [0.5, 0.5, 0.5]
IMAGE_STD = [0.5, 0.5, 0.5]
MIN_PIXELS = 65536
MAX_PIXELS = 16777216
VideoInput = Union[
List["PIL.Image.Image"],
"np.ndarray",
"paddle.Tensor",
List["np.ndarray"],
List["paddle.Tensor"],
List[List["PIL.Image.Image"]],
List[List["np.ndarray"]],
List[List["paddle.Tensor"]],
]
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.

    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints,
    then flattens each image into vision-transformer patches (see ``_preprocess``).
    """

    def __init__(
        self,
        patch_size: int = 16,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = IMAGE_MEAN,
        image_std: Union[float, List[float]] = IMAGE_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.

        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.resample = resample

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for image preprocessing pipeline.

        Resizes every frame to a common (resized_height, resized_width) derived from
        the FIRST frame's dimensions, applies rescale/normalize, pads the temporal
        dimension to a multiple of ``temporal_patch_size``, and flattens the result
        into per-patch rows.

        Args:
            images: Input image or batch of images
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format

        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches, shape
                  [grid_t * grid_h * grid_w, C * temporal_patch_size * patch_size**2]
                - grid_dimensions: Grid dimensions [t, h, w] as an np.ndarray
        """
        images = make_list_of_images(images)

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        # Get original dimensions and calculate optimal resize dimensions.
        # NOTE: only the first frame is measured; all frames are assumed to share its size.
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )

        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale and do_normalize:
                # Adjust mean and std for combined rescale+normalize:
                # normalize(x * s, m, d) == normalize(x, m/s, d/s), so fold the
                # rescale factor into mean/std once and skip the separate rescale.
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step
            # mutual exclusion and upper branch
            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)

        # Convert processed images to numpy array
        patches = np.array(processed_images)

        # Pad temporal dimension if needed: repeat the last frame until the frame
        # count is a multiple of temporal_patch_size.
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)

        # Convert to channels-first format if needed
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]

        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )

        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.

        Every ``None`` argument falls back to the value configured in ``__init__``.

        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format

        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]

        Raises:
            ValueError: For invalid image types or dimensions
        """
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -14,319 +14,10 @@
# limitations under the License. # limitations under the License.
""" """
from typing import List, Optional, Union # Backward compatibility: this module has been migrated to
# fastdeploy.input.image_processors.qwen_processor
# This file will be removed in a future version.
import numpy as np from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
import paddle ImageProcessor,
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
normalize,
rescale,
resize,
to_channel_dimension_format,
) )
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
VideoInput = Union[
List["PIL.Image.Image"],
"np.ndarray",
"paddle.Tensor",
List["np.ndarray"],
List["paddle.Tensor"],
List[List["PIL.Image.Image"]],
List[List["np.ndarray"]],
List[List["paddle.Tensor"]],
]
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.

    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints,
    then flattens each image into vision-transformer patches (see ``_preprocess``).
    Defaults use the OpenAI CLIP normalization statistics and a 14-pixel patch size.
    """

    def __init__(
        self,
        patch_size: int = 14,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
        image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.

        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.resample = resample

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for image preprocessing pipeline.

        Resizes every frame to a common (resized_height, resized_width) derived from
        the FIRST frame's dimensions, applies rescale/normalize, pads the temporal
        dimension to a multiple of ``temporal_patch_size``, and flattens the result
        into per-patch rows.

        Args:
            images: Input image or batch of images
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format

        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches, shape
                  [grid_t * grid_h * grid_w, C * temporal_patch_size * patch_size**2]
                - grid_dimensions: Grid dimensions [t, h, w] as an np.ndarray
        """
        images = make_list_of_images(images)

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        # Get original dimensions and calculate optimal resize dimensions.
        # NOTE: only the first frame is measured; all frames are assumed to share its size.
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )

        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale and do_normalize:
                # Adjust mean and std for combined rescale+normalize:
                # normalize(x * s, m, d) == normalize(x, m/s, d/s), so fold the
                # rescale factor into mean/std once and skip the separate rescale.
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step

            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)

        # Convert processed images to numpy array
        patches = np.array(processed_images)

        # Pad temporal dimension if needed: repeat the last frame until the frame
        # count is a multiple of temporal_patch_size.
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)

        # Convert to channels-first format if needed
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]

        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )

        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.

        Every ``None`` argument falls back to the value configured in ``__init__``.

        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format

        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]

        Raises:
            ValueError: For invalid image types or dimensions
        """
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -340,9 +340,7 @@ class TestImagePreprocessorAdaptive(unittest.TestCase):
# Create a scaled image (values between 0-1) # Create a scaled image (values between 0-1)
img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5 img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5
# Use patch to capture warning # Use patch to capture warning
with patch( with patch("fastdeploy.input.image_processors.adaptive_processor.data_processor_logger") as mock_logger:
"fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.data_processor_logger"
) as mock_logger:
# Directly call _preprocess, pass scaled image # Directly call _preprocess, pass scaled image
self.processor._preprocess( self.processor._preprocess(
[img_array], # Pass scaled numpy array [img_array], # Pass scaled numpy array
@@ -356,9 +354,7 @@ class TestImagePreprocessorAdaptive(unittest.TestCase):
"""Test invalid image check in preprocess (line 464)""" """Test invalid image check in preprocess (line 464)"""
# Test invalid image type - need to ensure valid_images returns False # Test invalid image type - need to ensure valid_images returns False
# Use patch to make valid_images return False, but make_batched_images succeeds # Use patch to make valid_images return False, but make_batched_images succeeds
with patch( with patch("fastdeploy.input.image_processors.adaptive_processor.valid_images") as mock_valid:
"fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.valid_images"
) as mock_valid:
mock_valid.return_value = False mock_valid.return_value = False
valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False
with self.assertRaises(ValueError) as context: with self.assertRaises(ValueError) as context:
File diff suppressed because it is too large Load Diff