[DataProcessor] Move image_processor to unified directory and add MultiModalProcessor (#7109)

* first commit

* step 9~10

* update multimodal

* update multimodal

* fix load tokenizer

* add unit test

* fix unit test & AdaptiveImageProcessor

* Delete unused code
This commit is contained in:
luukunn
2026-04-08 10:16:27 +08:00
committed by GitHub
parent d693d4be14
commit 8496ec71a6
15 changed files with 3037 additions and 1401 deletions
@@ -14,7 +14,13 @@
# limitations under the License. # limitations under the License.
""" """
from .get_image_preprocessor import get_image_preprocessor # Backward compatibility: this module has been migrated to
from .image_preprocessor_adaptive import AdaptiveImageProcessor # fastdeploy.input.image_processors.adaptive_processor
# This file will be removed in a future version.
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
AdaptiveImageProcessor,
get_image_preprocessor,
)
__all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"] __all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"]
@@ -14,21 +14,10 @@
# limitations under the License. # limitations under the License.
""" """
"""get image preprocessor""" # Backward compatibility: this module has been migrated to
# fastdeploy.input.image_processors.adaptive_processor
# This file will be removed in a future version.
from fastdeploy.utils import data_processor_logger from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
get_image_preprocessor,
from .image_preprocessor_adaptive import AdaptiveImageProcessor )
def get_image_preprocessor(args):
"""
get_image_preprocessor from args
"""
if args.vision_model_name_or_path is None:
return None
data_processor_logger.info("use AdaptiveImageProcessor")
image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path)
return image_preprocess
@@ -14,498 +14,12 @@
# limitations under the License. # limitations under the License.
""" """
"""image preprocessor adaptive""" # Backward compatibility: this module has been migrated to
# fastdeploy.input.image_processors.adaptive_processor
# This file will be removed in a future version.
from typing import List, Optional, Union from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
AdaptiveImageProcessor,
import numpy as np make_batched_images,
import paddle make_batched_videos,
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
convert_to_rgb,
normalize,
rescale,
resize,
to_channel_dimension_format,
) )
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_valid_image,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200
# Accepted video input layouts: a list of PIL frames, a 4-D array/tensor,
# lists of arrays/tensors, or batches (lists of lists) of any of these.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    # Fix: this forward reference was misspelled "np.ndarrray" (three r's),
    # which would never resolve if the annotation were evaluated.
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]
__all__ = [
"AdaptiveImageProcessor",
]
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Normalize image input into a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            A single image, a flat list of images, or a nested list of images.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If the input cannot be interpreted as batched images.
    """
    is_sequence = isinstance(images, (list, tuple))
    if is_sequence and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        # Nested list of images: flatten one level.
        flattened = []
        for inner in images:
            flattened.extend(inner)
        return flattened
    if is_sequence and is_valid_image(images[0]):
        # Already a flat list of images.
        return images
    if is_valid_image(images):
        # A single image: wrap it in a one-element list.
        return [images]
    raise ValueError(f"Could not make batched images from {images}")
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Normalize video input into a list of videos (each a list of frames).

    Accepts an already-batched nested list, a flat list of PIL frames or 4-D
    arrays, or a single 4-D array.

    Raises:
        ValueError: If the input cannot be interpreted as batched videos.
    """
    if isinstance(videos, (list, tuple)):
        if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
            # Already a list of videos, each a list of frames.
            return videos
        if is_valid_image(videos[0]):
            if isinstance(videos[0], Image.Image):
                # A flat list of PIL frames: treat it as a single video.
                return [videos]
            if len(videos[0].shape) == 4:
                # A list of 4-D arrays: each array is one video.
                return [list(video) for video in videos]
        # Note: a flat list whose first element is a valid non-PIL image with a
        # non-4-D shape falls through to the error, as in the original.
    elif is_valid_image(videos) and len(videos.shape) == 4:
        # A single 4-D array: one video.
        return [list(videos)]
    raise ValueError(f"Could not make batched video from {videos}")
class AdaptiveImageProcessor(BaseImageProcessor):
    r"""
    Constructs a adaptive image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
            in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `56 * 56`):
            The min pixels of the image to resize the image.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The max pixels of the image to resize the image.
        patch_size (`int`, *optional*, defaults to 14):
            The spatial patch size of the vision encoder.
        temporal_conv_size (`int`, *optional*, defaults to 2):
            The temporal conv size in resampler.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    """

    # Keys this processor produces in its BatchFeature output.
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 56 * 56,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_conv_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Store resize/rescale/normalize settings; falls back to CLIP mean/std when unset."""
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        # Default to OpenAI CLIP statistics when no explicit mean/std is given.
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_conv_size = temporal_conv_size
        self.merge_size = merge_size
        # `size` mirrors the pixel bounds and is kept in sync by set_pixels().
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb

    def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
        """
        Update the min/max pixel budget used by smart resizing.

        Args:
            min_pixels (int, optional): New lower bound on resized image area.
            max_pixels (int, optional): New upper bound on resized image area.
            msg (str): Prefix for the log line, identifying the caller.
        """
        if min_pixels is not None:
            # NOTE(review): the check permits 0 while the message says "positive" — confirm intent.
            assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
            self.min_pixels = min_pixels
            self.size["min_pixels"] = int(min_pixels)
        if max_pixels is not None:
            assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
            self.max_pixels = max_pixels
            self.size["max_pixels"] = int(max_pixels)

    def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
        """
        Compute the smart-resized (height, width) plus the resulting patch grid.

        Returns:
            tuple: ``((resized_height, resized_width), (grid_h, grid_w))`` where
            the grid dimensions are the resized size divided by ``patch_size``.
        """
        actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,
            min_pixels=actual_min_pixels,
            max_pixels=actual_max_pixels,
        )
        return (resized_height, resized_width), (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = True,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
                If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`List[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            predetermined_grid_thw (optional):
                When given, one (grid_h, grid_w) entry per image; overrides smart resizing.

        Returns:
            tuple: ``(flatten_patches, (grid_t, grid_h, grid_w))``.
        """
        images = make_list_of_images(images)
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]
        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []
        if predetermined_grid_thw is not None:
            assert len(predetermined_grid_thw) == len(
                images
            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
        for img_idx, image in enumerate(images):
            if do_resize:
                if predetermined_grid_thw is not None:
                    # Caller fixed the patch grid; convert it back to pixels.
                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
                    resized_height *= self.patch_size
                    resized_width *= self.patch_size
                else:
                    resized_height, resized_width = smart_resize(
                        height,
                        width,
                        factor=self.patch_size * self.merge_size,
                        min_pixels=self.min_pixels,
                        max_pixels=self.max_pixels,
                    )
                # TODO: the cast must be applied manually, otherwise the values
                # get divided by 255 one extra time and the result is wrong.
                image = image.astype("uint8")
                # Build the PIL image directly with fromarray rather than
                # relying on the conversion inside paddleformers.
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale:
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
            if do_normalize:
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)
        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])
        channel = patches.shape[1]  # [time, C, H, W]
        grid_t = patches.shape[0]
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Split each image into merge_size x merge_size groups of patch_size patches.
        patches = patches.reshape(
            [
                grid_t,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.patch_size * self.patch_size,
            ]
        )  # [grid_t * grid_h * grid_w, C * psz * psz]
        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = True,
        size: Optional[Union[int, List[int]]] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Preprocess images and/or videos into flattened patches plus grid metadata.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        # Resolve each option against the instance defaults.
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        if images is not None:
            images = make_batched_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)
        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for img_idx, image in enumerate(images):
                if predetermined_grid_thw is not None:
                    # _preprocess expects one grid entry per image in its list.
                    predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
                else:
                    predetermined_grid_thw_one = None
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw_one,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data = {
                "pixel_values": pixel_values,
                "image_grid_thw": vision_grid_thws,
            }
        # NOTE(review): when both images and videos are passed, this second
        # assignment replaces the image outputs in `data` — confirm intended.
        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data = {
                "pixel_values_videos": pixel_values,
                "video_grid_thw": vision_grid_thws,
            }
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -11,3 +11,17 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
AdaptiveImageProcessor,
get_image_preprocessor,
)
from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401
ImageProcessor as PaddleOCRImageProcessor,
)
from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
ImageProcessor as Qwen3ImageProcessor,
)
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
ImageProcessor as QwenImageProcessor,
)
@@ -0,0 +1,524 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""image preprocessor adaptive"""
from typing import List, Optional, Union
import numpy as np
import paddle
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
convert_to_rgb,
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_valid_image,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
from fastdeploy.utils import data_processor_logger
# Per-channel normalization statistics from OpenAI CLIP preprocessing.
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
# Pixel-budget constants; 28 = patch_size (14) * merge_size (2) — TODO confirm.
IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200
# Accepted video input layouts: frame lists, 4-D arrays/tensors, or batches thereof.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]
# Public API of this module.
__all__ = [
    "AdaptiveImageProcessor",
    "get_image_preprocessor",
    "make_batched_images",
    "make_batched_videos",
]
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Normalize image input into a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            A single image, a flat list of images, or a nested list of images.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If the input cannot be interpreted as batched images.
    """
    is_sequence = isinstance(images, (list, tuple))
    if is_sequence and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        # Nested list of images: flatten one level.
        flattened = []
        for inner in images:
            flattened.extend(inner)
        return flattened
    if is_sequence and is_valid_image(images[0]):
        # Already a flat list of images.
        return images
    if is_valid_image(images):
        # A single image: wrap it in a one-element list.
        return [images]
    raise ValueError(f"Could not make batched images from {images}")
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Normalize video input into a list of videos (each a list of frames).

    Accepts an already-batched nested list, a flat list of PIL frames or 4-D
    arrays, or a single 4-D array.

    Raises:
        ValueError: If the input cannot be interpreted as batched videos.
    """
    if isinstance(videos, (list, tuple)):
        if isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
            # Already a list of videos, each a list of frames.
            return videos
        if is_valid_image(videos[0]):
            if isinstance(videos[0], Image.Image):
                # A flat list of PIL frames: treat it as a single video.
                return [videos]
            if len(videos[0].shape) == 4:
                # A list of 4-D arrays: each array is one video.
                return [list(video) for video in videos]
        # Note: a flat list whose first element is a valid non-PIL image with a
        # non-4-D shape falls through to the error, as in the original.
    elif is_valid_image(videos) and len(videos.shape) == 4:
        # A single 4-D array: one video.
        return [list(videos)]
    raise ValueError(f"Could not make batched video from {videos}")
class AdaptiveImageProcessor(BaseImageProcessor):
r"""
Constructs a adaptive image processor that dynamically resizes images based on the original images.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to resize the image's (height, width) dimensions.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
Resampling filter to use when resizing the image.
do_rescale (`bool`, *optional*, defaults to `True`):
Whether to rescale the image by the specified scale `rescale_factor`.
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
in the image.
do_convert_rgb (`bool`, *optional*, defaults to `True`):
Whether to convert the image to RGB.
min_pixels (`int`, *optional*, defaults to `56 * 56`):
The min pixels of the image to resize the image.
max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
The max pixels of the image to resize the image.
patch_size (`int`, *optional*, defaults to 14):
The spacial patch size of the vision encoder.
temporal_conv_size (`int`, *optional*, defaults to 2):
The temporal conv size in resampler.
merge_size (`int`, *optional*, defaults to 2):
The merge size of the vision encoder to llm encoder.
"""
model_input_names = [
"pixel_values",
"image_grid_thw",
"pixel_values_videos",
"video_grid_thw",
]
def __init__(
self,
do_resize: bool = True,
resample: PILImageResampling = PILImageResampling.BICUBIC,
do_rescale: bool = True,
rescale_factor: float = 1 / 255,
do_normalize: bool = True,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
do_convert_rgb: bool = True,
min_pixels: int = 56 * 56,
max_pixels: int = 28 * 28 * 1280,
patch_size: int = 14,
temporal_conv_size: int = 2,
merge_size: int = 2,
**kwargs,
) -> None:
"""init"""
super().__init__(**kwargs)
self.do_resize = do_resize
self.resample = resample
self.do_rescale = do_rescale
self.rescale_factor = rescale_factor
self.do_normalize = do_normalize
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.patch_size = patch_size
self.temporal_conv_size = temporal_conv_size
self.merge_size = merge_size
self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
self.do_convert_rgb = do_convert_rgb
def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
"""设定pixels"""
if min_pixels is not None:
assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
self.min_pixels = min_pixels
self.size["min_pixels"] = int(min_pixels)
if max_pixels is not None:
assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
self.max_pixels = max_pixels
self.size["max_pixels"] = int(max_pixels)
def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
"""dummy"""
actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
resized_height, resized_width = smart_resize(
height,
width,
factor=self.patch_size * self.merge_size,
min_pixels=actual_min_pixels,
max_pixels=actual_max_pixels,
)
return (resized_height, resized_width), (
resized_height // self.patch_size,
resized_width // self.patch_size,
)
    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = True,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
                If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`List[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            predetermined_grid_thw (*optional*):
                Per-image `(grid_h, grid_w)` overrides. When given, the resize
                target is `grid * patch_size` instead of the `smart_resize` result.

        Returns:
            tuple: `(flatten_patches, (grid_t, grid_h, grid_w))` — patches flattened
            to `[grid_t * grid_h * grid_w, C * patch_size * patch_size]` plus the
            temporal/spatial grid dimensions.
        """
        images = make_list_of_images(images)
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]
        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []
        if predetermined_grid_thw is not None:
            # One (grid_h, grid_w) override must be provided per image/frame.
            assert len(predetermined_grid_thw) == len(
                images
            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
        for img_idx, image in enumerate(images):
            if do_resize:
                if predetermined_grid_thw is not None:
                    # Caller fixed the patch grid; the pixel target is grid * patch_size.
                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
                    resized_height *= self.patch_size
                    resized_width *= self.patch_size
                else:
                    resized_height, resized_width = smart_resize(
                        height,
                        width,
                        factor=self.patch_size * self.merge_size,
                        min_pixels=self.min_pixels,
                        max_pixels=self.max_pixels,
                    )
                # TODO: cast to uint8 manually here; otherwise the data is divided
                # by 255 twice and the result is wrong.
                image = image.astype("uint8")
                # Use PIL.Image.fromarray directly rather than the paddleformers helper.
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale:
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
            if do_normalize:
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)
        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            # Bring to channels-first so the reshape below sees [time, C, H, W].
            patches = patches.transpose([0, 3, 1, 2])
        channel = patches.shape[1]  # [time, C, H, W]
        grid_t = patches.shape[0]
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Split H and W into (blocks-of-merge_size, merge_size, patch_size).
        patches = patches.reshape(
            [
                grid_t,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.patch_size * self.patch_size,
            ]
        )  # [grid_t * grid_h * grid_w, C * psz * psz]
        return flatten_patches, (grid_t, grid_h, grid_w)
    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = True,
        size: Optional[Union[int, List[int]]] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            predetermined_grid_thw (*optional*):
                Per-image `(grid_h, grid_w)` overrides forwarded to `_preprocess`.

        Returns:
            `BatchFeature` with `pixel_values`/`image_grid_thw` for images and/or
            `pixel_values_videos`/`video_grid_thw` for videos.
        """
        # NOTE: the `is not None` fallbacks below are only effective when a
        # caller explicitly passes None; the signature defaults are non-None.
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        if images is not None:
            images = make_batched_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)
        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
        data = {}
        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for img_idx, image in enumerate(images):
                # Each image is processed independently with its own grid override.
                if predetermined_grid_thw is not None:
                    predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
                else:
                    predetermined_grid_thw_one = None
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw_one,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data["pixel_values"] = pixel_values
            data["image_grid_thw"] = vision_grid_thws
        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            # Each video is a list of frames; frames share one grid result.
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data["pixel_values_videos"] = pixel_values
            data["video_grid_thw"] = vision_grid_thws
        return BatchFeature(data=data, tensor_type=return_tensors)
def get_image_preprocessor(args):
    """Build an image preprocessor from parsed arguments.

    Returns ``None`` when ``args.vision_model_name_or_path`` is unset;
    otherwise loads an ``AdaptiveImageProcessor`` from that path.
    """
    model_path = args.vision_model_name_or_path
    if model_path is None:
        return None
    data_processor_logger.info("use AdaptiveImageProcessor")
    return AdaptiveImageProcessor.from_pretrained(model_path)
@@ -0,0 +1,225 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""Image processor class for PaddleOCR-VL."""
import json
from pathlib import Path
from typing import Dict, List, Optional, Union
import numpy as np
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_utils import (
ImageInput,
is_valid_image,
make_list_of_images,
to_numpy_array,
)
from fastdeploy.input.image_processors.common import (
smart_resize_paddleocr as smart_resize,
)
# Default per-channel normalization statistics (OpenAI CLIP values).
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.
    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.
    Returns:
        List[ImageInput]: A flat list of images.
    Raises:
        ValueError: If `images` is empty or not a recognized image container.
    """
    # An empty list/tuple previously crashed with IndexError on images[0];
    # reject it explicitly with the same ValueError as other bad inputs.
    if isinstance(images, (list, tuple)) and not images:
        raise ValueError(f"Could not make batched images from {images}")
    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        # Nested list of images: flatten one level.
        return [img for img_list in images for img in img_list]
    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
        # Already a flat list of images.
        return images
    elif is_valid_image(images):
        # Single image: wrap in a list.
        return [images]
    raise ValueError(f"Could not make batched images from {images}")
def adjust_size(size, patch_size):
    """Round *size* down to an even number of whole patches of *patch_size*."""
    # Drop any trailing partial patch, then drop one more patch when the
    # count is odd, so the result is always an even multiple of patch_size.
    even_patch_count = (size // patch_size) // 2 * 2
    return even_patch_count * patch_size
class ImageProcessor(BaseImageProcessor):
    """Image processor for PaddleOCR-VL.

    Resizes images adaptively within [min_pixels, max_pixels], normalizes them,
    and flattens them into per-patch tensors plus a (t, h, w) grid descriptor.
    """
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]
    def __init__(
        self,
        do_resize: bool = True,
        resample: int = 3,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Initialize processor configuration.

        Args:
            do_resize: Whether to resize images within the pixel bounds.
            resample: PIL resampling filter code (3 == BICUBIC).
            do_rescale: Whether to multiply pixel values by `rescale_factor`.
            rescale_factor: Scaling factor applied when rescaling (default 1/255).
            do_normalize: Whether to subtract `image_mean` and divide by `image_std`.
            image_mean: Per-channel mean; defaults to the OpenAI CLIP mean.
            image_std: Per-channel std; defaults to the OpenAI CLIP std.
            do_convert_rgb: Whether to convert inputs to RGB first.
            min_pixels: Minimum total pixels after resizing.
            max_pixels: Maximum total pixels after resizing.
            patch_size: Spatial patch size.
            temporal_patch_size: Temporal patch size (only 1 is supported; see
                the assert in `_preprocess`).
            merge_size: Spatial merge factor between vision and LLM encoders.
            **kwargs: Accepted for config compatibility.

        NOTE(review): `**kwargs` is accepted but not forwarded to
        `super().__init__()`; unknown config keys are silently dropped.
        """
        super().__init__()
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
        self.do_convert_rgb = do_convert_rgb
    @classmethod
    def from_pretrained(cls, pretrained_model_dir):
        """Build a processor from `<model_dir>/preprocessor_config.json`."""
        pretrained_model_dir = Path(pretrained_model_dir)
        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
        with open(image_processor_config_path, "r", encoding="utf-8") as f:
            image_processor_config = json.load(f)
        return cls(**image_processor_config)
    def _preprocess(
        self,
        images,
        do_resize: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
    ):
        """Resize/rescale/normalize images and flatten into patches.

        Returns a tuple `(flatten_patches, grid_thw)` where `flatten_patches`
        has shape [grid_t * grid_h * grid_w, C, patch_size, patch_size] and
        `grid_thw` is `np.array([grid_t, grid_h, grid_w])`.

        NOTE(review): inputs are accessed via `.size` / `.convert` / `.resize`,
        so PIL images are expected here — confirm against callers.
        """
        images = make_list_of_images(images)
        if do_convert_rgb:
            images = [image.convert("RGB") for image in images]
        # PIL reports (width, height); all images are assumed to share one size.
        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                # PIL resize takes (width, height) order.
                image = image.resize((resized_width, resized_height), resample=self.resample)
            image = to_numpy_array(image)
            if do_rescale:
                image = (image * rescale_factor).astype(np.float32)
            if do_normalize:
                image = image.astype(np.float32)
                image -= np.array(image_mean, dtype=np.float32)
                image /= np.array(image_std, dtype=np.float32)
            processed_images.append(image)
        patches = np.array(processed_images)
        # [N, H, W, C] -> [N, C, H, W]
        patches = patches.transpose(0, 3, 1, 2)
        if patches.shape[0] == 1:
            # Repeat the single frame to fill the temporal dimension.
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Split spatial dims into (grid, patch_size) pairs.
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        # -> [grid_t, grid_h, grid_w, C, temporal, psz, psz]
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        # The final reshape below is only valid when temporal_patch_size == 1.
        assert self.temporal_patch_size == 1
        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
        return flatten_patches, np.array([grid_t, grid_h, grid_w])
    def preprocess(
        self,
        images,
        videos=None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors=None,
    ):
        """Preprocess images into a `BatchFeature` with keys
        `pixel_values` and `grid_thw`. Explicit arguments override the
        processor-level defaults; videos are not supported.

        Raises:
            NotImplementedError: If `videos` is not None.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        if videos is not None:
            raise NotImplementedError("Videos are not yet supported")
        patches, image_grid_thw = self._preprocess(
            images,
            do_resize=do_resize,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_convert_rgb=do_convert_rgb,
        )
        pixel_values = np.array(patches)
        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -0,0 +1,333 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import List, Optional, Union
import numpy as np
import paddle
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
# Default per-channel normalization statistics (zero-centered, 0.5 mean/std).
IMAGE_MEAN = [0.5, 0.5, 0.5]
IMAGE_STD = [0.5, 0.5, 0.5]
# Bounds on total pixel count after adaptive resizing (256^2 .. 4096^2).
MIN_PIXELS = 65536
MAX_PIXELS = 16777216
# Accepted video containers: a sequence (or batch of sequences) of PIL
# images, numpy arrays, or paddle tensors.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.
    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints.
    """
    def __init__(
        self,
        patch_size: int = 16,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = IMAGE_MEAN,
        image_std: Union[float, List[float]] = IMAGE_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.
        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.resample = resample
    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for image preprocessing pipeline.
        Args:
            images: Input image or batch of images
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format
        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches
                - grid_dimensions: Grid dimensions [t, h, w]
        """
        images = make_list_of_images(images)
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]
        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])
        # Get original dimensions and calculate optimal resize dimensions
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )
        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale and do_normalize:
                # Adjust mean and std for combined rescale+normalize:
                # (x - mean/rf) / (std/rf) == (x*rf - mean) / std, so one
                # normalize pass does the work of both steps.
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step
                # mutual exclusion and upper branch
            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)
        # Convert processed images to numpy array
        patches = np.array(processed_images)
        # Pad temporal dimension if needed
        if patches.shape[0] % self.temporal_patch_size != 0:
            # Repeat the last frame until frame count is a multiple of
            # temporal_patch_size.
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)
        # Convert to channels-first format if needed
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]
        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )
        return flatten_patches, np.array([grid_t, grid_h, grid_w])
    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.
        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format
        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]
        Raises:
            ValueError: For invalid image types or dimensions
        """
        # Explicit arguments override the processor-level defaults.
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample
        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -0,0 +1,332 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import List, Optional, Union
import numpy as np
import paddle
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
# OpenAI CLIP per-channel normalization statistics.
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
# Pixel-count bounds expressed in 28x28 vision-token units.
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
# Accepted video containers: a sequence (or batch of sequences) of PIL
# images, numpy arrays, or paddle tensors.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.

    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints,
    then emits flattened spatio-temporal patches plus the [t, h, w] patch-grid dimensions.
    """

    def __init__(
        self,
        patch_size: int = 14,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
        image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.

        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.resample = resample

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for image preprocessing pipeline.

        Args:
            images: Input image or batch of images
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format

        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches
                - grid_dimensions: Grid dimensions [t, h, w]
        """
        images = make_list_of_images(images)
        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]
        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])
        # Get original dimensions and calculate optimal resize dimensions.
        # NOTE: the target size is derived from the FIRST image only; every
        # image in the batch is resized to the same (resized_height, resized_width).
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )
        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale and do_normalize:
                # Fold the rescale step into normalization: (x - 255*mean)/(255*std)
                # equals (x/255 - mean)/std. This branch runs once (do_rescale is
                # cleared below), so mean/std are only scaled a single time.
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step
            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)
            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )
            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)
        # Convert processed images to numpy array
        patches = np.array(processed_images)
        # Pad temporal dimension if needed: repeat the last frame until the
        # frame count is divisible by temporal_patch_size.
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)
        # Convert to channels-first format if needed
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]
        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )
        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.

        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format

        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]

        Raises:
            ValueError: For invalid image types or dimensions
        """
        # Fall back to the instance-level defaults for any override left as None.
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample
        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
+453
View File
@@ -0,0 +1,453 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""Unified multimodal processor for all VL model types.
Consolidates the four separate VL processor wrappers (QwenVLProcessor,
Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a
single class that dispatches per ``model_type``.
"""
from collections.abc import Mapping
from typing import Any, Dict, Optional
import numpy as np
from fastdeploy.input.base_processor import BaseTextProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
from fastdeploy.utils import data_processor_logger
# Canonical model-type identifiers that MultiModalProcessor dispatches on.
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"
_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL}
# Expected types for user-supplied ``mm_processor_kwargs``; keys not listed
# here are passed through unvalidated.
_QWEN_EXPECTED_KWARGS = {
    "video_max_frames": int,
    "video_min_frames": int,
}
_ERNIE_EXPECTED_KWARGS = {
    "spatial_conv_size": int,
    "temporal_conv_size": int,
    "image_min_pixels": int,
    "image_max_pixels": int,
    "video_min_pixels": int,
    "video_max_pixels": int,
    "video_target_frames": int,
    "video_frames_sample": str,
    "video_max_frames": int,
    "video_min_frames": int,
    "video_fps": int,
}
# Default per-prompt limits on multimodal items, merged with user overrides.
_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}
# top_p values below this threshold are clamped (greedy-like sampling guard).
_SAMPLING_EPS = 1e-5
class MultiModalProcessor(BaseTextProcessor):
    """Unified multimodal processor for all supported VL model types.

    Dispatches image-processor creation, config initialisation, and
    encoding logic based on ``model_type``.
    """

    def __init__(
        self,
        model_name_or_path: str,
        model_type: str,
        config=None,
        limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
        reasoning_parser_obj=None,
        tool_parser_obj=None,
        enable_processor_cache: bool = False,
    ):
        """Build the tokenizer, internal DataProcessor, and mm config.

        Raises:
            ValueError: If ``model_type`` is not one of the supported types.
        """
        if model_type not in _SUPPORTED_MODEL_TYPES:
            raise ValueError(
                f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}."
            )
        self.model_type = model_type
        self.config = config
        self.enable_processor_cache = enable_processor_cache
        # ERNIE uses its own tokenizer implementation; everything else goes
        # through AutoTokenizer (see _load_tokenizer).
        tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto"
        super().__init__(
            model_name_or_path,
            tokenizer_type=tokenizer_type,
            reasoning_parser_obj=reasoning_parser_obj,
            tool_parser_obj=tool_parser_obj,
        )
        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
        processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
        self._init_mm_processor(processor_kwargs)
        self._init_mm_config()
        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)

    def _load_tokenizer(self):
        """Load the appropriate tokenizer based on model_type."""
        if self.tokenizer_type == "ernie4_5":
            import os

            from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer

            # The vocab file name varies between ERNIE checkpoints; probe for
            # the first one that exists and point the tokenizer class at it.
            vocab_file_names = ["tokenizer.model", "spm.model", "ernie_token_100k.model"]
            for name in vocab_file_names:
                if os.path.exists(os.path.join(self.model_name_or_path, name)):
                    Ernie4_5Tokenizer.resource_files_names["vocab_file"] = name
                    break
            tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
        else:
            from paddleformers.transformers import AutoTokenizer

            tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
        return tokenizer

    def _init_mm_processor(self, processor_kwargs: dict):
        """Create the model-type-specific internal DataProcessor."""
        if self.model_type == QWEN_VL:
            from fastdeploy.input.qwen_vl_processor.process import DataProcessor

            # Default tokens_per_second of 2 when vision_config is absent.
            tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
            self.processor = DataProcessor(
                model_path=self.model_name_or_path,
                enable_processor_cache=self.enable_processor_cache,
                tokens_per_second=tokens_per_second,
                tokenizer=self.tokenizer,
                **processor_kwargs,
            )
        elif self.model_type == QWEN3_VL:
            from fastdeploy.input.qwen3_vl_processor.process import DataProcessor

            self.processor = DataProcessor(
                model_path=self.model_name_or_path,
                enable_processor_cache=self.enable_processor_cache,
                tokenizer=self.tokenizer,
                **processor_kwargs,
            )
        elif self.model_type == PADDLEOCR_VL:
            from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor

            tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
            self.processor = DataProcessor(
                model_path=self.model_name_or_path,
                enable_processor_cache=self.enable_processor_cache,
                tokens_per_second=tokens_per_second,
                tokenizer=self.tokenizer,
                **processor_kwargs,
            )
        elif self.model_type == ERNIE4_5_VL:
            from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor

            self.processor = DataProcessor(
                tokenizer_name=self.model_name_or_path,
                image_preprocessor_name=self.model_name_or_path,
                enable_processor_cache=self.enable_processor_cache,
                **processor_kwargs,
            )
        self.processor.eval()

    def _init_mm_config(self):
        """Set model-type-specific multimodal configuration attributes."""
        if self.model_type in (QWEN_VL, QWEN3_VL):
            self.image_patch_id = self.processor.image_token_id
        elif self.model_type == PADDLEOCR_VL:
            self.image_patch_id = self.processor.image_patch_id
        elif self.model_type == ERNIE4_5_VL:
            self.image_patch_id = self.processor.image_patch_id
            self.spatial_conv_size = self.processor.spatial_conv_size

    def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
        """Parse and validate multimodal processor kwargs.

        Invalid input is logged and replaced with ``{}`` rather than raised,
        so a bad CLI/config value does not abort processor construction.
        """
        if not kwargs:
            return {}
        try:
            if not isinstance(kwargs, dict):
                raise ValueError("mm-processor-kwargs must be a dictionary")
            data_processor_logger.info(f"Processing kwargs: {kwargs}")
            if self.model_type == ERNIE4_5_VL:
                expected_types = _ERNIE_EXPECTED_KWARGS
            else:
                expected_types = _QWEN_EXPECTED_KWARGS
            for key, value in kwargs.items():
                if key in expected_types and not isinstance(value, expected_types[key]):
                    raise ValueError(
                        f"Invalid type for {key}: expected "
                        f"{expected_types[key].__name__}, got {type(value).__name__}"
                    )
            return kwargs
        except Exception as e:
            data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
            return {}

    def _parse_limits(self, limits: Optional[dict]) -> dict:
        """Parse multimodal input limits, merging with defaults."""
        if not limits:
            return dict(_DEFAULT_MM_LIMITS)
        try:
            if not isinstance(limits, dict):
                raise ValueError("limit-mm-per-prompt must be a dictionary")
            data_processor_logger.info(f"_parse_limits:{limits}")
            # User-provided limits override defaults per modality.
            return {**_DEFAULT_MM_LIMITS, **limits}
        except Exception as e:
            data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
            return dict(_DEFAULT_MM_LIMITS)

    def _check_mm_limits(self, item):
        """Validate multimodal inputs against configured limits.

        ``item`` is either a multimodal-data dict (modality -> items) or a
        chat ``messages`` list, from which image/video parts are counted.

        Raises:
            ValueError: If any modality exceeds its configured limit.
        """
        if isinstance(item, dict):
            mm_data = item
        else:
            mm_data = {"image": [], "video": []}
            for message in item:
                if isinstance(message.get("content"), list):
                    for part in message["content"]:
                        part_type = part.get("type")
                        if part_type in ("image_url", "image"):
                            mm_data["image"].append(part)
                        elif part_type in ("video_url", "video"):
                            mm_data["video"].append(part)
        for modality, data in mm_data.items():
            if modality in self.limit_mm_per_prompt:
                limit = self.limit_mm_per_prompt[modality]
                if len(data) > limit:
                    raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
        """Return per-modality max token counts, if available."""
        # Only the ERNIE processor exposes this; other model types return None.
        if self.model_type == ERNIE4_5_VL:
            return self.processor.get_mm_max_tokens_per_item(seq_len)
        return None

    def process_request_dict(self, request, max_model_len=None):
        """Process a request dictionary into model inputs.

        Unified template-method flow for all VL model types. Per-model
        differences are handled by small conditional branches rather than
        duplicating the entire pipeline.
        """
        request = self._apply_default_parameters(request)
        if not request.get("eos_token_ids"):
            request["eos_token_ids"] = self.eos_token_ids
        self._process_stop_tokens(request)
        # PaddleOCR-VL historically does not process bad_words.
        if self.model_type != PADDLEOCR_VL:
            self._process_bad_words(request)
        if self.model_type == ERNIE4_5_VL:
            logits_processors_args = self._prepare_think_stop_sentence(
                request.get("logits_processors_args") or {}, max_model_len
            )
            request["logits_processors_args"] = logits_processors_args
        outputs = self._tokenize_request(request)
        self._process_post_tokens(request, outputs)
        if self.model_type in (QWEN_VL, QWEN3_VL):
            request["enable_thinking"] = False
        outputs = self.pack_outputs(outputs)
        # qwen3_vl / ernie4_5_vl may arrive with prompt_token_ids already set;
        # preserve them instead of overwriting with the tokenized output.
        if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"):
            pass  # preserve existing prompt_token_ids
        else:
            request["prompt_token_ids"] = outputs["input_ids"].tolist()
        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
        request["multimodal_inputs"] = outputs
        if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
            # Truncate, leaving at least one slot for generation.
            request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
        if self.model_type == ERNIE4_5_VL:
            logits_processors_args = self._update_thinking_prompt_state(
                request["prompt_token_ids"], request.get("logits_processors_args") or {}
            )
            request["logits_processors_args"] = logits_processors_args
        # Cap max_tokens to the remaining context budget.
        max_tokens = max_model_len - len(request["prompt_token_ids"])
        if request.get("max_tokens") is None:
            request["max_tokens"] = max(1, max_tokens)
        else:
            request["max_tokens"] = min(max_tokens, request["max_tokens"])
        if self.model_type == ERNIE4_5_VL and request.get("reasoning_max_tokens") is None:
            # Default reasoning budget: 80% of max_tokens, at least 1.
            request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
        if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL):
            # Clamp near-zero top_p and force greedy-like top_k.
            if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
                request["top_p"] = _SAMPLING_EPS
                request["top_k"] = 1
        if self.model_type != QWEN3_VL and self.reasoning_parser:
            self._apply_reasoning_parser(request)
        if self.model_type == ERNIE4_5_VL:
            if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
                request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
        data_processor_logger.info(f"Processed request {request}")
        return request

    def _process_stop_tokens(self, request):
        """Handle stop token processing based on model type."""
        if self.model_type == QWEN3_VL:
            stop_sequences = request.get("stop", [])
            if stop_sequences:
                stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
                request["stop_token_ids"] = stop_seqs
                request["stop_seqs_len"] = stop_seqs_len
        else:
            process_stop_token_ids(request, self.update_stop_seq)

    def _process_bad_words(self, request):
        """Process bad_words into token ids."""
        bad_words = request.get("bad_words")
        bad_words_token_ids = request.get("bad_words_token_ids")
        if bad_words:
            bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
            request["bad_words_token_ids"] = bad_words_token_ids

    def _tokenize_request(self, request):
        """Core tokenization dispatch: prompt_token_ids > prompt > messages.

        Raises:
            ValueError: If the request contains none of the accepted inputs
                or has malformed ``chat_template_kwargs``.
        """
        # ERNIE defaults to thinking mode; the Qwen family defaults off.
        default_thinking = True if self.model_type == ERNIE4_5_VL else False
        if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL):
            messages = request.get("messages")
            if messages:
                self._check_mm_limits(messages)
            request.setdefault("enable_thinking", default_thinking)
            return self.processor.prompt_token_ids2outputs(request)
        elif request.get("prompt"):
            multimodal_data = request.get("multimodal_data") or {}
            self._check_mm_limits(multimodal_data)
            images = multimodal_data.get("image", None)
            videos = multimodal_data.get("video", None)
            if self.model_type == ERNIE4_5_VL:
                request["prompt_tokens"] = request.get("prompt")
            request.setdefault("enable_thinking", default_thinking)
            return self.processor.text2ids(request["prompt"], images, videos)
        elif request.get("messages"):
            messages = request["messages"]
            self._check_mm_limits(messages)
            chat_template_kwargs = request.get("chat_template_kwargs")
            if chat_template_kwargs:
                if isinstance(chat_template_kwargs, dict):
                    # Merge template kwargs into the request without
                    # overwriting values the caller already set.
                    for k, v in chat_template_kwargs.items():
                        if k not in request or request[k] is None:
                            request[k] = v
                else:
                    raise ValueError("Invalid input: chat_template_kwargs must be a dict")
            request.setdefault("enable_thinking", default_thinking)
            return self.processor.request2ids(request)
        else:
            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")

    def _process_post_tokens(self, request, outputs):
        """Handle post-tokenization token appending."""
        if self.model_type == PADDLEOCR_VL:
            # PaddleOCR-VL carries resumed tokens under metadata.
            metadata = request.get("metadata")
            if metadata and metadata.get("generated_token_ids"):
                self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"])
        else:
            if request.get("completion_token_ids"):
                self.append_completion_tokens(outputs, request["completion_token_ids"])

    def _apply_reasoning_parser(self, request):
        """Apply reasoning parser and update model status dict."""
        model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
        # request_id may encode "<real_id>_<index>" for n-sampled requests;
        # fan the status out to each of the n sub-requests.
        parts = request["request_id"].split("_")
        if len(parts) > 1:
            real_req_id = parts[0]
            index = int(parts[1])
            n = request.get("n", 1)
            for idx in range(index * n, (index + 1) * n):
                self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
        else:
            self.model_status_dict[request["request_id"]] = model_status
        request["enable_thinking"] = model_status == "think_start"

    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
        """Append completion tokens to existing multimodal outputs."""
        if self.model_type == ERNIE4_5_VL:
            self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids)
        else:
            self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids)

    def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids):
        """Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl."""
        num_tokens = len(completion_token_ids)
        multimodal_inputs["input_ids"].extend(completion_token_ids)
        multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
        pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
        multimodal_inputs["position_ids"].append(pos_ids)
        multimodal_inputs["cur_position"] += num_tokens

    def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids):
        """Append completion tokens for ernie4_5_vl."""
        num_tokens = len(completion_token_ids)
        multimodal_inputs["input_ids"].extend(completion_token_ids)
        multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
        # ERNIE uses 3-D rotary positions; text tokens repeat the same index
        # across all three axes.
        start = multimodal_inputs["cur_position"]
        for i in range(num_tokens):
            multimodal_inputs["position_ids"].append([start + i] * 3)
        multimodal_inputs["cur_position"] += num_tokens

    def pack_outputs(self, outputs):
        """Convert intermediate processing outputs to final format."""
        if not outputs["images"]:
            # Text-only request: null out all image fields.
            outputs["images"] = None
            outputs["grid_thw"] = None
            outputs["image_type_ids"] = None
        else:
            outputs["images"] = np.vstack(outputs["images"])
            outputs["grid_thw"] = np.vstack(outputs["grid_thw"])
            outputs["image_type_ids"] = np.array(outputs["image_type_ids"])
        outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
        outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
        outputs["mm_num_token_func"] = self.processor.mm_num_tokens
        if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
            outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
            outputs["image_patch_id"] = self.processor.image_token_id
            outputs["video_patch_id"] = self.processor.video_token_id
            outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
        else:
            outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
            outputs["image_patch_id"] = self.image_patch_id
        return outputs
@@ -14,216 +14,12 @@
# limitations under the License. # limitations under the License.
""" """
"""Image processor class for Keye.""" # Backward compatibility: this module has been migrated to
# fastdeploy.input.image_processors.paddleocr_processor
# This file will be removed in a future version.
# TODO: Support videos from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401
ImageProcessor,
import json make_batched_images,
from pathlib import Path smart_resize,
from typing import Dict, List, Optional, Union
import numpy as np
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_utils import (
ImageInput,
is_valid_image,
make_list_of_images,
to_numpy_array,
) )
from fastdeploy.input.image_processors.common import (
smart_resize_paddleocr as smart_resize,
)
# OpenAI CLIP per-channel (RGB) normalization statistics.
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Accepts images in list or nested list format, and makes a list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A flat list of images.

    Raises:
        ValueError: If ``images`` is empty or not a recognized image container.
    """
    # Guard emptiness explicitly: the previous implementation indexed
    # images[0] (and images[0][0]) unconditionally, so an empty outer or
    # inner list raised an opaque IndexError instead of the intended
    # ValueError below.
    if isinstance(images, (list, tuple)) and images:
        first = images[0]
        if isinstance(first, (list, tuple)) and first and is_valid_image(first[0]):
            # Nested list of lists -> flatten to a single list of images.
            return [img for img_list in images for img in img_list]
        if is_valid_image(first):
            # Already a flat list of images.
            return images
    elif is_valid_image(images):
        # A single image -> wrap in a list.
        return [images]
    raise ValueError(f"Could not make batched images from {images}")
def adjust_size(size, patch_size):
    """Round *size* down to a length spanning an even number of patches.

    Returns the largest multiple of ``patch_size`` that fits in ``size``
    and corresponds to an even patch count.
    """
    even_patch_count = (size // patch_size) // 2 * 2
    return even_patch_count * patch_size
class ImageProcessor(BaseImageProcessor):
    """Image processor that converts images into patchified pixel values
    plus their [t, h, w] patch-grid dimensions (video input not supported).
    """

    # Names of the model-input tensors this processor can produce.
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: int = 3,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Store preprocessing configuration.

        Args:
            do_resize: Whether to adaptively resize to the pixel bounds.
            resample: PIL resampling filter code (3 = BICUBIC).
            do_rescale: Whether to multiply pixel values by ``rescale_factor``.
            rescale_factor: Pixel scaling factor (default 1/255).
            do_normalize: Whether to subtract mean / divide by std per channel.
            image_mean: Normalization mean (defaults to OpenAI CLIP mean).
            image_std: Normalization std (defaults to OpenAI CLIP std).
            do_convert_rgb: Whether to convert inputs to RGB first.
            min_pixels / max_pixels: Pixel-count bounds for adaptive resize.
            patch_size: Spatial patch size.
            temporal_patch_size: Temporal patch size (must be 1; see _preprocess).
            merge_size: Patch-merge factor between vision encoder and LLM.
        """
        super().__init__()
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
        self.do_convert_rgb = do_convert_rgb

    @classmethod
    def from_pretrained(cls, pretrained_model_dir):
        """Build a processor from ``preprocessor_config.json`` in the given directory."""
        pretrained_model_dir = Path(pretrained_model_dir)
        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
        with open(image_processor_config_path, "r", encoding="utf-8") as f:
            image_processor_config = json.load(f)
        return cls(**image_processor_config)

    def _preprocess(
        self,
        images,
        do_resize: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
    ):
        """Resize/rescale/normalize images and patchify them.

        Returns:
            tuple: (flatten_patches, grid_dimensions [t, h, w])
        """
        images = make_list_of_images(images)
        if do_convert_rgb:
            images = [image.convert("RGB") for image in images]
        # NOTE(review): images are accessed via PIL's .size / .resize here,
        # so inputs are assumed to be PIL images — confirm against callers.
        # The target size is derived from the FIRST image only.
        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                image = image.resize((resized_width, resized_height), resample=self.resample)
            image = to_numpy_array(image)
            if do_rescale:
                image = (image * rescale_factor).astype(np.float32)
            if do_normalize:
                image = image.astype(np.float32)
                image -= np.array(image_mean, dtype=np.float32)
                image /= np.array(image_std, dtype=np.float32)
            processed_images.append(image)
        patches = np.array(processed_images)
        patches = patches.transpose(0, 3, 1, 2)  # [N, H, W, C] -> [N, C, H, W]
        if patches.shape[0] == 1:
            # Replicate a single frame along the temporal axis.
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        # The final reshape below assumes no temporal patching.
        assert self.temporal_patch_size == 1
        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images,
        videos=None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors=None,
    ):
        """Preprocess images into model inputs.

        Any parameter left as None falls back to the instance default.

        Returns:
            BatchFeature with "pixel_values" and "grid_thw".

        Raises:
            NotImplementedError: If ``videos`` is provided.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        if videos is not None:
            raise NotImplementedError("Videos are not yet supported")
        patches, image_grid_thw = self._preprocess(
            images,
            do_resize=do_resize,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_convert_rgb=do_convert_rgb,
        )
        pixel_values = np.array(patches)
        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
+23 -43
View File
@@ -91,54 +91,34 @@ class InputPreprocessor:
tool_parser_obj=tool_parser_obj, tool_parser_obj=tool_parser_obj,
) )
else: else:
from fastdeploy.input.multimodal_processor import (
ERNIE4_5_VL,
PADDLEOCR_VL,
QWEN3_VL,
QWEN_VL,
MultiModalProcessor,
)
if ErnieArchitectures.contains_ernie_arch(architecture): if ErnieArchitectures.contains_ernie_arch(architecture):
from fastdeploy.input.ernie4_5_vl_processor import ( model_type = ERNIE4_5_VL
Ernie4_5_VLProcessor,
)
self.processor = Ernie4_5_VLProcessor(
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj,
tool_parser_obj=tool_parser_obj,
enable_processor_cache=self.enable_processor_cache,
)
elif "PaddleOCRVL" in architecture: elif "PaddleOCRVL" in architecture:
from fastdeploy.input.paddleocr_vl_processor import ( model_type = PADDLEOCR_VL
PaddleOCRVLProcessor,
)
self.processor = PaddleOCRVLProcessor(
config=self.model_config,
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj,
)
elif "Qwen2_5_VL" in architecture: elif "Qwen2_5_VL" in architecture:
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor model_type = QWEN_VL
self.processor = QwenVLProcessor(
config=self.model_config,
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj,
enable_processor_cache=self.enable_processor_cache,
)
elif "Qwen3VL" in architecture: elif "Qwen3VL" in architecture:
from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor model_type = QWEN3_VL
self.processor = Qwen3VLProcessor(
config=self.model_config,
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj,
enable_processor_cache=self.enable_processor_cache,
)
else: else:
raise ValueError(f"Unsupported model processor architecture: {architecture}. ") raise ValueError(f"Unsupported model processor architecture: {architecture}. ")
self.processor = MultiModalProcessor(
model_name_or_path=self.model_name_or_path,
model_type=model_type,
config=self.model_config,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj,
tool_parser_obj=tool_parser_obj,
enable_processor_cache=self.enable_processor_cache,
)
return self.processor return self.processor
@@ -14,320 +14,10 @@
# limitations under the License. # limitations under the License.
""" """
from typing import List, Optional, Union # Backward compatibility: this module has been migrated to
# fastdeploy.input.image_processors.qwen3_processor
# This file will be removed in a future version.
import numpy as np from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
import paddle ImageProcessor,
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
normalize,
rescale,
resize,
to_channel_dimension_format,
) )
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
IMAGE_MEAN = [0.5, 0.5, 0.5]
IMAGE_STD = [0.5, 0.5, 0.5]
MIN_PIXELS = 65536
MAX_PIXELS = 16777216
VideoInput = Union[
List["PIL.Image.Image"],
"np.ndarray",
"paddle.Tensor",
List["np.ndarray"],
List["paddle.Tensor"],
List[List["PIL.Image.Image"]],
List[List["np.ndarray"]],
List[List["paddle.Tensor"]],
]
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.

    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints,
    then flattens each image into vision-transformer patches (see ``_preprocess``).
    """

    def __init__(
        self,
        patch_size: int = 16,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = IMAGE_MEAN,
        image_std: Union[float, List[float]] = IMAGE_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.

        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.resample = resample

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for image preprocessing pipeline.

        Resizes every frame to a common (resized_height, resized_width) derived from
        the FIRST frame's dimensions, applies rescale/normalize, pads the temporal
        dimension to a multiple of ``temporal_patch_size``, and flattens the result
        into per-patch rows.

        Args:
            images: Input image or batch of images
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format

        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches, shape
                  [grid_t * grid_h * grid_w, C * temporal_patch_size * patch_size**2]
                - grid_dimensions: Grid dimensions [t, h, w] as an np.ndarray
        """
        images = make_list_of_images(images)

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        # Get original dimensions and calculate optimal resize dimensions.
        # NOTE: only the first frame is measured; all frames are assumed to share its size.
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )

        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale and do_normalize:
                # Adjust mean and std for combined rescale+normalize:
                # normalize(x * s, m, d) == normalize(x, m/s, d/s), so fold the
                # rescale factor into mean/std once and skip the separate rescale.
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step
            # mutual exclusion and upper branch
            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)

        # Convert processed images to numpy array
        patches = np.array(processed_images)

        # Pad temporal dimension if needed: repeat the last frame until the frame
        # count is a multiple of temporal_patch_size.
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)

        # Convert to channels-first format if needed
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]

        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )

        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.

        Every ``None`` argument falls back to the value configured in ``__init__``.

        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format

        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]

        Raises:
            ValueError: For invalid image types or dimensions
        """
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -14,319 +14,10 @@
# limitations under the License. # limitations under the License.
""" """
from typing import List, Optional, Union # Backward compatibility: this module has been migrated to
# fastdeploy.input.image_processors.qwen_processor
# This file will be removed in a future version.
import numpy as np from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
import paddle ImageProcessor,
import PIL
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
normalize,
rescale,
resize,
to_channel_dimension_format,
) )
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
make_list_of_images,
to_numpy_array,
valid_images,
)
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
VideoInput = Union[
List["PIL.Image.Image"],
"np.ndarray",
"paddle.Tensor",
List["np.ndarray"],
List["paddle.Tensor"],
List[List["PIL.Image.Image"]],
List[List["np.ndarray"]],
List[List["paddle.Tensor"]],
]
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.

    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints,
    then flattens each image into vision-transformer patches (see ``_preprocess``).
    Defaults use the OpenAI CLIP normalization statistics and a 14-pixel patch size.
    """

    def __init__(
        self,
        patch_size: int = 14,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
        image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.

        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize
        self.resample = resample

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for image preprocessing pipeline.

        Resizes every frame to a common (resized_height, resized_width) derived from
        the FIRST frame's dimensions, applies rescale/normalize, pads the temporal
        dimension to a multiple of ``temporal_patch_size``, and flattens the result
        into per-patch rows.

        Args:
            images: Input image or batch of images
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format

        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches, shape
                  [grid_t * grid_h * grid_w, C * temporal_patch_size * patch_size**2]
                - grid_dimensions: Grid dimensions [t, h, w] as an np.ndarray
        """
        images = make_list_of_images(images)

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        # Get original dimensions and calculate optimal resize dimensions.
        # NOTE: only the first frame is measured; all frames are assumed to share its size.
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )

        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale and do_normalize:
                # Adjust mean and std for combined rescale+normalize:
                # normalize(x * s, m, d) == normalize(x, m/s, d/s), so fold the
                # rescale factor into mean/std once and skip the separate rescale.
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step

            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)

        # Convert processed images to numpy array
        patches = np.array(processed_images)

        # Pad temporal dimension if needed: repeat the last frame until the frame
        # count is a multiple of temporal_patch_size.
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)

        # Convert to channels-first format if needed
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]

        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )

        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.

        Every ``None`` argument falls back to the value configured in ``__init__``.

        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format

        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]

        Raises:
            ValueError: For invalid image types or dimensions
        """
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
@@ -340,9 +340,7 @@ class TestImagePreprocessorAdaptive(unittest.TestCase):
# Create a scaled image (values between 0-1) # Create a scaled image (values between 0-1)
img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5 img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5
# Use patch to capture warning # Use patch to capture warning
with patch( with patch("fastdeploy.input.image_processors.adaptive_processor.data_processor_logger") as mock_logger:
"fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.data_processor_logger"
) as mock_logger:
# Directly call _preprocess, pass scaled image # Directly call _preprocess, pass scaled image
self.processor._preprocess( self.processor._preprocess(
[img_array], # Pass scaled numpy array [img_array], # Pass scaled numpy array
@@ -356,9 +354,7 @@ class TestImagePreprocessorAdaptive(unittest.TestCase):
"""Test invalid image check in preprocess (line 464)""" """Test invalid image check in preprocess (line 464)"""
# Test invalid image type - need to ensure valid_images returns False # Test invalid image type - need to ensure valid_images returns False
# Use patch to make valid_images return False, but make_batched_images succeeds # Use patch to make valid_images return False, but make_batched_images succeeds
with patch( with patch("fastdeploy.input.image_processors.adaptive_processor.valid_images") as mock_valid:
"fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.valid_images"
) as mock_valid:
mock_valid.return_value = False mock_valid.return_value = False
valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False
with self.assertRaises(ValueError) as context: with self.assertRaises(ValueError) as context:
File diff suppressed because it is too large Load Diff