mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[DataProcessor] Move image_processor to unified directory and add MultiModalProcessor (#7109)
* first commit * step 9~10 * update multimodal * update multimodal * fix load tokenizer * add unit test * fix unit test & AdaptiveImageProcessor * Delete unused code
This commit is contained in:
@@ -14,7 +14,13 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from .get_image_preprocessor import get_image_preprocessor
|
||||
from .image_preprocessor_adaptive import AdaptiveImageProcessor
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.adaptive_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||
AdaptiveImageProcessor,
|
||||
get_image_preprocessor,
|
||||
)
|
||||
|
||||
__all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"]
|
||||
|
||||
+6
-17
@@ -14,21 +14,10 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""get image preprocessor"""
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.adaptive_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
from .image_preprocessor_adaptive import AdaptiveImageProcessor
|
||||
|
||||
|
||||
def get_image_preprocessor(args):
    """Build the image preprocessor selected by *args*.

    Returns:
        None when ``args.vision_model_name_or_path`` is unset; otherwise an
        ``AdaptiveImageProcessor`` loaded from that pretrained path.
    """
    model_path = args.vision_model_name_or_path
    if model_path is None:
        return None

    data_processor_logger.info("use AdaptiveImageProcessor")
    return AdaptiveImageProcessor.from_pretrained(model_path)
|
||||
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||
get_image_preprocessor,
|
||||
)
|
||||
|
||||
+7
-493
@@ -14,498 +14,12 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""image preprocessor adaptive"""
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.adaptive_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
convert_to_rgb,
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||
AdaptiveImageProcessor,
|
||||
make_batched_images,
|
||||
make_batched_videos,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
is_valid_image,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image
|
||||
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||
|
||||
IMAGE_FACTOR = 28
|
||||
MIN_PIXELS = 4 * 28 * 28
|
||||
MAX_PIXELS = 16384 * 28 * 28
|
||||
MAX_RATIO = 200
|
||||
|
||||
|
||||
VideoInput = Union[
|
||||
List["PIL.Image.Image"],
|
||||
"np.ndarray",
|
||||
"paddle.Tensor",
|
||||
List["np.ndarray"],
|
||||
List["paddle.Tensor"],
|
||||
List[List["PIL.Image.Image"]],
|
||||
List[List["np.ndarrray"]],
|
||||
List[List["paddle.Tensor"]],
|
||||
]
|
||||
|
||||
|
||||
__all__ = [
|
||||
"AdaptiveImageProcessor",
|
||||
]
|
||||
|
||||
|
||||
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Accepts images in list or nested list format, and makes a list of images for preprocessing.

    images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
        The input image.

    Returns:
        list: A list of images.
    """
    is_sequence = isinstance(images, (list, tuple))

    # Nested list of images: flatten one level.
    if is_sequence and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        flattened = []
        for inner in images:
            flattened.extend(inner)
        return flattened

    # Flat list of images: already batched.
    if is_sequence and is_valid_image(images[0]):
        return images

    # Single image: wrap it.
    if is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
|
||||
|
||||
|
||||
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
|
||||
def make_batched_videos(videos) -> List[VideoInput]:
|
||||
"""dummy"""
|
||||
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
|
||||
return videos
|
||||
|
||||
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
|
||||
if isinstance(videos[0], Image.Image):
|
||||
return [videos]
|
||||
elif len(videos[0].shape) == 4:
|
||||
return [list(video) for video in videos]
|
||||
|
||||
elif is_valid_image(videos) and len(videos.shape) == 4:
|
||||
return [list(videos)]
|
||||
|
||||
raise ValueError(f"Could not make batched video from {videos}")
|
||||
|
||||
|
||||
class AdaptiveImageProcessor(BaseImageProcessor):
    r"""
    Constructs a adaptive image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
            in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `56 * 56`):
            The min pixels of the image to resize the image.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The max pixels of the image to resize the image.
        patch_size (`int`, *optional*, defaults to 14):
            The spacial patch size of the vision encoder.
        temporal_conv_size (`int`, *optional*, defaults to 2):
            The temporal conv size in resampler.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    """

    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 56 * 56,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_conv_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Initialize the processor; unset mean/std fall back to the OpenAI CLIP constants."""
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_conv_size = temporal_conv_size
        self.merge_size = merge_size
        # `size` mirrors the pixel bounds; `set_pixels` keeps it in sync.
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb

    def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
        """Update the min/max pixel bounds (and the mirrored `size` dict) in place.

        Args:
            min_pixels: New lower bound on total pixels; must be a non-negative int.
            max_pixels: New upper bound on total pixels; must be a positive int.
            msg: Prefix for the log line identifying the caller.
        """
        if min_pixels is not None:
            assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
            self.min_pixels = min_pixels
            self.size["min_pixels"] = int(min_pixels)
        if max_pixels is not None:
            assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
            self.max_pixels = max_pixels
            self.size["max_pixels"] = int(max_pixels)

    def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
        """Compute the smart-resized size and its patch grid for an input size.

        Returns:
            ((resized_height, resized_width), (grid_h, grid_w)) where the grid is
            the resized size divided by `patch_size`.  Falls back to the
            processor's own pixel bounds when min/max_pixels are not given.
        """
        actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        resized_height, resized_width = smart_resize(
            height,
            width,
            # Resized dims are multiples of patch_size * merge_size.
            factor=self.patch_size * self.merge_size,
            min_pixels=actual_min_pixels,
            max_pixels=actual_max_pixels,
        )
        return (resized_height, resized_width), (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = True,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
                If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`List[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            (flatten_patches, (grid_t, grid_h, grid_w)): flattened patch matrix of
            shape [grid_t * grid_h * grid_w, C * patch_size * patch_size] and the
            time/height/width patch-grid counts.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        # NOTE: the size of the first frame is used for all frames; assumes every
        # frame in a clip shares one resolution — TODO confirm for video inputs.
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []

        if predetermined_grid_thw is not None:
            assert len(predetermined_grid_thw) == len(
                images
            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"

        for img_idx, image in enumerate(images):
            if do_resize:
                if predetermined_grid_thw is not None:
                    # Caller fixed the patch grid: convert grid cells back to pixels.
                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
                    resized_height *= self.patch_size
                    resized_width *= self.patch_size
                else:
                    resized_height, resized_width = smart_resize(
                        height,
                        width,
                        factor=self.patch_size * self.merge_size,
                        min_pixels=self.min_pixels,
                        max_pixels=self.max_pixels,
                    )
                # TODO: the uint8 cast must be done manually here, otherwise the
                # image is divided by 255 a second time and the result is wrong.
                image = image.astype("uint8")
                # Build the PIL image with fromarray directly instead of relying
                # on paddleformers' internal conversion.
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale:
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]

            processed_images.append(image)
        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            # Bring channels forward so the reshape below sees [time, C, H, W].
            patches = patches.transpose([0, 3, 1, 2])

        channel = patches.shape[1]  # [time, C, H, W]
        grid_t = patches.shape[0]
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Split H and W into (merge blocks, merge_size, patch_size) each.
        patches = patches.reshape(
            [
                grid_t,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])

        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.patch_size * self.patch_size,
            ]
        )  # [grid_t * grid_h * grid_w, C * psz * psz]

        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = True,
        size: Optional[Union[int, List[int]]] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `BatchFeature` with `pixel_values`/`image_grid_thw` for images or
            `pixel_values_videos`/`video_grid_thw` for videos.
        """
        # Explicit arguments win; otherwise fall back to the processor defaults.
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if images is not None:
            images = make_batched_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for img_idx, image in enumerate(images):
                # Each image carries its own (optional) predetermined grid entry.
                if predetermined_grid_thw is not None:
                    predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
                else:
                    predetermined_grid_thw_one = None
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw_one,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data = {
                "pixel_values": pixel_values,
                "image_grid_thw": vision_grid_thws,
            }

        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            # NOTE: this loop rebinds the `images` name — each `images` here is
            # one clip's frame list.
            for images in videos:
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)

            # NOTE(review): when both images and videos are given, this rebinds
            # `data` and only the video features are returned; when neither is
            # given, `data` is unbound and the return raises NameError.
            data = {
                "pixel_values_videos": pixel_values,
                "video_grid_thw": vision_grid_thws,
            }

        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
@@ -11,3 +11,17 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||
AdaptiveImageProcessor,
|
||||
get_image_preprocessor,
|
||||
)
|
||||
from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401
|
||||
ImageProcessor as PaddleOCRImageProcessor,
|
||||
)
|
||||
from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
|
||||
ImageProcessor as Qwen3ImageProcessor,
|
||||
)
|
||||
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
|
||||
ImageProcessor as QwenImageProcessor,
|
||||
)
|
||||
|
||||
@@ -0,0 +1,524 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""image preprocessor adaptive"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
convert_to_rgb,
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
is_valid_image,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image
|
||||
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
# Normalization constants matching OpenAI CLIP preprocessing.
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

# Patch-grid defaults; IMAGE_FACTOR = 28 is presumably patch_size (14) *
# merge_size (2) — TODO confirm against the vision encoder config.
IMAGE_FACTOR = 28
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28
MAX_RATIO = 200


# Accepted video containers: a single clip (list of frames or a 4-D
# array/tensor) or a batch of clips.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]


__all__ = [
    "AdaptiveImageProcessor",
    "get_image_preprocessor",
    "make_batched_images",
    "make_batched_videos",
]
|
||||
|
||||
|
||||
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Accepts images in list or nested list format, and makes a list of images for preprocessing.

    images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
        The input image.

    Returns:
        list: A list of images.
    """
    # Nested list of images: flatten one level into a single flat list.
    if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        return [img for img_list in images for img in img_list]

    # Flat list of images: already in the desired shape.
    elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
        return images

    # Single image: wrap in a one-element list.
    elif is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
|
||||
|
||||
|
||||
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
|
||||
def make_batched_videos(videos) -> List[VideoInput]:
|
||||
"""dummy"""
|
||||
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
|
||||
return videos
|
||||
|
||||
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
|
||||
if isinstance(videos[0], Image.Image):
|
||||
return [videos]
|
||||
elif len(videos[0].shape) == 4:
|
||||
return [list(video) for video in videos]
|
||||
|
||||
elif is_valid_image(videos) and len(videos.shape) == 4:
|
||||
return [list(videos)]
|
||||
|
||||
raise ValueError(f"Could not make batched video from {videos}")
|
||||
|
||||
|
||||
class AdaptiveImageProcessor(BaseImageProcessor):
|
||||
r"""
|
||||
Constructs a adaptive image processor that dynamically resizes images based on the original images.
|
||||
|
||||
Args:
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to resize the image's (height, width) dimensions.
|
||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
||||
Resampling filter to use when resizing the image.
|
||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
||||
Whether to rescale the image by the specified scale `rescale_factor`.
|
||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
||||
Scale factor to use if rescaling the image.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image.
|
||||
image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
|
||||
Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
|
||||
image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
|
||||
Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
|
||||
in the image.
|
||||
do_convert_rgb (`bool`, *optional*, defaults to `True`):
|
||||
Whether to convert the image to RGB.
|
||||
min_pixels (`int`, *optional*, defaults to `56 * 56`):
|
||||
The min pixels of the image to resize the image.
|
||||
max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
|
||||
The max pixels of the image to resize the image.
|
||||
patch_size (`int`, *optional*, defaults to 14):
|
||||
The spacial patch size of the vision encoder.
|
||||
temporal_conv_size (`int`, *optional*, defaults to 2):
|
||||
The temporal conv size in resampler.
|
||||
merge_size (`int`, *optional*, defaults to 2):
|
||||
The merge size of the vision encoder to llm encoder.
|
||||
"""
|
||||
|
||||
model_input_names = [
|
||||
"pixel_values",
|
||||
"image_grid_thw",
|
||||
"pixel_values_videos",
|
||||
"video_grid_thw",
|
||||
]
|
||||
|
||||
def __init__(
    self,
    do_resize: bool = True,
    resample: PILImageResampling = PILImageResampling.BICUBIC,
    do_rescale: bool = True,
    rescale_factor: float = 1 / 255,
    do_normalize: bool = True,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_convert_rgb: bool = True,
    min_pixels: int = 56 * 56,
    max_pixels: int = 28 * 28 * 1280,
    patch_size: int = 14,
    temporal_conv_size: int = 2,
    merge_size: int = 2,
    **kwargs,
) -> None:
    """Initialize the adaptive image processor.

    See the class docstring for the semantics of each parameter; extra
    keyword arguments are forwarded to the base image-processor class.
    """
    super().__init__(**kwargs)
    self.do_resize = do_resize
    self.resample = resample
    self.do_rescale = do_rescale
    self.rescale_factor = rescale_factor
    self.do_normalize = do_normalize
    # OpenAI CLIP statistics are the defaults when no explicit mean/std is given.
    self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
    self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
    self.min_pixels = min_pixels
    self.max_pixels = max_pixels
    self.patch_size = patch_size
    self.temporal_conv_size = temporal_conv_size
    self.merge_size = merge_size
    # `size` mirrors the pixel bounds in the dict form; kept in sync by `set_pixels`.
    self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
    self.do_convert_rgb = do_convert_rgb
|
||||
|
||||
def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
    """Update the pixel-count bounds used when adaptively resizing images.

    Both ``self.min_pixels``/``self.max_pixels`` and the mirrored entries in
    ``self.size`` are updated so the two stay consistent.

    Args:
        min_pixels (int, optional): New lower bound on total pixels; must be >= 0
            (0 effectively disables the lower bound). Unchanged when None.
        max_pixels (int, optional): New upper bound on total pixels; must be > 0.
            Unchanged when None.
        msg (str): Prefix for the log message, identifying the caller.

    Raises:
        AssertionError: If a provided bound is not an int or is out of range.
    """
    if min_pixels is not None:
        # The condition accepts 0, so the message must say "non-negative",
        # not "positive" (the old message contradicted the check).
        assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be a non-negative int"
        data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
        self.min_pixels = min_pixels
        self.size["min_pixels"] = int(min_pixels)
    if max_pixels is not None:
        assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
        data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
        self.max_pixels = max_pixels
        self.size["max_pixels"] = int(max_pixels)
|
||||
|
||||
def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
    """Compute the adaptively-resized image size and its patch-grid shape.

    Args:
        height (int): Original image height in pixels.
        width (int): Original image width in pixels.
        min_pixels (int, optional): Per-call override for ``self.min_pixels``.
        max_pixels (int, optional): Per-call override for ``self.max_pixels``.

    Returns:
        tuple: ``((resized_height, resized_width), (grid_h, grid_w))`` — the
        smart-resized pixel dimensions and the corresponding patch-grid
        dimensions (resized size divided by ``self.patch_size``).
    """
    actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
    actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
    # smart_resize keeps both sides multiples of patch_size * merge_size while
    # constraining total pixels to [min_pixels, max_pixels].
    resized_height, resized_width = smart_resize(
        height,
        width,
        factor=self.patch_size * self.merge_size,
        min_pixels=actual_min_pixels,
        max_pixels=actual_max_pixels,
    )
    return (resized_height, resized_width), (
        resized_height // self.patch_size,
        resized_width // self.patch_size,
    )
|
||||
|
||||
def _preprocess(
    self,
    images: Union[ImageInput, VideoInput],
    do_resize: bool = True,
    resample: PILImageResampling = None,
    do_rescale: bool = True,
    rescale_factor: float = 1 / 255,
    do_normalize: bool = True,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_convert_rgb: bool = False,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    predetermined_grid_thw=None,
):
    """
    Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

    Resizes (adaptively or per `predetermined_grid_thw`), rescales, normalizes,
    then flattens the batch into merge-ordered patches.

    Args:
        images (`ImageInput`):
            Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
            If pixel values range from 0 to 1, set `do_rescale=False`.
        do_resize (`bool`, *optional*, defaults to `self.do_resize`):
            Whether to resize the image.
        resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
            Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
        do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
            Mean to use if normalizing the image.
            Can be a float or a list of floats corresponding to the number of channels in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
            Standard deviation to use if normalizing the image.
            Can be a float or a list of floats corresponding to the number of channels in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
            Whether to convert the image to RGB.
        data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
            The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - Unset: Use the channel dimension format of the input image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        predetermined_grid_thw (*optional*):
            Per-image (h, w) patch-grid sizes; when given, overrides `smart_resize`.

    Returns:
        tuple: `(flatten_patches, (grid_t, grid_h, grid_w))`.
    """
    images = make_list_of_images(images)

    if do_convert_rgb:
        images = [convert_to_rgb(image) for image in images]

    # All transformations expect numpy arrays.
    images = [to_numpy_array(image) for image in images]

    if is_scaled_image(images[0]) and do_rescale:
        data_processor_logger.warning(
            "It looks like you are trying to rescale already rescaled images. If the input"
            " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
        )
    if input_data_format is None:
        # We assume that all images have the same channel dimension format.
        input_data_format = infer_channel_dimension_format(images[0])

    height, width = get_image_size(images[0], channel_dim=input_data_format)
    resized_height, resized_width = height, width
    processed_images = []

    if predetermined_grid_thw is not None:
        assert len(predetermined_grid_thw) == len(
            images
        ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"

    for img_idx, image in enumerate(images):
        if do_resize:
            if predetermined_grid_thw is not None:
                # Honor the caller-provided (h, w) patch grid for this frame.
                (resized_height, resized_width) = predetermined_grid_thw[img_idx]
                resized_height *= self.patch_size
                resized_width *= self.patch_size
            else:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
            image = image.astype("uint8")  # TODO: must cast manually, otherwise an extra /255 corrupts the result
            # Build the PIL image with fromarray directly instead of relying on paddleformers.
            image = Image.fromarray(image)
            image = resize(
                image,
                size=(resized_height, resized_width),
                resample=resample,
                data_format=input_data_format,
            )
        if do_rescale:
            image = rescale(image, scale=rescale_factor, data_format=input_data_format)

        if do_normalize:
            image = normalize(
                image=image,
                mean=image_mean,
                std=image_std,
                data_format=input_data_format,
            )

        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]

        processed_images.append(image)
    patches = np.array(processed_images)
    # Force channels-first layout before patchification.
    if data_format == ChannelDimension.LAST:
        patches = patches.transpose([0, 3, 1, 2])

    channel = patches.shape[1]  # [time, C, H, W]
    grid_t = patches.shape[0]
    grid_h, grid_w = (
        resized_height // self.patch_size,
        resized_width // self.patch_size,
    )
    patches = patches.reshape(
        [
            grid_t,
            channel,
            grid_h // self.merge_size,
            self.merge_size,
            self.patch_size,
            grid_w // self.merge_size,
            self.merge_size,
            self.patch_size,
        ]
    )
    # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
    patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])

    flatten_patches = patches.reshape(
        [
            grid_t * grid_h * grid_w,
            channel * self.patch_size * self.patch_size,
        ]
    )  # [grid_t * grid_h * grid_w, C * psz * psz]

    return flatten_patches, (grid_t, grid_h, grid_w)
|
||||
|
||||
def preprocess(
    self,
    images: ImageInput,
    videos: VideoInput = None,
    do_resize: bool = True,
    size: Optional[Union[int, List[int]]] = None,
    resample: PILImageResampling = None,
    do_rescale: bool = True,
    rescale_factor: float = 1 / 255,
    do_normalize: bool = True,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    do_convert_rgb: bool = False,
    return_tensors: Optional[Union[str, TensorType]] = None,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
    predetermined_grid_thw=None,
):
    """
    Preprocess images and/or videos into flattened patch tensors.

    Args:
        images (`ImageInput`):
            Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
            passing in images with pixel values between 0 and 1, set `do_rescale=False`.
        videos (`VideoInput`):
            Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
            passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
        do_resize (`bool`, *optional*, defaults to `self.do_resize`):
            Whether to resize the image.
        size (`Dict[str, int]`, *optional*, defaults to `self.size`):
            Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
            the longest edge resized to keep the input aspect ratio.
        resample (`int`, *optional*, defaults to `self.resample`):
            Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
            has an effect if `do_resize` is set to `True`.
        do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
            Rescale factor to rescale the image by if `do_rescale` is set to `True`.
        do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
            Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
        image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
            Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
            `True`.
        do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
            Whether to convert the image to RGB.
        return_tensors (`str` or `TensorType`, *optional*):
            The type of tensors to return. Can be one of:
            - Unset: Return a list of `np.ndarray`.
            - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`.
            - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
        data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
            The channel dimension format for the output image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - Unset: Use the channel dimension format of the input image.
        input_data_format (`ChannelDimension` or `str`, *optional*):
            The channel dimension format for the input image. If unset, the channel dimension format is inferred
            from the input image. Can be one of:
            - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
            - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        predetermined_grid_thw (*optional*):
            Per-image (h, w) patch-grid sizes forwarded to `_preprocess`.

    Returns:
        `BatchFeature` with keys among `pixel_values`, `image_grid_thw`,
        `pixel_values_videos`, `video_grid_thw`.
    """
    # Fall back to instance-level configuration for any unset option.
    do_resize = do_resize if do_resize is not None else self.do_resize
    size = size if size is not None else self.size
    resample = resample if resample is not None else self.resample
    do_rescale = do_rescale if do_rescale is not None else self.do_rescale
    rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
    do_normalize = do_normalize if do_normalize is not None else self.do_normalize
    image_mean = image_mean if image_mean is not None else self.image_mean
    image_std = image_std if image_std is not None else self.image_std
    do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

    if images is not None:
        images = make_batched_images(images)
    if videos is not None:
        videos = make_batched_videos(videos)

    if images is not None and not valid_images(images):
        raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

    data = {}

    if images is not None:
        pixel_values, vision_grid_thws = [], []
        for img_idx, image in enumerate(images):
            # Slice out this image's predetermined grid, if any (a 1-element list,
            # since _preprocess validates len(grid) == len(images)).
            if predetermined_grid_thw is not None:
                predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
            else:
                predetermined_grid_thw_one = None
            patches, image_grid_thw = self._preprocess(
                image,
                do_resize=do_resize,
                resample=resample,
                do_rescale=do_rescale,
                rescale_factor=rescale_factor,
                do_normalize=do_normalize,
                image_mean=image_mean,
                image_std=image_std,
                data_format=data_format,
                do_convert_rgb=do_convert_rgb,
                input_data_format=input_data_format,
                predetermined_grid_thw=predetermined_grid_thw_one,
            )
            pixel_values.extend(patches)
            vision_grid_thws.append(image_grid_thw)
        pixel_values = np.array(pixel_values)
        vision_grid_thws = np.array(vision_grid_thws)
        data["pixel_values"] = pixel_values
        data["image_grid_thw"] = vision_grid_thws

    if videos is not None:
        pixel_values, vision_grid_thws = [], []
        # NOTE(review): the loop variable shadows the `images` parameter;
        # each video is a list of frames processed in one _preprocess call.
        for images in videos:
            patches, video_grid_thw = self._preprocess(
                images,
                do_resize=do_resize,
                resample=resample,
                do_rescale=do_rescale,
                rescale_factor=rescale_factor,
                do_normalize=do_normalize,
                image_mean=image_mean,
                image_std=image_std,
                data_format=data_format,
                do_convert_rgb=do_convert_rgb,
                input_data_format=input_data_format,
                predetermined_grid_thw=predetermined_grid_thw,
            )
            pixel_values.extend(patches)
            vision_grid_thws.append(video_grid_thw)
        pixel_values = np.array(pixel_values)
        vision_grid_thws = np.array(vision_grid_thws)
        data["pixel_values_videos"] = pixel_values
        data["video_grid_thw"] = vision_grid_thws

    return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
|
||||
def get_image_preprocessor(args):
    """Build an image preprocessor from parsed arguments.

    Returns an ``AdaptiveImageProcessor`` loaded from
    ``args.vision_model_name_or_path``, or ``None`` when no vision model
    path is configured.
    """
    model_path = args.vision_model_name_or_path
    if model_path is None:
        return None

    data_processor_logger.info("use AdaptiveImageProcessor")
    return AdaptiveImageProcessor.from_pretrained(model_path)
|
||||
@@ -0,0 +1,225 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""Image processor class for PaddleOCR-VL."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ImageInput,
|
||||
is_valid_image,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
)
|
||||
|
||||
from fastdeploy.input.image_processors.common import (
|
||||
smart_resize_paddleocr as smart_resize,
|
||||
)
|
||||
|
||||
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||
|
||||
|
||||
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image(s).

    Returns:
        List[ImageInput]: A flat list of images.

    Raises:
        ValueError: If `images` is empty or not a recognized image / list-of-images
            structure. (Previously an empty list raised `IndexError` from
            `images[0]` instead of this explicit error.)
    """
    if isinstance(images, (list, tuple)) and images:
        # Nested list of images: flatten one level.
        if isinstance(images[0], (list, tuple)) and images[0] and is_valid_image(images[0][0]):
            return [img for img_list in images for img in img_list]

        # Already a flat list of images.
        if is_valid_image(images[0]):
            return images

    elif is_valid_image(images):
        # A single image: wrap it in a list.
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
|
||||
|
||||
|
||||
def adjust_size(size, patch_size):
    """Round *size* down to an even number of *patch_size* patches."""
    patch_count = size // patch_size
    # Drop the trailing patch when the count is odd so the result stays even.
    even_count = patch_count - (patch_count % 2)
    return even_count * patch_size
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
    """Image processor for PaddleOCR-VL.

    Resizes images adaptively (via `smart_resize`), rescales/normalizes them,
    and flattens each image into per-patch tensors plus a [t, h, w] grid
    descriptor for the vision encoder.
    """

    # NOTE(review): model_input_names lists image_grid_thw / video keys, but
    # `preprocess` below emits `grid_thw` and rejects videos — confirm intended.
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: int = 3,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Initialize the processor.

        Args:
            do_resize: Whether to adaptively resize inputs.
            resample: PIL resampling filter code (3 == BICUBIC).
            do_rescale: Whether to multiply pixel values by `rescale_factor`.
            rescale_factor: Scale factor applied to raw pixel values.
            image_mean / image_std: Per-channel normalization statistics;
                default to the OpenAI CLIP values.
            do_convert_rgb: Whether to convert inputs to RGB.
            min_pixels / max_pixels: Bounds on total pixels after resizing.
            patch_size: Spatial patch size of the vision encoder.
            temporal_patch_size: Temporal patch size (only 1 is supported;
                asserted in `_preprocess`).
            merge_size: Merge factor between vision and LLM token grids.
            **kwargs: Accepted for config compatibility.
                NOTE(review): not forwarded to the base class — confirm intended.
        """
        super().__init__()
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
        self.do_convert_rgb = do_convert_rgb

    @classmethod
    def from_pretrained(cls, pretrained_model_dir):
        """Build a processor from `preprocessor_config.json` in the given directory."""
        pretrained_model_dir = Path(pretrained_model_dir)
        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
        with open(image_processor_config_path, "r", encoding="utf-8") as f:
            image_processor_config = json.load(f)
        return cls(**image_processor_config)

    def _preprocess(
        self,
        images,
        do_resize: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
    ):
        """Resize/rescale/normalize `images` and flatten them into patches.

        Returns:
            tuple: `(flatten_patches, np.array([grid_t, grid_h, grid_w]))` where
            flatten_patches has shape [grid_t*grid_h*grid_w, C, patch, patch].
        """
        images = make_list_of_images(images)

        # Inputs are handled as PIL images here (convert/size/resize are PIL APIs).
        if do_convert_rgb:
            images = [image.convert("RGB") for image in images]

        # All images in the batch share the first image's dimensions.
        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []

        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                # PIL takes (width, height) order.
                image = image.resize((resized_width, resized_height), resample=self.resample)

            image = to_numpy_array(image)

            if do_rescale:
                image = (image * rescale_factor).astype(np.float32)

            if do_normalize:
                image = image.astype(np.float32)
                image -= np.array(image_mean, dtype=np.float32)
                image /= np.array(image_std, dtype=np.float32)

            processed_images.append(image)

        patches = np.array(processed_images)
        # HWC -> CHW per frame: [time, C, H, W].
        patches = patches.transpose(0, 3, 1, 2)
        # With temporal_patch_size == 1 (asserted below) this tile is a no-op.
        if patches.shape[0] == 1:
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        # -> [grid_t, grid_h, grid_w, C, temporal, patch, patch]
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        assert self.temporal_patch_size == 1
        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images,
        videos=None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors=None,
    ):
        """Preprocess `images` into a BatchFeature with `pixel_values` and `grid_thw`.

        Any option left as None falls back to the instance-level configuration.

        Raises:
            NotImplementedError: If `videos` is provided.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if videos is not None:
            raise NotImplementedError("Videos are not yet supported")

        patches, image_grid_thw = self._preprocess(
            images,
            do_resize=do_resize,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_convert_rgb=do_convert_rgb,
        )
        pixel_values = np.array(patches)
        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
@@ -0,0 +1,333 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
IMAGE_MEAN = [0.5, 0.5, 0.5]
|
||||
IMAGE_STD = [0.5, 0.5, 0.5]
|
||||
|
||||
MIN_PIXELS = 65536
|
||||
MAX_PIXELS = 16777216
|
||||
|
||||
|
||||
VideoInput = Union[
|
||||
List["PIL.Image.Image"],
|
||||
"np.ndarray",
|
||||
"paddle.Tensor",
|
||||
List["np.ndarray"],
|
||||
List["paddle.Tensor"],
|
||||
List[List["PIL.Image.Image"]],
|
||||
List[List["np.ndarray"]],
|
||||
List[List["paddle.Tensor"]],
|
||||
]
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||
|
||||
This processor handles image resizing, rescaling, normalization and format conversion.
|
||||
It dynamically adjusts image dimensions based on original size and specified constraints.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    patch_size: int = 16,
    merge_size: int = 2,
    temporal_patch_size: int = 2,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
    image_mean: Union[float, List[float]] = IMAGE_MEAN,
    image_std: Union[float, List[float]] = IMAGE_STD,
    rescale_factor: float = 1 / 255,
    do_rescale: bool = True,
    do_normalize: bool = True,
    resample: PILImageResampling = PILImageResampling.BICUBIC,
    **kwargs,
) -> None:
    """
    Initialize image processor with configuration parameters.

    Args:
        patch_size (int): Spatial patch size for vision encoder
        merge_size (int): Merge size between vision and LLM encoders
        temporal_patch_size (int): Temporal patch size for video processing
        min_pixels (int): Minimum allowed pixels in resized image
        max_pixels (int): Maximum allowed pixels in resized image
        image_mean (float/list): Mean values for normalization per channel
        image_std (float/list): Std values for normalization per channel
        rescale_factor (float): Scaling factor for pixel values (default 1/255)
        do_rescale (bool): Whether to rescale images
        do_normalize (bool): Whether to normalize images
        resample: Resampling method for image resizing
        **kwargs: Additional base class arguments (forwarded to super().__init__)
    """
    super().__init__(**kwargs)
    # Vision-encoder patch geometry.
    self.patch_size = patch_size
    self.merge_size = merge_size
    self.temporal_patch_size = temporal_patch_size

    # Bounds on total pixels after adaptive resizing.
    self.min_pixels = min_pixels
    self.max_pixels = max_pixels

    # Rescaling / normalization configuration.
    self.image_mean = image_mean
    self.image_std = image_std
    self.rescale_factor = rescale_factor
    self.do_rescale = do_rescale
    self.do_normalize = do_normalize

    self.resample = resample
|
||||
|
||||
def _preprocess(
    self,
    images: Union[ImageInput, VideoInput],
    min_pixels: int,
    max_pixels: int,
    image_mean: Optional[Union[float, List[float]]],
    image_std: Optional[Union[float, List[float]]],
    rescale_factor: float,
    do_rescale: bool,
    do_normalize: bool,
    resample: PILImageResampling,
    data_format: Optional[ChannelDimension],
    input_data_format: Optional[Union[str, ChannelDimension]],
):
    """
    Internal method for the image preprocessing pipeline.

    Resizes each frame to a patch-aligned resolution, applies (optional)
    rescale/normalize, pads the temporal dimension to a multiple of
    ``temporal_patch_size``, and flattens the result into encoder patches.

    Args:
        images: Input image or batch of images (video frames)
        min_pixels: Minimum allowed pixels in output
        max_pixels: Maximum allowed pixels in output
        image_mean: Normalization mean values
        image_std: Normalization std values
        rescale_factor: Pixel value scaling factor
        do_rescale: Whether to rescale pixel values
        do_normalize: Whether to normalize pixel values
        resample: Resampling method
        data_format: Output channel format
        input_data_format: Input channel format (inferred if None)

    Returns:
        tuple: (flatten_patches, grid_dimensions)
            - flatten_patches: Flattened image patches,
              shape [t*h*w, C*temporal_patch_size*patch_size*patch_size]
            - grid_dimensions: Grid dimensions [t, h, w]
    """
    images = make_list_of_images(images)

    # All transformations below expect numpy arrays.
    images = [to_numpy_array(image) for image in images]

    if is_scaled_image(images[0]) and do_rescale:
        data_processor_logger.warning(
            "It looks like you are trying to rescale already rescaled images. If the input"
            " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
        )
    if input_data_format is None:
        # We assume that all images share the same channel dimension format.
        input_data_format = infer_channel_dimension_format(images[0])

    # Use the first frame's size to pick one patch-aligned target resolution
    # for the whole batch.
    height, width = get_image_size(images[0], channel_dim=input_data_format)
    resized_height, resized_width = smart_resize(
        height,
        width,
        factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
        min_pixels=min_pixels,
        max_pixels=max_pixels,
    )

    processed_images = []
    for image in images:
        if height != resized_height or width != resized_width:
            # Convert to uint8 before resizing to avoid double scaling
            image = image.astype("uint8")
            # Convert to PIL Image and resize
            image = Image.fromarray(image)
            image = resize(
                image,
                size=(resized_height, resized_width),
                resample=resample,
                data_format=input_data_format,
            )

        if do_rescale and do_normalize:
            # Fold rescaling into normalization: (x*s - m)/d == (x - m/s)/(d/s).
            # mean/std are adjusted once; do_rescale is cleared so subsequent
            # frames reuse the already-adjusted statistics and skip rescaling.
            image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
            image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
            do_rescale = False  # Skip separate rescale step

        # Rescale and normalize are mutually exclusive at this point: the
        # combined branch above disables do_rescale when both were requested.
        if do_rescale:
            image = image.astype(np.float32)
            image = rescale(image, scale=rescale_factor, data_format=input_data_format)

        if do_normalize:
            image = image.astype(np.float32)
            image = normalize(
                image=image,
                mean=image_mean,
                std=image_std,
                data_format=input_data_format,
            )

        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
        processed_images.append(image)

    # Stack processed frames into a single array.
    patches = np.array(processed_images)

    # Pad the temporal dimension by repeating the last frame until the frame
    # count is a multiple of temporal_patch_size.
    if patches.shape[0] % self.temporal_patch_size != 0:
        repeats = np.repeat(
            patches[-1][np.newaxis],
            self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
            axis=0,
        )
        patches = np.concatenate([patches, repeats], axis=0)

    # Convert to channels-first layout if the caller asked for channels-last.
    if data_format == ChannelDimension.LAST:
        patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]

    grid_t, channel = patches.shape[:2]
    grid_t = grid_t // self.temporal_patch_size

    grid_h, grid_w = (
        resized_height // self.patch_size,
        resized_width // self.patch_size,
    )
    # Reshape into hierarchical patch structure
    patches = patches.reshape(
        [
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h // self.merge_size,
            self.merge_size,
            self.patch_size,
            grid_w // self.merge_size,
            self.merge_size,
            self.patch_size,
        ]
    )
    # Reorder dimensions for better memory access pattern
    # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
    patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])

    flatten_patches = patches.reshape(
        [
            grid_t * grid_h * grid_w,
            channel * self.temporal_patch_size * self.patch_size * self.patch_size,
        ]
    )

    return flatten_patches, np.array([grid_t, grid_h, grid_w])
def preprocess(
    self,
    images: Union[ImageInput, VideoInput],
    min_pixels: Optional[int] = None,
    max_pixels: Optional[int] = None,
    image_mean: Optional[Union[float, List[float]]] = None,
    image_std: Optional[Union[float, List[float]]] = None,
    rescale_factor: Optional[float] = None,
    do_rescale: Optional[bool] = None,
    do_normalize: Optional[bool] = None,
    resample: Optional[PILImageResampling] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
    input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
):
    """
    Main preprocessing entry point for images/videos.

    Any argument left as ``None`` falls back to the value configured on
    the processor instance.

    Args:
        images: Input image/video data
        min_pixels: Override for minimum pixels
        max_pixels: Override for maximum pixels
        image_mean: Override for normalization mean
        image_std: Override for normalization std
        rescale_factor: Override for rescaling factor
        do_rescale: Override for rescaling flag
        do_normalize: Override for normalization flag
        resample: Override for resampling method
        return_tensors: Desired output tensor format
        data_format: Output channel dimension format
        input_data_format: Input channel dimension format

    Returns:
        BatchFeature: Processed features containing:
            - pixel_values: Preprocessed pixel data
            - grid_thw: Grid dimensions [temporal, height, width]

    Raises:
        ValueError: For invalid image types or dimensions
    """
    # Resolve per-call overrides against instance-level defaults.
    if min_pixels is None:
        min_pixels = self.min_pixels
    if max_pixels is None:
        max_pixels = self.max_pixels
    if image_mean is None:
        image_mean = self.image_mean
    if image_std is None:
        image_std = self.image_std
    if rescale_factor is None:
        rescale_factor = self.rescale_factor
    if do_rescale is None:
        do_rescale = self.do_rescale
    if do_normalize is None:
        do_normalize = self.do_normalize
    if resample is None:
        resample = self.resample

    if images is not None and not valid_images(images):
        raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, paddle.Tensor.")

    pixel_values, grid_thw = self._preprocess(
        images,
        min_pixels=min_pixels,
        max_pixels=max_pixels,
        image_mean=image_mean,
        image_std=image_std,
        rescale_factor=rescale_factor,
        do_rescale=do_rescale,
        do_normalize=do_normalize,
        resample=resample,
        data_format=data_format,
        input_data_format=input_data_format,
    )
    return BatchFeature(
        data={"pixel_values": pixel_values, "grid_thw": grid_thw},
        tensor_type=return_tensors,
    )
|
||||
@@ -0,0 +1,332 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
# Per-channel normalization statistics from OpenAI CLIP preprocessing.
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]

# Bounds on total pixel count after adaptive resizing.
# 28 = default patch_size (14) * default merge_size (2).
MIN_PIXELS = 4 * 28 * 28
MAX_PIXELS = 16384 * 28 * 28


# Accepted video input containers: PIL frames, arrays/tensors,
# or (nested) lists of either.
VideoInput = Union[
    List["PIL.Image.Image"],
    "np.ndarray",
    "paddle.Tensor",
    List["np.ndarray"],
    List["paddle.Tensor"],
    List[List["PIL.Image.Image"]],
    List[List["np.ndarray"]],
    List[List["paddle.Tensor"]],
]
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.

    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints.
    """

    def __init__(
        self,
        patch_size: int = 14,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
        image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.

        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        # Patch geometry used when cutting images into encoder patches.
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size

        # Pixel-count bounds enforced by the adaptive resize step.
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels

        # Rescale / normalization configuration.
        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize

        self.resample = resample

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for the image preprocessing pipeline.

        Resizes each frame to a patch-aligned resolution, applies (optional)
        rescale/normalize, pads the temporal dimension to a multiple of
        ``temporal_patch_size``, and flattens the result into encoder patches.

        Args:
            images: Input image or batch of images (video frames)
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format (inferred if None)

        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches,
                  shape [t*h*w, C*temporal_patch_size*patch_size*patch_size]
                - grid_dimensions: Grid dimensions [t, h, w]
        """
        images = make_list_of_images(images)

        # All transformations below expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images share the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        # Use the first frame's size to pick one patch-aligned target
        # resolution for the whole batch.
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )

        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )

            if do_rescale and do_normalize:
                # Fold rescaling into normalization: (x*s - m)/d == (x - m/s)/(d/s).
                # mean/std are adjusted once; do_rescale is cleared so subsequent
                # frames reuse the already-adjusted statistics and skip rescaling.
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step

            # Rescale and normalize are mutually exclusive at this point: the
            # combined branch above disables do_rescale when both were requested.
            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)

        # Stack processed frames into a single array.
        patches = np.array(processed_images)

        # Pad the temporal dimension by repeating the last frame until the
        # frame count is a multiple of temporal_patch_size.
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)

        # Convert to channels-first layout if the caller asked for channels-last.
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]

        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size

        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])

        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )

        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.

        Any argument left as ``None`` falls back to the value configured on
        the processor instance.

        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format

        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]

        Raises:
            ValueError: For invalid image types or dimensions
        """
        # Resolve per-call overrides against instance-level defaults.
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
@@ -0,0 +1,453 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""Unified multimodal processor for all VL model types.
|
||||
|
||||
Consolidates the four separate VL processor wrappers (QwenVLProcessor,
|
||||
Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a
|
||||
single class that dispatches per ``model_type``.
|
||||
"""
|
||||
|
||||
from collections.abc import Mapping
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import numpy as np
|
||||
|
||||
from fastdeploy.input.base_processor import BaseTextProcessor
|
||||
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
# Canonical model-type identifiers handled by MultiModalProcessor.
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"

_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL}

# Expected value types for user-supplied mm_processor_kwargs on
# Qwen-family (and PaddleOCR) models; used for validation only.
_QWEN_EXPECTED_KWARGS = {
    "video_max_frames": int,
    "video_min_frames": int,
}

# Expected value types for user-supplied mm_processor_kwargs on
# ERNIE 4.5 VL models; used for validation only.
_ERNIE_EXPECTED_KWARGS = {
    "spatial_conv_size": int,
    "temporal_conv_size": int,
    "image_min_pixels": int,
    "image_max_pixels": int,
    "video_min_pixels": int,
    "video_max_pixels": int,
    "video_target_frames": int,
    "video_frames_sample": str,
    "video_max_frames": int,
    "video_min_frames": int,
    "video_fps": int,
}

# Default per-prompt multimodal item limits; user-provided limits are
# merged on top of these.
_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}

# NOTE(review): epsilon presumably used for float comparisons of sampling
# parameters — its usage is not visible in this chunk; confirm before relying on it.
_SAMPLING_EPS = 1e-5
|
||||
|
||||
|
||||
class MultiModalProcessor(BaseTextProcessor):
|
||||
"""Unified multimodal processor for all supported VL model types.
|
||||
|
||||
Dispatches image-processor creation, config initialisation, and
|
||||
encoding logic based on ``model_type``.
|
||||
"""
|
||||
|
||||
def __init__(
    self,
    model_name_or_path: str,
    model_type: str,
    config=None,
    limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
    mm_processor_kwargs: Optional[Dict[str, Any]] = None,
    reasoning_parser_obj=None,
    tool_parser_obj=None,
    enable_processor_cache: bool = False,
):
    """Build a processor for the given checkpoint and VL model type.

    Args:
        model_name_or_path: Checkpoint directory or model identifier.
        model_type: One of the supported VL model-type constants.
        config: Optional model config (used e.g. for vision_config hints).
        limit_mm_per_prompt: Optional per-modality item limits.
        mm_processor_kwargs: Optional kwargs forwarded to the inner processor.
        reasoning_parser_obj: Optional reasoning parser.
        tool_parser_obj: Optional tool-call parser.
        enable_processor_cache: Whether the inner processor caches results.

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    # Reject unknown model types up front, before any state is touched.
    if model_type not in _SUPPORTED_MODEL_TYPES:
        raise ValueError(f"Unsupported model_type '{model_type}'. Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}.")
    self.model_type = model_type
    self.config = config
    self.enable_processor_cache = enable_processor_cache

    # ERNIE checkpoints need their dedicated tokenizer implementation.
    tok_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto"

    super().__init__(
        model_name_or_path,
        tokenizer_type=tok_type,
        reasoning_parser_obj=reasoning_parser_obj,
        tool_parser_obj=tool_parser_obj,
    )

    data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")

    parsed_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
    self._init_mm_processor(parsed_kwargs)
    self._init_mm_config()
    self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
|
||||
|
||||
def _load_tokenizer(self):
    """Load the tokenizer matching this model type.

    ERNIE 4.5 models use their dedicated sentencepiece tokenizer whose
    vocab file name varies across checkpoints; everything else goes
    through paddleformers' AutoTokenizer.
    """
    if self.tokenizer_type != "ernie4_5":
        from paddleformers.transformers import AutoTokenizer

        return AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)

    import os

    from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer

    # Probe for whichever vocab file this checkpoint actually ships and
    # point the tokenizer class at it before loading.
    candidates = ["tokenizer.model", "spm.model", "ernie_token_100k.model"]
    found = next(
        (name for name in candidates if os.path.exists(os.path.join(self.model_name_or_path, name))),
        None,
    )
    if found is not None:
        Ernie4_5Tokenizer.resource_files_names["vocab_file"] = found
    return Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
|
||||
|
||||
def _init_mm_processor(self, processor_kwargs: dict):
    """Instantiate the model-type-specific DataProcessor as ``self.processor``."""
    if self.model_type in (QWEN_VL, PADDLEOCR_VL):
        # Qwen-VL and PaddleOCR-VL construct their processors identically;
        # only the import path differs. Both read a tokens-per-second hint
        # from the vision config (default 2).
        if self.model_type == QWEN_VL:
            from fastdeploy.input.qwen_vl_processor.process import DataProcessor
        else:
            from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor

        tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
        self.processor = DataProcessor(
            model_path=self.model_name_or_path,
            enable_processor_cache=self.enable_processor_cache,
            tokens_per_second=tokens_per_second,
            tokenizer=self.tokenizer,
            **processor_kwargs,
        )
    elif self.model_type == QWEN3_VL:
        from fastdeploy.input.qwen3_vl_processor.process import DataProcessor

        self.processor = DataProcessor(
            model_path=self.model_name_or_path,
            enable_processor_cache=self.enable_processor_cache,
            tokenizer=self.tokenizer,
            **processor_kwargs,
        )
    elif self.model_type == ERNIE4_5_VL:
        from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor

        self.processor = DataProcessor(
            tokenizer_name=self.model_name_or_path,
            image_preprocessor_name=self.model_name_or_path,
            enable_processor_cache=self.enable_processor_cache,
            **processor_kwargs,
        )
        # Only the ERNIE processor carries model modules that must be
        # switched to eval mode.
        self.processor.eval()
|
||||
|
||||
def _init_mm_config(self):
    """Copy model-type-specific multimodal attributes off the inner processor."""
    proc = self.processor
    if self.model_type in (QWEN_VL, QWEN3_VL):
        # Qwen processors name the placeholder id "image_token_id".
        self.image_patch_id = proc.image_token_id
    elif self.model_type == PADDLEOCR_VL:
        self.image_patch_id = proc.image_patch_id
    elif self.model_type == ERNIE4_5_VL:
        self.image_patch_id = proc.image_patch_id
        # ERNIE additionally exposes its spatial conv size.
        self.spatial_conv_size = proc.spatial_conv_size
|
||||
|
||||
def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
    """Validate user-supplied multimodal processor kwargs.

    Returns the kwargs unchanged when valid. Best-effort: on any
    validation failure a warning is logged and ``{}`` is returned
    rather than raising to the caller.
    """
    if not kwargs:
        return {}

    try:
        if not isinstance(kwargs, dict):
            raise ValueError("mm-processor-kwargs must be a dictionary")

        data_processor_logger.info(f"Processing kwargs: {kwargs}")

        # Pick the expected-type table for this model family.
        expected = _ERNIE_EXPECTED_KWARGS if self.model_type == ERNIE4_5_VL else _QWEN_EXPECTED_KWARGS

        for key, value in kwargs.items():
            wanted = expected.get(key)
            if wanted is not None and not isinstance(value, wanted):
                raise ValueError(f"Invalid type for {key}: expected {wanted.__name__}, got {type(value).__name__}")
        return kwargs

    except Exception as e:
        data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
        return {}
|
||||
|
||||
def _parse_limits(self, limits: Optional[dict]) -> dict:
    """Merge user-provided per-modality limits over the defaults.

    Best-effort: invalid input logs a warning and falls back to the
    default limits instead of raising.
    """
    if not limits:
        return dict(_DEFAULT_MM_LIMITS)

    try:
        if not isinstance(limits, dict):
            raise ValueError("limit-mm-per-prompt must be a dictionary")
        data_processor_logger.info(f"_parse_limits:{limits}")
        merged = dict(_DEFAULT_MM_LIMITS)
        merged.update(limits)
        return merged
    except Exception as e:
        data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
        return dict(_DEFAULT_MM_LIMITS)
|
||||
|
||||
def _check_mm_limits(self, item):
    """Raise ValueError when a prompt exceeds the per-modality item limits.

    ``item`` is either a modality->items mapping, or a chat-message list
    whose image/video content parts are collected first.
    """
    if isinstance(item, dict):
        # Already a modality -> items mapping.
        mm_data = item
    else:
        # Chat-message list: gather image/video parts from every message.
        mm_data = {"image": [], "video": []}
        for message in item:
            content = message.get("content")
            if not isinstance(content, list):
                continue
            for part in content:
                kind = part.get("type")
                if kind in ("image_url", "image"):
                    mm_data["image"].append(part)
                elif kind in ("video_url", "video"):
                    mm_data["video"].append(part)

    for modality, data in mm_data.items():
        if modality not in self.limit_mm_per_prompt:
            continue
        limit = self.limit_mm_per_prompt[modality]
        if len(data) > limit:
            raise ValueError(f"Too many {modality} items in prompt, got {len(data)} but limit is {limit}")
|
||||
|
||||
def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
    """Return per-modality max token counts; only ERNIE exposes this."""
    if self.model_type != ERNIE4_5_VL:
        return None
    return self.processor.get_mm_max_tokens_per_item(seq_len)
|
||||
|
||||
def process_request_dict(self, request, max_model_len=None):
    """Process a request dictionary into model inputs.

    Unified template-method flow for all VL model types. Per-model
    differences are handled by small conditional branches rather than
    duplicating the entire pipeline.

    Args:
        request: Mutable request dict; updated in place and returned.
        max_model_len: Optional context-length cap used to truncate the
            prompt and to budget ``max_tokens``. When None, no length
            budgeting is performed.

    Returns:
        The same request dict, augmented with tokenized inputs.
    """
    request = self._apply_default_parameters(request)

    if not request.get("eos_token_ids"):
        request["eos_token_ids"] = self.eos_token_ids

    self._process_stop_tokens(request)

    if self.model_type != PADDLEOCR_VL:
        self._process_bad_words(request)

    if self.model_type == ERNIE4_5_VL:
        logits_processors_args = self._prepare_think_stop_sentence(
            request.get("logits_processors_args") or {}, max_model_len
        )
        request["logits_processors_args"] = logits_processors_args

    outputs = self._tokenize_request(request)

    self._process_post_tokens(request, outputs)

    if self.model_type in (QWEN_VL, QWEN3_VL):
        request["enable_thinking"] = False

    outputs = self.pack_outputs(outputs)

    if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"):
        pass  # preserve existing prompt_token_ids
    else:
        request["prompt_token_ids"] = outputs["input_ids"].tolist()
    request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
    request["multimodal_inputs"] = outputs

    if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
        request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]

    if self.model_type == ERNIE4_5_VL:
        logits_processors_args = self._update_thinking_prompt_state(
            request["prompt_token_ids"], request.get("logits_processors_args") or {}
        )
        request["logits_processors_args"] = logits_processors_args

    # Budget max_tokens against the remaining context window. Guard against
    # max_model_len=None (the parameter default), which previously raised a
    # TypeError on the subtraction below.
    if max_model_len is not None:
        max_tokens = max_model_len - len(request["prompt_token_ids"])
        if request.get("max_tokens") is None:
            request["max_tokens"] = max(1, max_tokens)
        else:
            request["max_tokens"] = min(max_tokens, request["max_tokens"])

    if (
        self.model_type == ERNIE4_5_VL
        and request.get("reasoning_max_tokens") is None
        and request.get("max_tokens") is not None
    ):
        # Default the thinking budget to 80% of the generation budget.
        request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)

    if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL):
        # A near-zero top_p effectively requests greedy decoding; clamp it
        # and force top_k=1 for numerical stability.
        if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
            request["top_p"] = _SAMPLING_EPS
            request["top_k"] = 1

    if self.model_type != QWEN3_VL and self.reasoning_parser:
        self._apply_reasoning_parser(request)

    if self.model_type == ERNIE4_5_VL:
        if (
            request.get("response_max_tokens") is not None
            and request.get("enable_thinking") is False
            and request.get("max_tokens") is not None
        ):
            request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])

    data_processor_logger.info(f"Processed request {request}")
    return request
|
||||
|
||||
def _process_stop_tokens(self, request):
    """Handle stop-token processing according to the model type."""
    if self.model_type != QWEN3_VL:
        # Generic path shared by all non-qwen3 models.
        process_stop_token_ids(request, self.update_stop_seq)
        return

    stop_sequences = request.get("stop", [])
    if not stop_sequences:
        return
    stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
    request["stop_token_ids"] = stop_seqs
    request["stop_seqs_len"] = stop_seqs_len
|
||||
|
||||
def _process_bad_words(self, request):
    """Convert any bad_words strings in the request into token ids."""
    words = request.get("bad_words")
    if not words:
        return
    # Merge with any token ids the caller already supplied.
    request["bad_words_token_ids"] = self.update_bad_words(words, request.get("bad_words_token_ids"))
|
||||
|
||||
def _tokenize_request(self, request):
    """Core tokenization dispatch, in priority order:
    prompt_token_ids > prompt > messages. Raises ValueError when none is present.
    """
    # ERNIE defaults to thinking mode; the qwen family does not.
    default_thinking = self.model_type == ERNIE4_5_VL

    if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL):
        messages = request.get("messages")
        if messages:
            self._check_mm_limits(messages)
        request.setdefault("enable_thinking", default_thinking)
        return self.processor.prompt_token_ids2outputs(request)

    if request.get("prompt"):
        mm_data = request.get("multimodal_data") or {}
        self._check_mm_limits(mm_data)
        if self.model_type == ERNIE4_5_VL:
            request["prompt_tokens"] = request.get("prompt")
        request.setdefault("enable_thinking", default_thinking)
        return self.processor.text2ids(
            request["prompt"],
            mm_data.get("image", None),
            mm_data.get("video", None),
        )

    if request.get("messages"):
        messages = request["messages"]
        self._check_mm_limits(messages)
        chat_template_kwargs = request.get("chat_template_kwargs")
        if chat_template_kwargs:
            if not isinstance(chat_template_kwargs, dict):
                raise ValueError("Invalid input: chat_template_kwargs must be a dict")
            # Template kwargs only fill fields the request does not already set.
            for key, value in chat_template_kwargs.items():
                if request.get(key) is None:
                    request[key] = value
        request.setdefault("enable_thinking", default_thinking)
        return self.processor.request2ids(request)

    raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
|
||||
|
||||
def _process_post_tokens(self, request, outputs):
    """Append any pre-generated/completion tokens after tokenization."""
    if self.model_type == PADDLEOCR_VL:
        # PaddleOCR carries its continuation tokens inside request metadata.
        metadata = request.get("metadata") or {}
        if metadata.get("generated_token_ids"):
            self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"])
        return

    completion_token_ids = request.get("completion_token_ids")
    if completion_token_ids:
        self.append_completion_tokens(outputs, completion_token_ids)
|
||||
|
||||
def _apply_reasoning_parser(self, request):
    """Run the reasoning parser and record per-request model status."""
    model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
    request_id = request["request_id"]
    parts = request_id.split("_")
    if len(parts) > 1:
        # Composite id "<real_id>_<index>": fan the status out to every
        # sampled candidate of this request.
        real_req_id, index = parts[0], int(parts[1])
        n = request.get("n", 1)
        for idx in range(index * n, (index + 1) * n):
            self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
    else:
        self.model_status_dict[request_id] = model_status
    request["enable_thinking"] = model_status == "think_start"
|
||||
|
||||
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
    """Append completion tokens to existing multimodal outputs."""
    # ERNIE uses its own 3-axis position-id layout; everything else shares
    # the qwen-style appender.
    appender = (
        self._append_completion_tokens_ernie
        if self.model_type == ERNIE4_5_VL
        else self._append_completion_tokens_qwen
    )
    appender(multimodal_inputs, completion_token_ids)
|
||||
|
||||
def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids):
    """Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl outputs."""
    count = len(completion_token_ids)
    multimodal_inputs["input_ids"].extend(completion_token_ids)
    multimodal_inputs["token_type_ids"].extend([0] * count)

    # Extend position ids for the newly appended text tokens and advance
    # the running cursor.
    new_positions = self.processor._compute_text_positions(multimodal_inputs["cur_position"], count)
    multimodal_inputs["position_ids"].append(new_positions)
    multimodal_inputs["cur_position"] += count
|
||||
|
||||
def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids):
    """Append completion tokens for ernie4_5_vl outputs (3-axis position ids)."""
    count = len(completion_token_ids)
    multimodal_inputs["input_ids"].extend(completion_token_ids)
    multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * count)

    # Each text token repeats its linear position across all 3 axes.
    base = multimodal_inputs["cur_position"]
    multimodal_inputs["position_ids"].extend([base + offset] * 3 for offset in range(count))
    multimodal_inputs["cur_position"] += count
|
||||
|
||||
def pack_outputs(self, outputs):
    """Convert intermediate processing outputs to the final packed format."""
    if outputs["images"]:
        outputs["images"] = np.vstack(outputs["images"])
        outputs["grid_thw"] = np.vstack(outputs["grid_thw"])
        outputs["image_type_ids"] = np.array(outputs["image_type_ids"])
    else:
        # No visual inputs in this request.
        outputs["images"] = None
        outputs["grid_thw"] = None
        outputs["image_type_ids"] = None

    outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
    outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
    outputs["mm_num_token_func"] = self.processor.mm_num_tokens

    if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
        # Qwen-style processors emit per-segment position ids; concatenate
        # along the sequence axis, then transpose to sequence-major layout.
        position_ids = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
        outputs["position_ids"] = position_ids.transpose(1, 0)
        outputs["image_patch_id"] = self.processor.image_token_id
        outputs["video_patch_id"] = self.processor.video_token_id
    else:
        outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
        outputs["image_patch_id"] = self.image_patch_id

    return outputs
|
||||
@@ -14,216 +14,12 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""Image processor class for Keye."""
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.paddleocr_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
# TODO: Support videos
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ImageInput,
|
||||
is_valid_image,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401
|
||||
ImageProcessor,
|
||||
make_batched_images,
|
||||
smart_resize,
|
||||
)
|
||||
|
||||
from fastdeploy.input.image_processors.common import (
|
||||
smart_resize_paddleocr as smart_resize,
|
||||
)
|
||||
|
||||
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||
|
||||
|
||||
def make_batched_images(images) -> List[List[ImageInput]]:
    """
    Accepts images in list or nested list format, and makes a flat list of
    images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A list of images.
    """
    is_sequence = isinstance(images, (list, tuple))

    # Nested list/tuple of images -> flatten one level.
    if is_sequence and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
        return [img for inner in images for img in inner]

    # Already a flat list/tuple of images.
    if is_sequence and is_valid_image(images[0]):
        return images

    # A single image -> wrap in a list.
    if is_valid_image(images):
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
|
||||
|
||||
|
||||
def adjust_size(size, patch_size):
    """Round *size* down to an even number of patches, expressed in pixels.

    Returns the largest multiple of ``patch_size`` not exceeding ``size``
    whose patch count is even (vision encoders with 2x merge need this).
    """
    num_patches = size // patch_size
    # Drop a trailing odd patch so the count is always even.
    return (num_patches - num_patches % 2) * patch_size
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
    """Image processor for PaddleOCR-VL style vision inputs.

    Resizes images to patch-aligned dimensions, optionally rescales and
    normalizes pixel values, and flattens the result into vision-encoder
    patches together with the [t, h, w] patch-grid shape.
    """

    # Output keys this processor may emit in its BatchFeature.
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: int = 3,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Initialize the processor.

        Args:
            do_resize: Whether to resize images to patch-aligned dimensions.
            resample: PIL resampling filter id used when resizing.
            do_rescale: Whether to multiply pixel values by ``rescale_factor``.
            rescale_factor: Scaling factor applied to raw pixel values.
            do_normalize: Whether to normalize with ``image_mean``/``image_std``.
            image_mean: Per-channel mean (defaults to the OpenAI CLIP mean).
            image_std: Per-channel std (defaults to the OpenAI CLIP std).
            do_convert_rgb: Whether to convert inputs to RGB first.
            min_pixels: Lower bound on resized image area.
            max_pixels: Upper bound on resized image area.
            patch_size: Spatial patch size of the vision encoder.
            temporal_patch_size: Temporal patch size (must be 1; asserted in
                ``_preprocess``).
            merge_size: Patch-merge factor between vision encoder and LLM.
            **kwargs: Extra config keys; ignored (not forwarded to the base).
        """
        super().__init__()
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
        self.do_convert_rgb = do_convert_rgb

    @classmethod
    def from_pretrained(cls, pretrained_model_dir):
        """Build a processor from a directory containing preprocessor_config.json."""
        pretrained_model_dir = Path(pretrained_model_dir)
        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
        with open(image_processor_config_path, "r", encoding="utf-8") as f:
            image_processor_config = json.load(f)
        return cls(**image_processor_config)

    def _preprocess(
        self,
        images,
        do_resize: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
    ):
        """Resize/rescale/normalize PIL images and flatten into patches.

        Returns:
            tuple: ``(flatten_patches, grid)`` where ``flatten_patches`` has
            shape ``[t*h*w, C, patch, patch]`` and ``grid`` is ``[t, h, w]``.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [image.convert("RGB") for image in images]

        # NOTE(review): all images in the batch are assumed to share the
        # dimensions of the first one — confirm with callers.
        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []

        for image in images:
            if do_resize:
                # Pick patch-aligned dimensions within the pixel budget.
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                image = image.resize((resized_width, resized_height), resample=self.resample)

            image = to_numpy_array(image)

            if do_rescale:
                image = (image * rescale_factor).astype(np.float32)

            if do_normalize:
                image = image.astype(np.float32)
                image -= np.array(image_mean, dtype=np.float32)
                image /= np.array(image_std, dtype=np.float32)

            processed_images.append(image)

        patches = np.array(processed_images)
        # [N, H, W, C] -> [N, C, H, W]
        patches = patches.transpose(0, 3, 1, 2)
        if patches.shape[0] == 1:
            # Duplicate a single frame to fill the temporal patch dimension.
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

        # Split spatial dims into a patch grid.
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        # Flattening below folds the temporal axis away, so it must be 1.
        assert self.temporal_patch_size == 1
        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images,
        videos=None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors=None,
    ):
        """Preprocess images into model-ready pixel values.

        Per-call arguments override the instance defaults. Videos are not
        supported and raise NotImplementedError.

        Returns:
            BatchFeature with ``pixel_values`` and ``grid_thw``.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if videos is not None:
            raise NotImplementedError("Videos are not yet supported")

        patches, image_grid_thw = self._preprocess(
            images,
            do_resize=do_resize,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_convert_rgb=do_convert_rgb,
        )
        pixel_values = np.array(patches)
        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
@@ -91,54 +91,34 @@ class InputPreprocessor:
|
||||
tool_parser_obj=tool_parser_obj,
|
||||
)
|
||||
else:
|
||||
from fastdeploy.input.multimodal_processor import (
|
||||
ERNIE4_5_VL,
|
||||
PADDLEOCR_VL,
|
||||
QWEN3_VL,
|
||||
QWEN_VL,
|
||||
MultiModalProcessor,
|
||||
)
|
||||
|
||||
if ErnieArchitectures.contains_ernie_arch(architecture):
|
||||
from fastdeploy.input.ernie4_5_vl_processor import (
|
||||
Ernie4_5_VLProcessor,
|
||||
)
|
||||
|
||||
self.processor = Ernie4_5_VLProcessor(
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
tool_parser_obj=tool_parser_obj,
|
||||
enable_processor_cache=self.enable_processor_cache,
|
||||
)
|
||||
model_type = ERNIE4_5_VL
|
||||
elif "PaddleOCRVL" in architecture:
|
||||
from fastdeploy.input.paddleocr_vl_processor import (
|
||||
PaddleOCRVLProcessor,
|
||||
)
|
||||
|
||||
self.processor = PaddleOCRVLProcessor(
|
||||
config=self.model_config,
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
)
|
||||
model_type = PADDLEOCR_VL
|
||||
elif "Qwen2_5_VL" in architecture:
|
||||
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
|
||||
|
||||
self.processor = QwenVLProcessor(
|
||||
config=self.model_config,
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
enable_processor_cache=self.enable_processor_cache,
|
||||
)
|
||||
model_type = QWEN_VL
|
||||
elif "Qwen3VL" in architecture:
|
||||
from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor
|
||||
|
||||
self.processor = Qwen3VLProcessor(
|
||||
config=self.model_config,
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
enable_processor_cache=self.enable_processor_cache,
|
||||
)
|
||||
model_type = QWEN3_VL
|
||||
else:
|
||||
raise ValueError(f"Unsupported model processor architecture: {architecture}. ")
|
||||
|
||||
self.processor = MultiModalProcessor(
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
model_type=model_type,
|
||||
config=self.model_config,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj,
|
||||
tool_parser_obj=tool_parser_obj,
|
||||
enable_processor_cache=self.enable_processor_cache,
|
||||
)
|
||||
|
||||
return self.processor
|
||||
|
||||
@@ -14,320 +14,10 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.qwen3_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
|
||||
ImageProcessor,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
IMAGE_MEAN = [0.5, 0.5, 0.5]
|
||||
IMAGE_STD = [0.5, 0.5, 0.5]
|
||||
|
||||
MIN_PIXELS = 65536
|
||||
MAX_PIXELS = 16777216
|
||||
|
||||
|
||||
VideoInput = Union[
|
||||
List["PIL.Image.Image"],
|
||||
"np.ndarray",
|
||||
"paddle.Tensor",
|
||||
List["np.ndarray"],
|
||||
List["paddle.Tensor"],
|
||||
List[List["PIL.Image.Image"]],
|
||||
List[List["np.ndarray"]],
|
||||
List[List["paddle.Tensor"]],
|
||||
]
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
    """
    Adaptive image processor for dynamic image resizing and preprocessing.

    This processor handles image resizing, rescaling, normalization and format conversion.
    It dynamically adjusts image dimensions based on original size and specified constraints.
    """

    def __init__(
        self,
        patch_size: int = 16,
        merge_size: int = 2,
        temporal_patch_size: int = 2,
        min_pixels: int = MIN_PIXELS,
        max_pixels: int = MAX_PIXELS,
        image_mean: Union[float, List[float]] = IMAGE_MEAN,
        image_std: Union[float, List[float]] = IMAGE_STD,
        rescale_factor: float = 1 / 255,
        do_rescale: bool = True,
        do_normalize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        **kwargs,
    ) -> None:
        """
        Initialize image processor with configuration parameters.

        Args:
            patch_size (int): Spatial patch size for vision encoder
            merge_size (int): Merge size between vision and LLM encoders
            temporal_patch_size (int): Temporal patch size for video processing
            min_pixels (int): Minimum allowed pixels in resized image
            max_pixels (int): Maximum allowed pixels in resized image
            image_mean (float/list): Mean values for normalization per channel
            image_std (float/list): Std values for normalization per channel
            rescale_factor (float): Scaling factor for pixel values (default 1/255)
            do_rescale (bool): Whether to rescale images
            do_normalize (bool): Whether to normalize images
            resample: Resampling method for image resizing
            **kwargs: Additional base class arguments
        """
        super().__init__(**kwargs)
        self.patch_size = patch_size
        self.merge_size = merge_size
        self.temporal_patch_size = temporal_patch_size

        self.min_pixels = min_pixels
        self.max_pixels = max_pixels

        self.image_mean = image_mean
        self.image_std = image_std
        self.rescale_factor = rescale_factor
        self.do_rescale = do_rescale
        self.do_normalize = do_normalize

        self.resample = resample

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: int,
        max_pixels: int,
        image_mean: Optional[Union[float, List[float]]],
        image_std: Optional[Union[float, List[float]]],
        rescale_factor: float,
        do_rescale: bool,
        do_normalize: bool,
        resample: PILImageResampling,
        data_format: Optional[ChannelDimension],
        input_data_format: Optional[Union[str, ChannelDimension]],
    ):
        """
        Internal method for image preprocessing pipeline.

        Args:
            images: Input image or batch of images
            min_pixels: Minimum allowed pixels in output
            max_pixels: Maximum allowed pixels in output
            image_mean: Normalization mean values
            image_std: Normalization std values
            rescale_factor: Pixel value scaling factor
            do_rescale: Whether to rescale pixel values
            do_normalize: Whether to normalize pixel values
            resample: Resampling method
            data_format: Output channel format
            input_data_format: Input channel format

        Returns:
            tuple: (flatten_patches, grid_dimensions)
                - flatten_patches: Flattened image patches
                - grid_dimensions: Grid dimensions [t, h, w]
        """
        images = make_list_of_images(images)

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        # Get original dimensions and calculate optimal resize dimensions
        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,  # Combine patch and merge factors
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )

        processed_images = []
        for image in images:
            if height != resized_height or width != resized_width:
                # Convert to uint8 before resizing to avoid double scaling
                image = image.astype("uint8")
                # Convert to PIL Image and resize
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )

            if do_rescale and do_normalize:
                # Adjust mean and std for combined rescale+normalize
                image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
                image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
                do_rescale = False  # Skip separate rescale step

            # mutual exclusion and upper branch
            if do_rescale:
                image = image.astype(np.float32)
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = image.astype(np.float32)
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]
            processed_images.append(image)

        # Convert processed images to numpy array
        patches = np.array(processed_images)

        # Pad temporal dimension if needed
        if patches.shape[0] % self.temporal_patch_size != 0:
            repeats = np.repeat(
                patches[-1][np.newaxis],
                self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
                axis=0,
            )
            patches = np.concatenate([patches, repeats], axis=0)

        # Convert to channels-first format if needed
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])  # [N, H, W, C] -> [N, C, H, W]

        grid_t, channel = patches.shape[:2]
        grid_t = grid_t // self.temporal_patch_size

        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Reshape into hierarchical patch structure
        patches = patches.reshape(
            [
                grid_t,
                self.temporal_patch_size,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # Reorder dimensions for better memory access pattern
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
        patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])

        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.temporal_patch_size * self.patch_size * self.patch_size,
            ]
        )

        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        min_pixels: Optional[int] = None,
        max_pixels: Optional[int] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        rescale_factor: Optional[float] = None,
        do_rescale: Optional[bool] = None,
        do_normalize: Optional[bool] = None,
        resample: Optional[PILImageResampling] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
    ):
        """
        Main preprocessing method for images/videos.

        Args:
            images: Input image/video data
            min_pixels: Override for minimum pixels
            max_pixels: Override for maximum pixels
            image_mean: Override for normalization mean
            image_std: Override for normalization std
            rescale_factor: Override for rescaling factor
            do_rescale: Override for rescaling flag
            do_normalize: Override for normalization flag
            resample: Override for resampling method
            return_tensors: Desired output tensor format
            data_format: Output channel dimension format
            input_data_format: Input channel dimension format

        Returns:
            BatchFeature: Processed features containing:
                - pixel_values: Preprocessed pixel data
                - grid_thw: Grid dimensions [temporal, height, width]

        Raises:
            ValueError: For invalid image types or dimensions
        """
        # Per-call arguments override the instance defaults.
        min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        resample = resample if resample is not None else self.resample

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

        pixel_values, grid_thw = self._preprocess(
            images,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
            image_mean=image_mean,
            image_std=image_std,
            rescale_factor=rescale_factor,
            do_rescale=do_rescale,
            do_normalize=do_normalize,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
        )
        data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
@@ -14,319 +14,10 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import List, Optional, Union
|
||||
# Backward compatibility: this module has been migrated to
|
||||
# fastdeploy.input.image_processors.qwen_processor
|
||||
# This file will be removed in a future version.
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
|
||||
ImageProcessor,
|
||||
)
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
get_image_size,
|
||||
infer_channel_dimension_format,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||
|
||||
MIN_PIXELS = 4 * 28 * 28
|
||||
MAX_PIXELS = 16384 * 28 * 28
|
||||
|
||||
|
||||
VideoInput = Union[
|
||||
List["PIL.Image.Image"],
|
||||
"np.ndarray",
|
||||
"paddle.Tensor",
|
||||
List["np.ndarray"],
|
||||
List["paddle.Tensor"],
|
||||
List[List["PIL.Image.Image"]],
|
||||
List[List["np.ndarray"]],
|
||||
List[List["paddle.Tensor"]],
|
||||
]
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||
|
||||
This processor handles image resizing, rescaling, normalization and format conversion.
|
||||
It dynamically adjusts image dimensions based on original size and specified constraints.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: int = 14,
|
||||
merge_size: int = 2,
|
||||
temporal_patch_size: int = 2,
|
||||
min_pixels: int = MIN_PIXELS,
|
||||
max_pixels: int = MAX_PIXELS,
|
||||
image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
|
||||
image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
|
||||
rescale_factor: float = 1 / 255,
|
||||
do_rescale: bool = True,
|
||||
do_normalize: bool = True,
|
||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""
|
||||
Initialize image processor with configuration parameters.
|
||||
|
||||
Args:
|
||||
patch_size (int): Spatial patch size for vision encoder
|
||||
merge_size (int): Merge size between vision and LLM encoders
|
||||
temporal_patch_size (int): Temporal patch size for video processing
|
||||
min_pixels (int): Minimum allowed pixels in resized image
|
||||
max_pixels (int): Maximum allowed pixels in resized image
|
||||
image_mean (float/list): Mean values for normalization per channel
|
||||
image_std (float/list): Std values for normalization per channel
|
||||
rescale_factor (float): Scaling factor for pixel values (default 1/255)
|
||||
do_rescale (bool): Whether to rescale images
|
||||
do_normalize (bool): Whether to normalize images
|
||||
resample: Resampling method for image resizing
|
||||
**kwargs: Additional base class arguments
|
||||
"""
|
||||
super().__init__(**kwargs)
|
||||
self.patch_size = patch_size
|
||||
self.merge_size = merge_size
|
||||
self.temporal_patch_size = temporal_patch_size
|
||||
|
||||
self.min_pixels = min_pixels
|
||||
self.max_pixels = max_pixels
|
||||
|
||||
self.image_mean = image_mean
|
||||
self.image_std = image_std
|
||||
self.rescale_factor = rescale_factor
|
||||
self.do_rescale = do_rescale
|
||||
self.do_normalize = do_normalize
|
||||
|
||||
self.resample = resample
|
||||
|
||||
def _preprocess(
|
||||
self,
|
||||
images: Union[ImageInput, VideoInput],
|
||||
min_pixels: int,
|
||||
max_pixels: int,
|
||||
image_mean: Optional[Union[float, List[float]]],
|
||||
image_std: Optional[Union[float, List[float]]],
|
||||
rescale_factor: float,
|
||||
do_rescale: bool,
|
||||
do_normalize: bool,
|
||||
resample: PILImageResampling,
|
||||
data_format: Optional[ChannelDimension],
|
||||
input_data_format: Optional[Union[str, ChannelDimension]],
|
||||
):
|
||||
"""
|
||||
Internal method for image preprocessing pipeline.
|
||||
|
||||
Args:
|
||||
images: Input image or batch of images
|
||||
min_pixels: Minimum allowed pixels in output
|
||||
max_pixels: Maximum allowed pixels in output
|
||||
image_mean: Normalization mean values
|
||||
image_std: Normalization std values
|
||||
rescale_factor: Pixel value scaling factor
|
||||
do_rescale: Whether to rescale pixel values
|
||||
do_normalize: Whether to normalize pixel values
|
||||
resample: Resampling method
|
||||
data_format: Output channel format
|
||||
input_data_format: Input channel format
|
||||
|
||||
Returns:
|
||||
tuple: (flatten_patches, grid_dimensions)
|
||||
- flatten_patches: Flattened image patches
|
||||
- grid_dimensions: Grid dimensions [t, h, w]
|
||||
"""
|
||||
images = make_list_of_images(images)
|
||||
|
||||
# All transformations expect numpy arrays.
|
||||
images = [to_numpy_array(image) for image in images]
|
||||
|
||||
if is_scaled_image(images[0]) and do_rescale:
|
||||
data_processor_logger.warning(
|
||||
"It looks like you are trying to rescale already rescaled images. If the input"
|
||||
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
||||
)
|
||||
if input_data_format is None:
|
||||
# We assume that all images have the same channel dimension format.
|
||||
input_data_format = infer_channel_dimension_format(images[0])
|
||||
|
||||
# Get original dimensions and calculate optimal resize dimensions
|
||||
height, width = get_image_size(images[0], channel_dim=input_data_format)
|
||||
resized_height, resized_width = smart_resize(
|
||||
height,
|
||||
width,
|
||||
factor=self.patch_size * self.merge_size, # Combine patch and merge factors
|
||||
min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
)
|
||||
|
||||
processed_images = []
|
||||
for image in images:
|
||||
if height != resized_height or width != resized_width:
|
||||
# Convert to uint8 before resizing to avoid double scaling
|
||||
image = image.astype("uint8")
|
||||
# Convert to PIL Image and resize
|
||||
image = Image.fromarray(image)
|
||||
image = resize(
|
||||
image,
|
||||
size=(resized_height, resized_width),
|
||||
resample=resample,
|
||||
data_format=input_data_format,
|
||||
)
|
||||
|
||||
if do_rescale and do_normalize:
|
||||
# Adjust mean and std for combined rescale+normalize
|
||||
image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
|
||||
image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
|
||||
do_rescale = False # Skip separate rescale step
|
||||
|
||||
if do_rescale:
|
||||
image = image.astype(np.float32)
|
||||
image = rescale(image, scale=rescale_factor, data_format=input_data_format)
|
||||
|
||||
if do_normalize:
|
||||
image = image.astype(np.float32)
|
||||
image = normalize(
|
||||
image=image,
|
||||
mean=image_mean,
|
||||
std=image_std,
|
||||
data_format=input_data_format,
|
||||
)
|
||||
|
||||
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
|
||||
processed_images.append(image)
|
||||
|
||||
# Convert processed images to numpy array
|
||||
patches = np.array(processed_images)
|
||||
|
||||
# Pad temporal dimension if needed
|
||||
if patches.shape[0] % self.temporal_patch_size != 0:
|
||||
repeats = np.repeat(
|
||||
patches[-1][np.newaxis],
|
||||
self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
|
||||
axis=0,
|
||||
)
|
||||
patches = np.concatenate([patches, repeats], axis=0)
|
||||
|
||||
# Convert to channels-first format if needed
|
||||
if data_format == ChannelDimension.LAST:
|
||||
patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W]
|
||||
|
||||
grid_t, channel = patches.shape[:2]
|
||||
grid_t = grid_t // self.temporal_patch_size
|
||||
|
||||
grid_h, grid_w = (
|
||||
resized_height // self.patch_size,
|
||||
resized_width // self.patch_size,
|
||||
)
|
||||
# Reshape into hierarchical patch structure
|
||||
patches = patches.reshape(
|
||||
[
|
||||
grid_t,
|
||||
self.temporal_patch_size,
|
||||
channel,
|
||||
grid_h // self.merge_size,
|
||||
self.merge_size,
|
||||
self.patch_size,
|
||||
grid_w // self.merge_size,
|
||||
self.merge_size,
|
||||
self.patch_size,
|
||||
]
|
||||
)
|
||||
# Reorder dimensions for better memory access pattern
|
||||
# [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
|
||||
patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
|
||||
|
||||
flatten_patches = patches.reshape(
|
||||
[
|
||||
grid_t * grid_h * grid_w,
|
||||
channel * self.temporal_patch_size * self.patch_size * self.patch_size,
|
||||
]
|
||||
)
|
||||
|
||||
return flatten_patches, np.array([grid_t, grid_h, grid_w])
|
||||
|
||||
def preprocess(
|
||||
self,
|
||||
images: Union[ImageInput, VideoInput],
|
||||
min_pixels: Optional[int] = None,
|
||||
max_pixels: Optional[int] = None,
|
||||
image_mean: Optional[Union[float, List[float]]] = None,
|
||||
image_std: Optional[Union[float, List[float]]] = None,
|
||||
rescale_factor: Optional[float] = None,
|
||||
do_rescale: Optional[bool] = None,
|
||||
do_normalize: Optional[bool] = None,
|
||||
resample: Optional[PILImageResampling] = None,
|
||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
||||
input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
|
||||
):
|
||||
"""
|
||||
Main preprocessing method for images/videos.
|
||||
|
||||
Args:
|
||||
images: Input image/video data
|
||||
min_pixels: Override for minimum pixels
|
||||
max_pixels: Override for maximum pixels
|
||||
image_mean: Override for normalization mean
|
||||
image_std: Override for normalization std
|
||||
rescale_factor: Override for rescaling factor
|
||||
do_rescale: Override for rescaling flag
|
||||
do_normalize: Override for normalization flag
|
||||
resample: Override for resampling method
|
||||
return_tensors: Desired output tensor format
|
||||
data_format: Output channel dimension format
|
||||
input_data_format: Input channel dimension format
|
||||
|
||||
Returns:
|
||||
BatchFeature: Processed features containing:
|
||||
- pixel_values: Preprocessed pixel data
|
||||
- grid_thw: Grid dimensions [temporal, height, width]
|
||||
|
||||
Raises:
|
||||
ValueError: For invalid image types or dimensions
|
||||
"""
|
||||
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
|
||||
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
|
||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
||||
image_std = image_std if image_std is not None else self.image_std
|
||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
||||
resample = resample if resample is not None else self.resample
|
||||
|
||||
if images is not None and not valid_images(images):
|
||||
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
|
||||
|
||||
pixel_values, grid_thw = self._preprocess(
|
||||
images,
|
||||
min_pixels=min_pixels,
|
||||
max_pixels=max_pixels,
|
||||
image_mean=image_mean,
|
||||
image_std=image_std,
|
||||
rescale_factor=rescale_factor,
|
||||
do_rescale=do_rescale,
|
||||
do_normalize=do_normalize,
|
||||
resample=resample,
|
||||
data_format=data_format,
|
||||
input_data_format=input_data_format,
|
||||
)
|
||||
data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
@@ -340,9 +340,7 @@ class TestImagePreprocessorAdaptive(unittest.TestCase):
|
||||
# Create a scaled image (values between 0-1)
|
||||
img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5
|
||||
# Use patch to capture warning
|
||||
with patch(
|
||||
"fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.data_processor_logger"
|
||||
) as mock_logger:
|
||||
with patch("fastdeploy.input.image_processors.adaptive_processor.data_processor_logger") as mock_logger:
|
||||
# Directly call _preprocess, pass scaled image
|
||||
self.processor._preprocess(
|
||||
[img_array], # Pass scaled numpy array
|
||||
@@ -356,9 +354,7 @@ class TestImagePreprocessorAdaptive(unittest.TestCase):
|
||||
"""Test invalid image check in preprocess (line 464)"""
|
||||
# Test invalid image type - need to ensure valid_images returns False
|
||||
# Use patch to make valid_images return False, but make_batched_images succeeds
|
||||
with patch(
|
||||
"fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.valid_images"
|
||||
) as mock_valid:
|
||||
with patch("fastdeploy.input.image_processors.adaptive_processor.valid_images") as mock_valid:
|
||||
mock_valid.return_value = False
|
||||
valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False
|
||||
with self.assertRaises(ValueError) as context:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user