mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[DataProcessor] Move image_processor to unified directory and add MultiModalProcessor (#7109)
* first commit * step 9~10 * update multimodal * update multimodal * fix load tokenizer * add unit test * fix unit test & AdaptiveImageProcessor * Delete unused code
This commit is contained in:
@@ -14,7 +14,13 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from .get_image_preprocessor import get_image_preprocessor
|
# Backward compatibility: this module has been migrated to
|
||||||
from .image_preprocessor_adaptive import AdaptiveImageProcessor
|
# fastdeploy.input.image_processors.adaptive_processor
|
||||||
|
# This file will be removed in a future version.
|
||||||
|
|
||||||
|
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||||
|
AdaptiveImageProcessor,
|
||||||
|
get_image_preprocessor,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"]
|
__all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"]
|
||||||
|
|||||||
+6
-17
@@ -14,21 +14,10 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
"""get image preprocessor"""
|
# Backward compatibility: this module has been migrated to
|
||||||
|
# fastdeploy.input.image_processors.adaptive_processor
|
||||||
|
# This file will be removed in a future version.
|
||||||
|
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||||
|
get_image_preprocessor,
|
||||||
from .image_preprocessor_adaptive import AdaptiveImageProcessor
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_image_preprocessor(args):
|
|
||||||
"""
|
|
||||||
get_image_preprocessor from args
|
|
||||||
"""
|
|
||||||
|
|
||||||
if args.vision_model_name_or_path is None:
|
|
||||||
return None
|
|
||||||
|
|
||||||
data_processor_logger.info("use AdaptiveImageProcessor")
|
|
||||||
image_preprocess = AdaptiveImageProcessor.from_pretrained(args.vision_model_name_or_path)
|
|
||||||
return image_preprocess
|
|
||||||
|
|||||||
+7
-493
@@ -14,498 +14,12 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
"""image preprocessor adaptive"""
|
# Backward compatibility: this module has been migrated to
|
||||||
|
# fastdeploy.input.image_processors.adaptive_processor
|
||||||
|
# This file will be removed in a future version.
|
||||||
|
|
||||||
from typing import List, Optional, Union
|
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||||
|
AdaptiveImageProcessor,
|
||||||
import numpy as np
|
make_batched_images,
|
||||||
import paddle
|
make_batched_videos,
|
||||||
import PIL
|
|
||||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
|
||||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
|
||||||
from paddleformers.transformers.image_transforms import (
|
|
||||||
convert_to_rgb,
|
|
||||||
normalize,
|
|
||||||
rescale,
|
|
||||||
resize,
|
|
||||||
to_channel_dimension_format,
|
|
||||||
)
|
)
|
||||||
from paddleformers.transformers.image_utils import (
|
|
||||||
ChannelDimension,
|
|
||||||
ImageInput,
|
|
||||||
PILImageResampling,
|
|
||||||
get_image_size,
|
|
||||||
infer_channel_dimension_format,
|
|
||||||
is_valid_image,
|
|
||||||
make_list_of_images,
|
|
||||||
to_numpy_array,
|
|
||||||
valid_images,
|
|
||||||
)
|
|
||||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from fastdeploy.input.image_processors.common import is_scaled_image
|
|
||||||
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
|
||||||
from fastdeploy.utils import data_processor_logger
|
|
||||||
|
|
||||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
|
||||||
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
|
||||||
|
|
||||||
IMAGE_FACTOR = 28
|
|
||||||
MIN_PIXELS = 4 * 28 * 28
|
|
||||||
MAX_PIXELS = 16384 * 28 * 28
|
|
||||||
MAX_RATIO = 200
|
|
||||||
|
|
||||||
|
|
||||||
VideoInput = Union[
|
|
||||||
List["PIL.Image.Image"],
|
|
||||||
"np.ndarray",
|
|
||||||
"paddle.Tensor",
|
|
||||||
List["np.ndarray"],
|
|
||||||
List["paddle.Tensor"],
|
|
||||||
List[List["PIL.Image.Image"]],
|
|
||||||
List[List["np.ndarrray"]],
|
|
||||||
List[List["paddle.Tensor"]],
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"AdaptiveImageProcessor",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def make_batched_images(images) -> List[List[ImageInput]]:
|
|
||||||
"""
|
|
||||||
Accepts images in list or nested list format, and makes a list of images for preprocessing.
|
|
||||||
images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
|
|
||||||
The input image.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
list: A list of images.
|
|
||||||
"""
|
|
||||||
if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
|
|
||||||
return [img for img_list in images for img in img_list]
|
|
||||||
|
|
||||||
elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
|
|
||||||
return images
|
|
||||||
|
|
||||||
elif is_valid_image(images):
|
|
||||||
return [images]
|
|
||||||
|
|
||||||
raise ValueError(f"Could not make batched images from {images}")
|
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
|
|
||||||
def make_batched_videos(videos) -> List[VideoInput]:
|
|
||||||
"""dummy"""
|
|
||||||
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
|
|
||||||
return videos
|
|
||||||
|
|
||||||
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
|
|
||||||
if isinstance(videos[0], Image.Image):
|
|
||||||
return [videos]
|
|
||||||
elif len(videos[0].shape) == 4:
|
|
||||||
return [list(video) for video in videos]
|
|
||||||
|
|
||||||
elif is_valid_image(videos) and len(videos.shape) == 4:
|
|
||||||
return [list(videos)]
|
|
||||||
|
|
||||||
raise ValueError(f"Could not make batched video from {videos}")
|
|
||||||
|
|
||||||
|
|
||||||
class AdaptiveImageProcessor(BaseImageProcessor):
|
|
||||||
r"""
|
|
||||||
Constructs a adaptive image processor that dynamically resizes images based on the original images.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
do_resize (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether to resize the image's (height, width) dimensions.
|
|
||||||
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
|
|
||||||
Resampling filter to use when resizing the image.
|
|
||||||
do_rescale (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether to rescale the image by the specified scale `rescale_factor`.
|
|
||||||
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
|
|
||||||
Scale factor to use if rescaling the image.
|
|
||||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether to normalize the image.
|
|
||||||
image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
|
|
||||||
Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
|
|
||||||
image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
|
|
||||||
Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
|
|
||||||
in the image.
|
|
||||||
do_convert_rgb (`bool`, *optional*, defaults to `True`):
|
|
||||||
Whether to convert the image to RGB.
|
|
||||||
min_pixels (`int`, *optional*, defaults to `56 * 56`):
|
|
||||||
The min pixels of the image to resize the image.
|
|
||||||
max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
|
|
||||||
The max pixels of the image to resize the image.
|
|
||||||
patch_size (`int`, *optional*, defaults to 14):
|
|
||||||
The spacial patch size of the vision encoder.
|
|
||||||
temporal_conv_size (`int`, *optional*, defaults to 2):
|
|
||||||
The temporal conv size in resampler.
|
|
||||||
merge_size (`int`, *optional*, defaults to 2):
|
|
||||||
The merge size of the vision encoder to llm encoder.
|
|
||||||
"""
|
|
||||||
|
|
||||||
model_input_names = [
|
|
||||||
"pixel_values",
|
|
||||||
"image_grid_thw",
|
|
||||||
"pixel_values_videos",
|
|
||||||
"video_grid_thw",
|
|
||||||
]
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
do_resize: bool = True,
|
|
||||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
|
||||||
do_rescale: bool = True,
|
|
||||||
rescale_factor: float = 1 / 255,
|
|
||||||
do_normalize: bool = True,
|
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
|
||||||
do_convert_rgb: bool = True,
|
|
||||||
min_pixels: int = 56 * 56,
|
|
||||||
max_pixels: int = 28 * 28 * 1280,
|
|
||||||
patch_size: int = 14,
|
|
||||||
temporal_conv_size: int = 2,
|
|
||||||
merge_size: int = 2,
|
|
||||||
**kwargs,
|
|
||||||
) -> None:
|
|
||||||
"""init"""
|
|
||||||
super().__init__(**kwargs)
|
|
||||||
self.do_resize = do_resize
|
|
||||||
self.resample = resample
|
|
||||||
self.do_rescale = do_rescale
|
|
||||||
self.rescale_factor = rescale_factor
|
|
||||||
self.do_normalize = do_normalize
|
|
||||||
self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
|
|
||||||
self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
|
|
||||||
self.min_pixels = min_pixels
|
|
||||||
self.max_pixels = max_pixels
|
|
||||||
self.patch_size = patch_size
|
|
||||||
self.temporal_conv_size = temporal_conv_size
|
|
||||||
self.merge_size = merge_size
|
|
||||||
self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
|
|
||||||
self.do_convert_rgb = do_convert_rgb
|
|
||||||
|
|
||||||
def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
|
|
||||||
"""设定pixels"""
|
|
||||||
if min_pixels is not None:
|
|
||||||
assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
|
|
||||||
data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
|
|
||||||
self.min_pixels = min_pixels
|
|
||||||
self.size["min_pixels"] = int(min_pixels)
|
|
||||||
if max_pixels is not None:
|
|
||||||
assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
|
|
||||||
data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
|
|
||||||
self.max_pixels = max_pixels
|
|
||||||
self.size["max_pixels"] = int(max_pixels)
|
|
||||||
|
|
||||||
def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
|
|
||||||
"""dummy"""
|
|
||||||
actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
|
|
||||||
actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
|
|
||||||
resized_height, resized_width = smart_resize(
|
|
||||||
height,
|
|
||||||
width,
|
|
||||||
factor=self.patch_size * self.merge_size,
|
|
||||||
min_pixels=actual_min_pixels,
|
|
||||||
max_pixels=actual_max_pixels,
|
|
||||||
)
|
|
||||||
return (resized_height, resized_width), (
|
|
||||||
resized_height // self.patch_size,
|
|
||||||
resized_width // self.patch_size,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _preprocess(
|
|
||||||
self,
|
|
||||||
images: Union[ImageInput, VideoInput],
|
|
||||||
do_resize: bool = True,
|
|
||||||
resample: PILImageResampling = None,
|
|
||||||
do_rescale: bool = True,
|
|
||||||
rescale_factor: float = 1 / 255,
|
|
||||||
do_normalize: bool = True,
|
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
|
||||||
do_convert_rgb: bool = False,
|
|
||||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
|
||||||
predetermined_grid_thw=None,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
images (`ImageInput`):
|
|
||||||
Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
|
|
||||||
If pixel values range from 0 to 1, set `do_rescale=False`.
|
|
||||||
vision_info (`List[Dict]`, *optional*):
|
|
||||||
Optional list of dictionaries containing additional information about vision inputs.
|
|
||||||
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
|
||||||
Whether to resize the image.
|
|
||||||
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
|
|
||||||
Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
|
|
||||||
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
|
||||||
Whether to rescale the image.
|
|
||||||
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
|
||||||
Scale factor to use if rescaling the image.
|
|
||||||
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
|
||||||
Whether to normalize the image.
|
|
||||||
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
|
||||||
Mean to use if normalizing the image.
|
|
||||||
Can be a float or a list of floats corresponding to the number of channels in the image.
|
|
||||||
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
|
||||||
Standard deviation to use if normalizing the image.
|
|
||||||
Can be a float or a list of floats corresponding to the number of channels in the image.
|
|
||||||
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
|
||||||
Whether to convert the image to RGB.
|
|
||||||
data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
|
|
||||||
The channel dimension format for the output image. Can be one of:
|
|
||||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
|
||||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
|
||||||
- Unset: Use the channel dimension format of the input image.
|
|
||||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
|
||||||
The channel dimension format for the input image. Can be one of:
|
|
||||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
|
||||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
|
||||||
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
|
||||||
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
|
||||||
"""
|
|
||||||
images = make_list_of_images(images)
|
|
||||||
|
|
||||||
if do_convert_rgb:
|
|
||||||
images = [convert_to_rgb(image) for image in images]
|
|
||||||
|
|
||||||
# All transformations expect numpy arrays.
|
|
||||||
images = [to_numpy_array(image) for image in images]
|
|
||||||
|
|
||||||
if is_scaled_image(images[0]) and do_rescale:
|
|
||||||
data_processor_logger.warning(
|
|
||||||
"It looks like you are trying to rescale already rescaled images. If the input"
|
|
||||||
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
|
||||||
)
|
|
||||||
if input_data_format is None:
|
|
||||||
# We assume that all images have the same channel dimension format.
|
|
||||||
input_data_format = infer_channel_dimension_format(images[0])
|
|
||||||
|
|
||||||
height, width = get_image_size(images[0], channel_dim=input_data_format)
|
|
||||||
resized_height, resized_width = height, width
|
|
||||||
processed_images = []
|
|
||||||
|
|
||||||
if predetermined_grid_thw is not None:
|
|
||||||
assert len(predetermined_grid_thw) == len(
|
|
||||||
images
|
|
||||||
), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"
|
|
||||||
|
|
||||||
for img_idx, image in enumerate(images):
|
|
||||||
if do_resize:
|
|
||||||
if predetermined_grid_thw is not None:
|
|
||||||
(resized_height, resized_width) = predetermined_grid_thw[img_idx]
|
|
||||||
resized_height *= self.patch_size
|
|
||||||
resized_width *= self.patch_size
|
|
||||||
else:
|
|
||||||
resized_height, resized_width = smart_resize(
|
|
||||||
height,
|
|
||||||
width,
|
|
||||||
factor=self.patch_size * self.merge_size,
|
|
||||||
min_pixels=self.min_pixels,
|
|
||||||
max_pixels=self.max_pixels,
|
|
||||||
)
|
|
||||||
image = image.astype("uint8") # TODO : 需要手动加上,否则多除255 导致结果会出错
|
|
||||||
# 直接fromarray,不要靠paddleformers里面的
|
|
||||||
image = Image.fromarray(image)
|
|
||||||
image = resize(
|
|
||||||
image,
|
|
||||||
size=(resized_height, resized_width),
|
|
||||||
resample=resample,
|
|
||||||
data_format=input_data_format,
|
|
||||||
)
|
|
||||||
if do_rescale:
|
|
||||||
image = rescale(image, scale=rescale_factor, data_format=input_data_format)
|
|
||||||
|
|
||||||
if do_normalize:
|
|
||||||
image = normalize(
|
|
||||||
image=image,
|
|
||||||
mean=image_mean,
|
|
||||||
std=image_std,
|
|
||||||
data_format=input_data_format,
|
|
||||||
)
|
|
||||||
|
|
||||||
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
|
|
||||||
|
|
||||||
processed_images.append(image)
|
|
||||||
patches = np.array(processed_images)
|
|
||||||
if data_format == ChannelDimension.LAST:
|
|
||||||
patches = patches.transpose([0, 3, 1, 2])
|
|
||||||
|
|
||||||
channel = patches.shape[1] # [time, C, H, W]
|
|
||||||
grid_t = patches.shape[0]
|
|
||||||
grid_h, grid_w = (
|
|
||||||
resized_height // self.patch_size,
|
|
||||||
resized_width // self.patch_size,
|
|
||||||
)
|
|
||||||
patches = patches.reshape(
|
|
||||||
[
|
|
||||||
grid_t,
|
|
||||||
channel,
|
|
||||||
grid_h // self.merge_size,
|
|
||||||
self.merge_size,
|
|
||||||
self.patch_size,
|
|
||||||
grid_w // self.merge_size,
|
|
||||||
self.merge_size,
|
|
||||||
self.patch_size,
|
|
||||||
]
|
|
||||||
)
|
|
||||||
# [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
|
|
||||||
patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])
|
|
||||||
|
|
||||||
flatten_patches = patches.reshape(
|
|
||||||
[
|
|
||||||
grid_t * grid_h * grid_w,
|
|
||||||
channel * self.patch_size * self.patch_size,
|
|
||||||
]
|
|
||||||
) # [grid_t * grid_h * grid_w, C * psz * psz]
|
|
||||||
|
|
||||||
return flatten_patches, (grid_t, grid_h, grid_w)
|
|
||||||
|
|
||||||
def preprocess(
|
|
||||||
self,
|
|
||||||
images: ImageInput,
|
|
||||||
videos: VideoInput = None,
|
|
||||||
do_resize: bool = True,
|
|
||||||
size: Optional[Union[int, List[int]]] = None,
|
|
||||||
resample: PILImageResampling = None,
|
|
||||||
do_rescale: bool = True,
|
|
||||||
rescale_factor: float = 1 / 255,
|
|
||||||
do_normalize: bool = True,
|
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
|
||||||
do_convert_rgb: bool = False,
|
|
||||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
|
||||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
|
||||||
predetermined_grid_thw=None,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Args:
|
|
||||||
images (`ImageInput`):
|
|
||||||
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
|
|
||||||
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
|
|
||||||
videos (`VideoInput`):
|
|
||||||
Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
|
|
||||||
passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
|
|
||||||
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
|
|
||||||
Whether to resize the image.
|
|
||||||
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
|
|
||||||
Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
|
|
||||||
the longest edge resized to keep the input aspect ratio.
|
|
||||||
resample (`int`, *optional*, defaults to `self.resample`):
|
|
||||||
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
|
|
||||||
has an effect if `do_resize` is set to `True`.
|
|
||||||
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
|
|
||||||
Whether to rescale the image.
|
|
||||||
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
|
|
||||||
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
|
|
||||||
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
|
||||||
Whether to normalize the image.
|
|
||||||
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
|
|
||||||
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
|
|
||||||
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
|
|
||||||
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
|
|
||||||
`True`.
|
|
||||||
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
|
||||||
Whether to convert the image to RGB.
|
|
||||||
return_tensors (`str` or `TensorType`, *optional*):
|
|
||||||
The type of tensors to return. Can be one of:
|
|
||||||
- Unset: Return a list of `np.ndarray`.
|
|
||||||
- `TensorType.PADDLE` or `'pt'`: Return a batch of type `torch.Tensor`.
|
|
||||||
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
|
|
||||||
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
|
|
||||||
The channel dimension format for the output image. Can be one of:
|
|
||||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
|
||||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
|
||||||
- Unset: Use the channel dimension format of the input image.
|
|
||||||
input_data_format (`ChannelDimension` or `str`, *optional*):
|
|
||||||
The channel dimension format for the input image. If unset, the channel dimension format is inferred
|
|
||||||
from the input image. Can be one of:
|
|
||||||
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
|
||||||
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
|
||||||
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
|
||||||
|
|
||||||
"""
|
|
||||||
do_resize = do_resize if do_resize is not None else self.do_resize
|
|
||||||
size = size if size is not None else self.size
|
|
||||||
resample = resample if resample is not None else self.resample
|
|
||||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
|
||||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
|
||||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
|
||||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
|
||||||
image_std = image_std if image_std is not None else self.image_std
|
|
||||||
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
|
|
||||||
|
|
||||||
if images is not None:
|
|
||||||
images = make_batched_images(images)
|
|
||||||
if videos is not None:
|
|
||||||
videos = make_batched_videos(videos)
|
|
||||||
|
|
||||||
if images is not None and not valid_images(images):
|
|
||||||
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
|
|
||||||
|
|
||||||
if images is not None:
|
|
||||||
pixel_values, vision_grid_thws = [], []
|
|
||||||
for img_idx, image in enumerate(images):
|
|
||||||
if predetermined_grid_thw is not None:
|
|
||||||
predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
|
|
||||||
else:
|
|
||||||
predetermined_grid_thw_one = None
|
|
||||||
patches, image_grid_thw = self._preprocess(
|
|
||||||
image,
|
|
||||||
do_resize=do_resize,
|
|
||||||
resample=resample,
|
|
||||||
do_rescale=do_rescale,
|
|
||||||
rescale_factor=rescale_factor,
|
|
||||||
do_normalize=do_normalize,
|
|
||||||
image_mean=image_mean,
|
|
||||||
image_std=image_std,
|
|
||||||
data_format=data_format,
|
|
||||||
do_convert_rgb=do_convert_rgb,
|
|
||||||
input_data_format=input_data_format,
|
|
||||||
predetermined_grid_thw=predetermined_grid_thw_one,
|
|
||||||
)
|
|
||||||
pixel_values.extend(patches)
|
|
||||||
vision_grid_thws.append(image_grid_thw)
|
|
||||||
pixel_values = np.array(pixel_values)
|
|
||||||
vision_grid_thws = np.array(vision_grid_thws)
|
|
||||||
data = {
|
|
||||||
"pixel_values": pixel_values,
|
|
||||||
"image_grid_thw": vision_grid_thws,
|
|
||||||
}
|
|
||||||
|
|
||||||
if videos is not None:
|
|
||||||
pixel_values, vision_grid_thws = [], []
|
|
||||||
for images in videos:
|
|
||||||
patches, video_grid_thw = self._preprocess(
|
|
||||||
images,
|
|
||||||
do_resize=do_resize,
|
|
||||||
resample=resample,
|
|
||||||
do_rescale=do_rescale,
|
|
||||||
rescale_factor=rescale_factor,
|
|
||||||
do_normalize=do_normalize,
|
|
||||||
image_mean=image_mean,
|
|
||||||
image_std=image_std,
|
|
||||||
data_format=data_format,
|
|
||||||
do_convert_rgb=do_convert_rgb,
|
|
||||||
input_data_format=input_data_format,
|
|
||||||
predetermined_grid_thw=predetermined_grid_thw,
|
|
||||||
)
|
|
||||||
pixel_values.extend(patches)
|
|
||||||
vision_grid_thws.append(video_grid_thw)
|
|
||||||
pixel_values = np.array(pixel_values)
|
|
||||||
vision_grid_thws = np.array(vision_grid_thws)
|
|
||||||
|
|
||||||
data = {
|
|
||||||
"pixel_values_videos": pixel_values,
|
|
||||||
"video_grid_thw": vision_grid_thws,
|
|
||||||
}
|
|
||||||
|
|
||||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
|
||||||
|
|||||||
@@ -11,3 +11,17 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
|
from fastdeploy.input.image_processors.adaptive_processor import ( # noqa: F401
|
||||||
|
AdaptiveImageProcessor,
|
||||||
|
get_image_preprocessor,
|
||||||
|
)
|
||||||
|
from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401
|
||||||
|
ImageProcessor as PaddleOCRImageProcessor,
|
||||||
|
)
|
||||||
|
from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
|
||||||
|
ImageProcessor as Qwen3ImageProcessor,
|
||||||
|
)
|
||||||
|
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
|
||||||
|
ImageProcessor as QwenImageProcessor,
|
||||||
|
)
|
||||||
|
|||||||
@@ -0,0 +1,524 @@
|
|||||||
|
"""
|
||||||
|
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
"""image preprocessor adaptive"""
|
||||||
|
|
||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
import PIL
|
||||||
|
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||||
|
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||||
|
from paddleformers.transformers.image_transforms import (
|
||||||
|
convert_to_rgb,
|
||||||
|
normalize,
|
||||||
|
rescale,
|
||||||
|
resize,
|
||||||
|
to_channel_dimension_format,
|
||||||
|
)
|
||||||
|
from paddleformers.transformers.image_utils import (
|
||||||
|
ChannelDimension,
|
||||||
|
ImageInput,
|
||||||
|
PILImageResampling,
|
||||||
|
get_image_size,
|
||||||
|
infer_channel_dimension_format,
|
||||||
|
is_valid_image,
|
||||||
|
make_list_of_images,
|
||||||
|
to_numpy_array,
|
||||||
|
valid_images,
|
||||||
|
)
|
||||||
|
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from fastdeploy.input.image_processors.common import is_scaled_image
|
||||||
|
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
||||||
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||||
|
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||||
|
|
||||||
|
IMAGE_FACTOR = 28
|
||||||
|
MIN_PIXELS = 4 * 28 * 28
|
||||||
|
MAX_PIXELS = 16384 * 28 * 28
|
||||||
|
MAX_RATIO = 200
|
||||||
|
|
||||||
|
|
||||||
|
VideoInput = Union[
|
||||||
|
List["PIL.Image.Image"],
|
||||||
|
"np.ndarray",
|
||||||
|
"paddle.Tensor",
|
||||||
|
List["np.ndarray"],
|
||||||
|
List["paddle.Tensor"],
|
||||||
|
List[List["PIL.Image.Image"]],
|
||||||
|
List[List["np.ndarray"]],
|
||||||
|
List[List["paddle.Tensor"]],
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"AdaptiveImageProcessor",
|
||||||
|
"get_image_preprocessor",
|
||||||
|
"make_batched_images",
|
||||||
|
"make_batched_videos",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def make_batched_images(images) -> List[List[ImageInput]]:
|
||||||
|
"""
|
||||||
|
Accepts images in list or nested list format, and makes a list of images for preprocessing.
|
||||||
|
images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
|
||||||
|
The input image.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list: A list of images.
|
||||||
|
"""
|
||||||
|
if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
|
||||||
|
return [img for img_list in images for img in img_list]
|
||||||
|
|
||||||
|
elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
|
||||||
|
return images
|
||||||
|
|
||||||
|
elif is_valid_image(images):
|
||||||
|
return [images]
|
||||||
|
|
||||||
|
raise ValueError(f"Could not make batched images from {images}")
|
||||||
|
|
||||||
|
|
||||||
|
# Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
|
||||||
|
def make_batched_videos(videos) -> List[VideoInput]:
    """
    Normalize video input into a batched list-of-videos format.

    Args:
        videos: One video or a batch of videos. Each video may be a list of
            PIL frames, a 4-D array/tensor of frames, or a nested list of frames.

    Returns:
        List[VideoInput]: A list of videos, each represented as a list of frames
        (nested-list input is returned unchanged).

    Raises:
        ValueError: If the input cannot be interpreted as batched videos.
    """
    # Already batched: list of videos, each a list of valid frames.
    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
        return videos

    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
        if isinstance(videos[0], Image.Image):
            # A single video given as a flat list of PIL frames.
            return [videos]
        elif len(videos[0].shape) == 4:
            # A list of 4-D arrays/tensors: split each into a list of frames.
            return [list(video) for video in videos]

    elif is_valid_image(videos) and len(videos.shape) == 4:
        # A single 4-D array/tensor: one video, split into frames.
        return [list(videos)]

    raise ValueError(f"Could not make batched video from {videos}")
|
||||||
|
|
||||||
|
|
||||||
|
class AdaptiveImageProcessor(BaseImageProcessor):
    r"""
    Constructs an adaptive image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
            in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `56 * 56`):
            The min pixels of the image to resize the image.
        max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
            The max pixels of the image to resize the image.
        patch_size (`int`, *optional*, defaults to 14):
            The spacial patch size of the vision encoder.
        temporal_conv_size (`int`, *optional*, defaults to 2):
            The temporal conv size in resampler.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    """

    # Keys this processor may place in the BatchFeature it returns.
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 56 * 56,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_conv_size: int = 2,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """Store the preprocessing configuration; see the class docstring for parameter semantics."""
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_conv_size = temporal_conv_size
        self.merge_size = merge_size
        # `size` mirrors the pixel budget and is kept in sync by set_pixels().
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb

    def set_pixels(self, min_pixels=None, max_pixels=None, msg=""):
        """Update the min/max pixel budget used for adaptive resizing (each change is logged with *msg* prefix)."""
        if min_pixels is not None:
            assert isinstance(min_pixels, int) and min_pixels >= 0, "min_pixels must be positive int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set min_pixels = {min_pixels}")
            self.min_pixels = min_pixels
            self.size["min_pixels"] = int(min_pixels)
        if max_pixels is not None:
            assert isinstance(max_pixels, int) and max_pixels > 0, "max_pixels must be positive int"
            data_processor_logger.info(f"{msg} AdaptiveImageProcessor set max_pixels = {max_pixels}")
            self.max_pixels = max_pixels
            self.size["max_pixels"] = int(max_pixels)

    def get_smarted_resize(self, height, width, min_pixels=None, max_pixels=None):
        """
        Compute the adaptively resized (height, width) and the corresponding patch grid.

        Args:
            height: Original image height in pixels.
            width: Original image width in pixels.
            min_pixels: Optional override for the instance's min pixel budget.
            max_pixels: Optional override for the instance's max pixel budget.

        Returns:
            tuple: ``((resized_height, resized_width), (grid_h, grid_w))``, where the
            grid counts are in units of ``patch_size`` patches.
        """
        actual_min_pixels = min_pixels if min_pixels is not None else self.min_pixels
        actual_max_pixels = max_pixels if max_pixels is not None else self.max_pixels
        # Resized sides are multiples of patch_size * merge_size so the patch grid
        # divides evenly by the merge window.
        resized_height, resized_width = smart_resize(
            height,
            width,
            factor=self.patch_size * self.merge_size,
            min_pixels=actual_min_pixels,
            max_pixels=actual_max_pixels,
        )
        return (resized_height, resized_width), (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

    def _preprocess(
        self,
        images: Union[ImageInput, VideoInput],
        do_resize: bool = True,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255.
                If pixel values range from 0 to 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image.
                Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            predetermined_grid_thw (*optional*):
                Per-image (grid_h, grid_w) pairs; when given, resizing targets exactly
                these patch grids instead of calling smart_resize.

        Returns:
            tuple: ``(flatten_patches, (grid_t, grid_h, grid_w))`` where flatten_patches has
            shape ``[grid_t * grid_h * grid_w, C * patch_size * patch_size]``.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            data_processor_logger.warning(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )
        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []

        if predetermined_grid_thw is not None:
            assert len(predetermined_grid_thw) == len(
                images
            ), f"len(predetermined_grid_thw) {len(predetermined_grid_thw)} == len(images) {len(images)}"

        for img_idx, image in enumerate(images):
            if do_resize:
                if predetermined_grid_thw is not None:
                    # Target grid supplied by the caller; convert patch counts back to pixels.
                    (resized_height, resized_width) = predetermined_grid_thw[img_idx]
                    resized_height *= self.patch_size
                    resized_width *= self.patch_size
                else:
                    resized_height, resized_width = smart_resize(
                        height,
                        width,
                        factor=self.patch_size * self.merge_size,
                        min_pixels=self.min_pixels,
                        max_pixels=self.max_pixels,
                    )
                # TODO: cast back to uint8 manually, otherwise an extra /255 corrupts the result.
                image = image.astype("uint8")
                # Build the PIL image via fromarray directly rather than relying on
                # paddleformers' internal conversion.
                image = Image.fromarray(image)
                image = resize(
                    image,
                    size=(resized_height, resized_width),
                    resample=resample,
                    data_format=input_data_format,
                )
            if do_rescale:
                image = rescale(image, scale=rescale_factor, data_format=input_data_format)

            if do_normalize:
                image = normalize(
                    image=image,
                    mean=image_mean,
                    std=image_std,
                    data_format=input_data_format,
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)  # [C, H, W]

            processed_images.append(image)
        patches = np.array(processed_images)
        if data_format == ChannelDimension.LAST:
            patches = patches.transpose([0, 3, 1, 2])

        channel = patches.shape[1]  # [time, C, H, W]
        grid_t = patches.shape[0]
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )
        # Split each frame into a (merge window x patch) grid along both spatial axes.
        patches = patches.reshape(
            [
                grid_t,
                channel,
                grid_h // self.merge_size,
                self.merge_size,
                self.patch_size,
                grid_w // self.merge_size,
                self.merge_size,
                self.patch_size,
            ]
        )
        # [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, psz, psz]
        patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7])

        flatten_patches = patches.reshape(
            [
                grid_t * grid_h * grid_w,
                channel * self.patch_size * self.patch_size,
            ]
        )  # [grid_t * grid_h * grid_w, C * psz * psz]

        return flatten_patches, (grid_t, grid_h, grid_w)

    def preprocess(
        self,
        images: ImageInput,
        videos: VideoInput = None,
        do_resize: bool = True,
        size: Optional[Union[int, List[int]]] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = True,
        rescale_factor: float = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        predetermined_grid_thw=None,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            videos (`VideoInput`):
                Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
                passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PADDLE` or `'pt'`: Return a batch of type `paddle.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            predetermined_grid_thw (*optional*):
                Per-image (grid_h, grid_w) pairs forwarded to `_preprocess`.

        Returns:
            BatchFeature: with `pixel_values`/`image_grid_thw` for images and
            `pixel_values_videos`/`video_grid_thw` for videos.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if images is not None:
            images = make_batched_images(images)
        if videos is not None:
            videos = make_batched_videos(videos)

        if images is not None and not valid_images(images):
            raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")

        data = {}

        if images is not None:
            pixel_values, vision_grid_thws = [], []
            for img_idx, image in enumerate(images):
                # Each image gets its own single-entry grid list when one was predetermined.
                if predetermined_grid_thw is not None:
                    predetermined_grid_thw_one = [predetermined_grid_thw[img_idx]]
                else:
                    predetermined_grid_thw_one = None
                patches, image_grid_thw = self._preprocess(
                    image,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw_one,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(image_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data["pixel_values"] = pixel_values
            data["image_grid_thw"] = vision_grid_thws

        if videos is not None:
            pixel_values, vision_grid_thws = [], []
            for images in videos:
                # A video is processed as its list of frames in one _preprocess call.
                patches, video_grid_thw = self._preprocess(
                    images,
                    do_resize=do_resize,
                    resample=resample,
                    do_rescale=do_rescale,
                    rescale_factor=rescale_factor,
                    do_normalize=do_normalize,
                    image_mean=image_mean,
                    image_std=image_std,
                    data_format=data_format,
                    do_convert_rgb=do_convert_rgb,
                    input_data_format=input_data_format,
                    predetermined_grid_thw=predetermined_grid_thw,
                )
                pixel_values.extend(patches)
                vision_grid_thws.append(video_grid_thw)
            pixel_values = np.array(pixel_values)
            vision_grid_thws = np.array(vision_grid_thws)
            data["pixel_values_videos"] = pixel_values
            data["video_grid_thw"] = vision_grid_thws

        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
|
|
||||||
|
|
||||||
|
def get_image_preprocessor(args):
    """
    Build an image preprocessor from parsed arguments.

    Returns ``None`` when ``args.vision_model_name_or_path`` is unset;
    otherwise loads an ``AdaptiveImageProcessor`` from that path.
    """
    model_path = args.vision_model_name_or_path
    if model_path is None:
        return None

    data_processor_logger.info("use AdaptiveImageProcessor")
    return AdaptiveImageProcessor.from_pretrained(model_path)
|
||||||
@@ -0,0 +1,225 @@
|
|||||||
|
"""
|
||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
"""Image processor class for PaddleOCR-VL."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||||
|
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||||
|
from paddleformers.transformers.image_utils import (
|
||||||
|
ImageInput,
|
||||||
|
is_valid_image,
|
||||||
|
make_list_of_images,
|
||||||
|
to_numpy_array,
|
||||||
|
)
|
||||||
|
|
||||||
|
from fastdeploy.input.image_processors.common import (
|
||||||
|
smart_resize_paddleocr as smart_resize,
|
||||||
|
)
|
||||||
|
|
||||||
|
# OpenAI CLIP per-channel normalization statistics (module-private defaults).
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||||
|
|
||||||
|
|
||||||
|
def make_batched_images(images) -> List[ImageInput]:
    """
    Accepts images in list or nested list format, and makes a flat list of images for preprocessing.
    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.
    Returns:
        List[ImageInput]: A flat list of images.
    """
    nested = (
        isinstance(images, (list, tuple))
        and isinstance(images[0], (list, tuple))
        and is_valid_image(images[0][0])
    )
    if nested:
        # Flatten one level of nesting.
        flattened = []
        for inner in images:
            flattened.extend(inner)
        return flattened

    if isinstance(images, (list, tuple)) and is_valid_image(images[0]):
        # Already a flat list of images.
        return images

    if is_valid_image(images):
        # Single image: wrap in a list.
        return [images]

    raise ValueError(f"Could not make batched images from {images}")
|
||||||
|
|
||||||
|
|
||||||
|
def adjust_size(size, patch_size):
    """Shrink *size* to the largest multiple of *patch_size* covering an even number of patches."""
    patches = size // patch_size
    # Drop one patch when the count is odd so the result is even.
    patches -= patches % 2
    return patches * patch_size
|
||||||
|
|
||||||
|
|
||||||
|
class ImageProcessor(BaseImageProcessor):
    """
    Image processor for PaddleOCR-VL.

    Resizes images to patch-aligned dimensions via smart_resize, rescales and
    normalizes them, then flattens each image into a sequence of
    (channel, patch_size, patch_size) patches with an accompanying
    (grid_t, grid_h, grid_w) descriptor.
    """

    # Keys this processor may produce in its BatchFeature output.
    model_input_names = [
        "pixel_values",
        "image_grid_thw",
        "pixel_values_videos",
        "video_grid_thw",
    ]

    def __init__(
        self,
        do_resize: bool = True,
        resample: int = 3,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 28 * 28 * 130,
        max_pixels: int = 28 * 28 * 1280,
        patch_size: int = 14,
        temporal_patch_size: int = 1,
        merge_size: int = 2,
        **kwargs,
    ) -> None:
        """
        Store the preprocessing configuration.

        Args:
            do_resize: Whether to resize images via smart_resize.
            resample: PIL resampling filter code (3 == BICUBIC).
            do_rescale: Whether to multiply pixel values by ``rescale_factor``.
            rescale_factor: Scale applied when rescaling (default 1/255).
            do_normalize: Whether to subtract ``image_mean`` and divide by ``image_std``.
            image_mean: Per-channel mean; defaults to OpenAI CLIP statistics.
            image_std: Per-channel std; defaults to OpenAI CLIP statistics.
            do_convert_rgb: Whether to convert inputs to RGB.
            min_pixels: Lower pixel-count bound for smart_resize.
            max_pixels: Upper pixel-count bound for smart_resize.
            patch_size: Spatial patch size of the vision encoder.
            temporal_patch_size: Temporal patch size (must be 1 in ``_preprocess``).
            merge_size: Merge window from vision encoder to LLM.
            **kwargs: Ignored; accepted for config-file compatibility.
        """
        super().__init__()
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.patch_size = patch_size
        self.temporal_patch_size = temporal_patch_size
        self.merge_size = merge_size
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}  # not used
        self.do_convert_rgb = do_convert_rgb

    @classmethod
    def from_pretrained(cls, pretrained_model_dir):
        """Instantiate the processor from ``<dir>/preprocessor_config.json``."""
        pretrained_model_dir = Path(pretrained_model_dir)
        image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
        with open(image_processor_config_path, "r", encoding="utf-8") as f:
            image_processor_config = json.load(f)
        return cls(**image_processor_config)

    def _preprocess(
        self,
        images,
        do_resize: Optional[bool] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
    ):
        """
        Resize/rescale/normalize PIL image(s) and flatten them into patches.

        Args:
            images: A PIL image or list of PIL images (all frames are resized
                using the first image's size).
            do_resize / do_rescale / rescale_factor / do_normalize /
            image_mean / image_std / do_convert_rgb: See ``__init__``.

        Returns:
            tuple: ``(flatten_patches, grid_thw)`` where flatten_patches has shape
            ``[grid_t * grid_h * grid_w, channel, patch_size, patch_size]`` and
            grid_thw is ``np.array([grid_t, grid_h, grid_w])``.
        """
        images = make_list_of_images(images)

        if do_convert_rgb:
            images = [image.convert("RGB") for image in images]

        # PIL reports (width, height); all frames share the first frame's size.
        width, height = images[0].size
        resized_height, resized_width = height, width
        processed_images = []

        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=self.patch_size * self.merge_size,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )

                image = image.resize((resized_width, resized_height), resample=self.resample)

            image = to_numpy_array(image)

            if do_rescale:
                image = (image * rescale_factor).astype(np.float32)

            if do_normalize:
                image = image.astype(np.float32)
                image -= np.array(image_mean, dtype=np.float32)
                image /= np.array(image_std, dtype=np.float32)

            processed_images.append(image)

        # HWC -> CHW for every frame; result is [time, C, H, W].
        patches = np.array(processed_images)
        patches = patches.transpose(0, 3, 1, 2)
        if patches.shape[0] == 1:
            # Repeat a single frame to fill the temporal patch dimension.
            patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
        channel = patches.shape[1]
        grid_t = patches.shape[0] // self.temporal_patch_size
        grid_h, grid_w = (
            resized_height // self.patch_size,
            resized_width // self.patch_size,
        )

        # Split spatial axes into (grid, patch) pairs.
        patches = patches.reshape(
            grid_t,
            self.temporal_patch_size,
            channel,
            grid_h,
            self.patch_size,
            grid_w,
            self.patch_size,
        )
        # -> [grid_t, grid_h, grid_w, channel, temporal, psz, psz]
        patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
        # Final reshape folds the temporal axis away, so it must be 1.
        assert self.temporal_patch_size == 1
        flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
        return flatten_patches, np.array([grid_t, grid_h, grid_w])

    def preprocess(
        self,
        images,
        videos=None,
        do_resize: Optional[bool] = None,
        size: Optional[Dict[str, int]] = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors=None,
    ):
        """
        Preprocess image input into flattened patches.

        Args:
            images: Image(s) to preprocess (PIL).
            videos: Unsupported; passing a value raises ``NotImplementedError``.
            do_resize / size / do_rescale / rescale_factor / do_normalize /
            image_mean / image_std / do_convert_rgb: Optional overrides for the
                instance defaults (``size`` is resolved but not otherwise used).
            return_tensors: Tensor type for the returned ``BatchFeature``.

        Returns:
            BatchFeature: with keys ``pixel_values`` and ``grid_thw``.

        Raises:
            NotImplementedError: If ``videos`` is provided.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        if videos is not None:
            raise NotImplementedError("Videos are not yet supported")

        patches, image_grid_thw = self._preprocess(
            images,
            do_resize=do_resize,
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_convert_rgb=do_convert_rgb,
        )
        pixel_values = np.array(patches)
        data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
        return BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
@@ -0,0 +1,333 @@
|
|||||||
|
"""
|
||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
import PIL
|
||||||
|
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||||
|
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||||
|
from paddleformers.transformers.image_transforms import (
|
||||||
|
normalize,
|
||||||
|
rescale,
|
||||||
|
resize,
|
||||||
|
to_channel_dimension_format,
|
||||||
|
)
|
||||||
|
from paddleformers.transformers.image_utils import (
|
||||||
|
ChannelDimension,
|
||||||
|
ImageInput,
|
||||||
|
PILImageResampling,
|
||||||
|
get_image_size,
|
||||||
|
infer_channel_dimension_format,
|
||||||
|
make_list_of_images,
|
||||||
|
to_numpy_array,
|
||||||
|
valid_images,
|
||||||
|
)
|
||||||
|
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||||
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
IMAGE_MEAN = [0.5, 0.5, 0.5]
|
||||||
|
IMAGE_STD = [0.5, 0.5, 0.5]
|
||||||
|
|
||||||
|
MIN_PIXELS = 65536
|
||||||
|
MAX_PIXELS = 16777216
|
||||||
|
|
||||||
|
|
||||||
|
VideoInput = Union[
|
||||||
|
List["PIL.Image.Image"],
|
||||||
|
"np.ndarray",
|
||||||
|
"paddle.Tensor",
|
||||||
|
List["np.ndarray"],
|
||||||
|
List["paddle.Tensor"],
|
||||||
|
List[List["PIL.Image.Image"]],
|
||||||
|
List[List["np.ndarray"]],
|
||||||
|
List[List["paddle.Tensor"]],
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class ImageProcessor(BaseImageProcessor):
|
||||||
|
"""
|
||||||
|
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||||
|
|
||||||
|
This processor handles image resizing, rescaling, normalization and format conversion.
|
||||||
|
It dynamically adjusts image dimensions based on original size and specified constraints.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
patch_size: int = 16,
|
||||||
|
merge_size: int = 2,
|
||||||
|
temporal_patch_size: int = 2,
|
||||||
|
min_pixels: int = MIN_PIXELS,
|
||||||
|
max_pixels: int = MAX_PIXELS,
|
||||||
|
image_mean: Union[float, List[float]] = IMAGE_MEAN,
|
||||||
|
image_std: Union[float, List[float]] = IMAGE_STD,
|
||||||
|
rescale_factor: float = 1 / 255,
|
||||||
|
do_rescale: bool = True,
|
||||||
|
do_normalize: bool = True,
|
||||||
|
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||||
|
**kwargs,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Initialize image processor with configuration parameters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
patch_size (int): Spatial patch size for vision encoder
|
||||||
|
merge_size (int): Merge size between vision and LLM encoders
|
||||||
|
temporal_patch_size (int): Temporal patch size for video processing
|
||||||
|
min_pixels (int): Minimum allowed pixels in resized image
|
||||||
|
max_pixels (int): Maximum allowed pixels in resized image
|
||||||
|
image_mean (float/list): Mean values for normalization per channel
|
||||||
|
image_std (float/list): Std values for normalization per channel
|
||||||
|
rescale_factor (float): Scaling factor for pixel values (default 1/255)
|
||||||
|
do_rescale (bool): Whether to rescale images
|
||||||
|
do_normalize (bool): Whether to normalize images
|
||||||
|
resample: Resampling method for image resizing
|
||||||
|
**kwargs: Additional base class arguments
|
||||||
|
"""
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
self.patch_size = patch_size
|
||||||
|
self.merge_size = merge_size
|
||||||
|
self.temporal_patch_size = temporal_patch_size
|
||||||
|
|
||||||
|
self.min_pixels = min_pixels
|
||||||
|
self.max_pixels = max_pixels
|
||||||
|
|
||||||
|
self.image_mean = image_mean
|
||||||
|
self.image_std = image_std
|
||||||
|
self.rescale_factor = rescale_factor
|
||||||
|
self.do_rescale = do_rescale
|
||||||
|
self.do_normalize = do_normalize
|
||||||
|
|
||||||
|
self.resample = resample
|
||||||
|
|
||||||
|
def _preprocess(
|
||||||
|
self,
|
||||||
|
images: Union[ImageInput, VideoInput],
|
||||||
|
min_pixels: int,
|
||||||
|
max_pixels: int,
|
||||||
|
image_mean: Optional[Union[float, List[float]]],
|
||||||
|
image_std: Optional[Union[float, List[float]]],
|
||||||
|
rescale_factor: float,
|
||||||
|
do_rescale: bool,
|
||||||
|
do_normalize: bool,
|
||||||
|
resample: PILImageResampling,
|
||||||
|
data_format: Optional[ChannelDimension],
|
||||||
|
input_data_format: Optional[Union[str, ChannelDimension]],
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Internal method for image preprocessing pipeline.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
images: Input image or batch of images
|
||||||
|
min_pixels: Minimum allowed pixels in output
|
||||||
|
max_pixels: Maximum allowed pixels in output
|
||||||
|
image_mean: Normalization mean values
|
||||||
|
image_std: Normalization std values
|
||||||
|
rescale_factor: Pixel value scaling factor
|
||||||
|
do_rescale: Whether to rescale pixel values
|
||||||
|
do_normalize: Whether to normalize pixel values
|
||||||
|
resample: Resampling method
|
||||||
|
data_format: Output channel format
|
||||||
|
input_data_format: Input channel format
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple: (flatten_patches, grid_dimensions)
|
||||||
|
- flatten_patches: Flattened image patches
|
||||||
|
- grid_dimensions: Grid dimensions [t, h, w]
|
||||||
|
"""
|
||||||
|
images = make_list_of_images(images)
|
||||||
|
|
||||||
|
# All transformations expect numpy arrays.
|
||||||
|
images = [to_numpy_array(image) for image in images]
|
||||||
|
|
||||||
|
if is_scaled_image(images[0]) and do_rescale:
|
||||||
|
data_processor_logger.warning(
|
||||||
|
"It looks like you are trying to rescale already rescaled images. If the input"
|
||||||
|
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
||||||
|
)
|
||||||
|
if input_data_format is None:
|
||||||
|
# We assume that all images have the same channel dimension format.
|
||||||
|
input_data_format = infer_channel_dimension_format(images[0])
|
||||||
|
|
||||||
|
# Get original dimensions and calculate optimal resize dimensions
|
||||||
|
height, width = get_image_size(images[0], channel_dim=input_data_format)
|
||||||
|
resized_height, resized_width = smart_resize(
|
||||||
|
height,
|
||||||
|
width,
|
||||||
|
factor=self.patch_size * self.merge_size, # Combine patch and merge factors
|
||||||
|
min_pixels=min_pixels,
|
||||||
|
max_pixels=max_pixels,
|
||||||
|
)
|
||||||
|
|
||||||
|
processed_images = []
|
||||||
|
for image in images:
|
||||||
|
if height != resized_height or width != resized_width:
|
||||||
|
# Convert to uint8 before resizing to avoid double scaling
|
||||||
|
image = image.astype("uint8")
|
||||||
|
# Convert to PIL Image and resize
|
||||||
|
image = Image.fromarray(image)
|
||||||
|
image = resize(
|
||||||
|
image,
|
||||||
|
size=(resized_height, resized_width),
|
||||||
|
resample=resample,
|
||||||
|
data_format=input_data_format,
|
||||||
|
)
|
||||||
|
|
||||||
|
if do_rescale and do_normalize:
|
||||||
|
# Adjust mean and std for combined rescale+normalize
|
||||||
|
image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
|
||||||
|
image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
|
||||||
|
do_rescale = False # Skip separate rescale step
|
||||||
|
|
||||||
|
# mutual exclusion and upper branch
|
||||||
|
if do_rescale:
|
||||||
|
image = image.astype(np.float32)
|
||||||
|
image = rescale(image, scale=rescale_factor, data_format=input_data_format)
|
||||||
|
|
||||||
|
if do_normalize:
|
||||||
|
image = image.astype(np.float32)
|
||||||
|
image = normalize(
|
||||||
|
image=image,
|
||||||
|
mean=image_mean,
|
||||||
|
std=image_std,
|
||||||
|
data_format=input_data_format,
|
||||||
|
)
|
||||||
|
|
||||||
|
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
|
||||||
|
processed_images.append(image)
|
||||||
|
|
||||||
|
# Convert processed images to numpy array
|
||||||
|
patches = np.array(processed_images)
|
||||||
|
|
||||||
|
# Pad temporal dimension if needed
|
||||||
|
if patches.shape[0] % self.temporal_patch_size != 0:
|
||||||
|
repeats = np.repeat(
|
||||||
|
patches[-1][np.newaxis],
|
||||||
|
self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
|
||||||
|
axis=0,
|
||||||
|
)
|
||||||
|
patches = np.concatenate([patches, repeats], axis=0)
|
||||||
|
|
||||||
|
# Convert to channels-first format if needed
|
||||||
|
if data_format == ChannelDimension.LAST:
|
||||||
|
patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W]
|
||||||
|
|
||||||
|
grid_t, channel = patches.shape[:2]
|
||||||
|
grid_t = grid_t // self.temporal_patch_size
|
||||||
|
|
||||||
|
grid_h, grid_w = (
|
||||||
|
resized_height // self.patch_size,
|
||||||
|
resized_width // self.patch_size,
|
||||||
|
)
|
||||||
|
# Reshape into hierarchical patch structure
|
||||||
|
patches = patches.reshape(
|
||||||
|
[
|
||||||
|
grid_t,
|
||||||
|
self.temporal_patch_size,
|
||||||
|
channel,
|
||||||
|
grid_h // self.merge_size,
|
||||||
|
self.merge_size,
|
||||||
|
self.patch_size,
|
||||||
|
grid_w // self.merge_size,
|
||||||
|
self.merge_size,
|
||||||
|
self.patch_size,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
# Reorder dimensions for better memory access pattern
|
||||||
|
# [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
|
||||||
|
patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
|
||||||
|
|
||||||
|
flatten_patches = patches.reshape(
|
||||||
|
[
|
||||||
|
grid_t * grid_h * grid_w,
|
||||||
|
channel * self.temporal_patch_size * self.patch_size * self.patch_size,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
return flatten_patches, np.array([grid_t, grid_h, grid_w])
|
||||||
|
|
||||||
|
def preprocess(
|
||||||
|
self,
|
||||||
|
images: Union[ImageInput, VideoInput],
|
||||||
|
min_pixels: Optional[int] = None,
|
||||||
|
max_pixels: Optional[int] = None,
|
||||||
|
image_mean: Optional[Union[float, List[float]]] = None,
|
||||||
|
image_std: Optional[Union[float, List[float]]] = None,
|
||||||
|
rescale_factor: Optional[float] = None,
|
||||||
|
do_rescale: Optional[bool] = None,
|
||||||
|
do_normalize: Optional[bool] = None,
|
||||||
|
resample: Optional[PILImageResampling] = None,
|
||||||
|
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||||
|
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
||||||
|
input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Main preprocessing method for images/videos.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
images: Input image/video data
|
||||||
|
min_pixels: Override for minimum pixels
|
||||||
|
max_pixels: Override for maximum pixels
|
||||||
|
image_mean: Override for normalization mean
|
||||||
|
image_std: Override for normalization std
|
||||||
|
rescale_factor: Override for rescaling factor
|
||||||
|
do_rescale: Override for rescaling flag
|
||||||
|
do_normalize: Override for normalization flag
|
||||||
|
resample: Override for resampling method
|
||||||
|
return_tensors: Desired output tensor format
|
||||||
|
data_format: Output channel dimension format
|
||||||
|
input_data_format: Input channel dimension format
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
BatchFeature: Processed features containing:
|
||||||
|
- pixel_values: Preprocessed pixel data
|
||||||
|
- grid_thw: Grid dimensions [temporal, height, width]
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: For invalid image types or dimensions
|
||||||
|
"""
|
||||||
|
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
|
||||||
|
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
|
||||||
|
image_mean = image_mean if image_mean is not None else self.image_mean
|
||||||
|
image_std = image_std if image_std is not None else self.image_std
|
||||||
|
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||||
|
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||||
|
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
||||||
|
resample = resample if resample is not None else self.resample
|
||||||
|
|
||||||
|
if images is not None and not valid_images(images):
|
||||||
|
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
|
||||||
|
|
||||||
|
pixel_values, grid_thw = self._preprocess(
|
||||||
|
images,
|
||||||
|
min_pixels=min_pixels,
|
||||||
|
max_pixels=max_pixels,
|
||||||
|
image_mean=image_mean,
|
||||||
|
image_std=image_std,
|
||||||
|
rescale_factor=rescale_factor,
|
||||||
|
do_rescale=do_rescale,
|
||||||
|
do_normalize=do_normalize,
|
||||||
|
resample=resample,
|
||||||
|
data_format=data_format,
|
||||||
|
input_data_format=input_data_format,
|
||||||
|
)
|
||||||
|
data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
|
||||||
|
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
@@ -0,0 +1,332 @@
|
|||||||
|
"""
|
||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
import PIL
|
||||||
|
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||||
|
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||||
|
from paddleformers.transformers.image_transforms import (
|
||||||
|
normalize,
|
||||||
|
rescale,
|
||||||
|
resize,
|
||||||
|
to_channel_dimension_format,
|
||||||
|
)
|
||||||
|
from paddleformers.transformers.image_utils import (
|
||||||
|
ChannelDimension,
|
||||||
|
ImageInput,
|
||||||
|
PILImageResampling,
|
||||||
|
get_image_size,
|
||||||
|
infer_channel_dimension_format,
|
||||||
|
make_list_of_images,
|
||||||
|
to_numpy_array,
|
||||||
|
valid_images,
|
||||||
|
)
|
||||||
|
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||||
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||||
|
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||||
|
|
||||||
|
MIN_PIXELS = 4 * 28 * 28
|
||||||
|
MAX_PIXELS = 16384 * 28 * 28
|
||||||
|
|
||||||
|
|
||||||
|
VideoInput = Union[
|
||||||
|
List["PIL.Image.Image"],
|
||||||
|
"np.ndarray",
|
||||||
|
"paddle.Tensor",
|
||||||
|
List["np.ndarray"],
|
||||||
|
List["paddle.Tensor"],
|
||||||
|
List[List["PIL.Image.Image"]],
|
||||||
|
List[List["np.ndarray"]],
|
||||||
|
List[List["paddle.Tensor"]],
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class ImageProcessor(BaseImageProcessor):
|
||||||
|
"""
|
||||||
|
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||||
|
|
||||||
|
This processor handles image resizing, rescaling, normalization and format conversion.
|
||||||
|
It dynamically adjusts image dimensions based on original size and specified constraints.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
patch_size: int = 14,
|
||||||
|
merge_size: int = 2,
|
||||||
|
temporal_patch_size: int = 2,
|
||||||
|
min_pixels: int = MIN_PIXELS,
|
||||||
|
max_pixels: int = MAX_PIXELS,
|
||||||
|
image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
|
||||||
|
image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
|
||||||
|
rescale_factor: float = 1 / 255,
|
||||||
|
do_rescale: bool = True,
|
||||||
|
do_normalize: bool = True,
|
||||||
|
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
||||||
|
**kwargs,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Initialize image processor with configuration parameters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
patch_size (int): Spatial patch size for vision encoder
|
||||||
|
merge_size (int): Merge size between vision and LLM encoders
|
||||||
|
temporal_patch_size (int): Temporal patch size for video processing
|
||||||
|
min_pixels (int): Minimum allowed pixels in resized image
|
||||||
|
max_pixels (int): Maximum allowed pixels in resized image
|
||||||
|
image_mean (float/list): Mean values for normalization per channel
|
||||||
|
image_std (float/list): Std values for normalization per channel
|
||||||
|
rescale_factor (float): Scaling factor for pixel values (default 1/255)
|
||||||
|
do_rescale (bool): Whether to rescale images
|
||||||
|
do_normalize (bool): Whether to normalize images
|
||||||
|
resample: Resampling method for image resizing
|
||||||
|
**kwargs: Additional base class arguments
|
||||||
|
"""
|
||||||
|
super().__init__(**kwargs)
|
||||||
|
self.patch_size = patch_size
|
||||||
|
self.merge_size = merge_size
|
||||||
|
self.temporal_patch_size = temporal_patch_size
|
||||||
|
|
||||||
|
self.min_pixels = min_pixels
|
||||||
|
self.max_pixels = max_pixels
|
||||||
|
|
||||||
|
self.image_mean = image_mean
|
||||||
|
self.image_std = image_std
|
||||||
|
self.rescale_factor = rescale_factor
|
||||||
|
self.do_rescale = do_rescale
|
||||||
|
self.do_normalize = do_normalize
|
||||||
|
|
||||||
|
self.resample = resample
|
||||||
|
|
||||||
|
def _preprocess(
|
||||||
|
self,
|
||||||
|
images: Union[ImageInput, VideoInput],
|
||||||
|
min_pixels: int,
|
||||||
|
max_pixels: int,
|
||||||
|
image_mean: Optional[Union[float, List[float]]],
|
||||||
|
image_std: Optional[Union[float, List[float]]],
|
||||||
|
rescale_factor: float,
|
||||||
|
do_rescale: bool,
|
||||||
|
do_normalize: bool,
|
||||||
|
resample: PILImageResampling,
|
||||||
|
data_format: Optional[ChannelDimension],
|
||||||
|
input_data_format: Optional[Union[str, ChannelDimension]],
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Internal method for image preprocessing pipeline.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
images: Input image or batch of images
|
||||||
|
min_pixels: Minimum allowed pixels in output
|
||||||
|
max_pixels: Maximum allowed pixels in output
|
||||||
|
image_mean: Normalization mean values
|
||||||
|
image_std: Normalization std values
|
||||||
|
rescale_factor: Pixel value scaling factor
|
||||||
|
do_rescale: Whether to rescale pixel values
|
||||||
|
do_normalize: Whether to normalize pixel values
|
||||||
|
resample: Resampling method
|
||||||
|
data_format: Output channel format
|
||||||
|
input_data_format: Input channel format
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
tuple: (flatten_patches, grid_dimensions)
|
||||||
|
- flatten_patches: Flattened image patches
|
||||||
|
- grid_dimensions: Grid dimensions [t, h, w]
|
||||||
|
"""
|
||||||
|
images = make_list_of_images(images)
|
||||||
|
|
||||||
|
# All transformations expect numpy arrays.
|
||||||
|
images = [to_numpy_array(image) for image in images]
|
||||||
|
|
||||||
|
if is_scaled_image(images[0]) and do_rescale:
|
||||||
|
data_processor_logger.warning(
|
||||||
|
"It looks like you are trying to rescale already rescaled images. If the input"
|
||||||
|
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
||||||
|
)
|
||||||
|
if input_data_format is None:
|
||||||
|
# We assume that all images have the same channel dimension format.
|
||||||
|
input_data_format = infer_channel_dimension_format(images[0])
|
||||||
|
|
||||||
|
# Get original dimensions and calculate optimal resize dimensions
|
||||||
|
height, width = get_image_size(images[0], channel_dim=input_data_format)
|
||||||
|
resized_height, resized_width = smart_resize(
|
||||||
|
height,
|
||||||
|
width,
|
||||||
|
factor=self.patch_size * self.merge_size, # Combine patch and merge factors
|
||||||
|
min_pixels=min_pixels,
|
||||||
|
max_pixels=max_pixels,
|
||||||
|
)
|
||||||
|
|
||||||
|
processed_images = []
|
||||||
|
for image in images:
|
||||||
|
if height != resized_height or width != resized_width:
|
||||||
|
# Convert to uint8 before resizing to avoid double scaling
|
||||||
|
image = image.astype("uint8")
|
||||||
|
# Convert to PIL Image and resize
|
||||||
|
image = Image.fromarray(image)
|
||||||
|
image = resize(
|
||||||
|
image,
|
||||||
|
size=(resized_height, resized_width),
|
||||||
|
resample=resample,
|
||||||
|
data_format=input_data_format,
|
||||||
|
)
|
||||||
|
|
||||||
|
if do_rescale and do_normalize:
|
||||||
|
# Adjust mean and std for combined rescale+normalize
|
||||||
|
image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
|
||||||
|
image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
|
||||||
|
do_rescale = False # Skip separate rescale step
|
||||||
|
|
||||||
|
if do_rescale:
|
||||||
|
image = image.astype(np.float32)
|
||||||
|
image = rescale(image, scale=rescale_factor, data_format=input_data_format)
|
||||||
|
|
||||||
|
if do_normalize:
|
||||||
|
image = image.astype(np.float32)
|
||||||
|
image = normalize(
|
||||||
|
image=image,
|
||||||
|
mean=image_mean,
|
||||||
|
std=image_std,
|
||||||
|
data_format=input_data_format,
|
||||||
|
)
|
||||||
|
|
||||||
|
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
|
||||||
|
processed_images.append(image)
|
||||||
|
|
||||||
|
# Convert processed images to numpy array
|
||||||
|
patches = np.array(processed_images)
|
||||||
|
|
||||||
|
# Pad temporal dimension if needed
|
||||||
|
if patches.shape[0] % self.temporal_patch_size != 0:
|
||||||
|
repeats = np.repeat(
|
||||||
|
patches[-1][np.newaxis],
|
||||||
|
self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
|
||||||
|
axis=0,
|
||||||
|
)
|
||||||
|
patches = np.concatenate([patches, repeats], axis=0)
|
||||||
|
|
||||||
|
# Convert to channels-first format if needed
|
||||||
|
if data_format == ChannelDimension.LAST:
|
||||||
|
patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W]
|
||||||
|
|
||||||
|
grid_t, channel = patches.shape[:2]
|
||||||
|
grid_t = grid_t // self.temporal_patch_size
|
||||||
|
|
||||||
|
grid_h, grid_w = (
|
||||||
|
resized_height // self.patch_size,
|
||||||
|
resized_width // self.patch_size,
|
||||||
|
)
|
||||||
|
# Reshape into hierarchical patch structure
|
||||||
|
patches = patches.reshape(
|
||||||
|
[
|
||||||
|
grid_t,
|
||||||
|
self.temporal_patch_size,
|
||||||
|
channel,
|
||||||
|
grid_h // self.merge_size,
|
||||||
|
self.merge_size,
|
||||||
|
self.patch_size,
|
||||||
|
grid_w // self.merge_size,
|
||||||
|
self.merge_size,
|
||||||
|
self.patch_size,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
# Reorder dimensions for better memory access pattern
|
||||||
|
# [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
|
||||||
|
patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
|
||||||
|
|
||||||
|
flatten_patches = patches.reshape(
|
||||||
|
[
|
||||||
|
grid_t * grid_h * grid_w,
|
||||||
|
channel * self.temporal_patch_size * self.patch_size * self.patch_size,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
return flatten_patches, np.array([grid_t, grid_h, grid_w])
|
||||||
|
|
||||||
|
def preprocess(
|
||||||
|
self,
|
||||||
|
images: Union[ImageInput, VideoInput],
|
||||||
|
min_pixels: Optional[int] = None,
|
||||||
|
max_pixels: Optional[int] = None,
|
||||||
|
image_mean: Optional[Union[float, List[float]]] = None,
|
||||||
|
image_std: Optional[Union[float, List[float]]] = None,
|
||||||
|
rescale_factor: Optional[float] = None,
|
||||||
|
do_rescale: Optional[bool] = None,
|
||||||
|
do_normalize: Optional[bool] = None,
|
||||||
|
resample: Optional[PILImageResampling] = None,
|
||||||
|
return_tensors: Optional[Union[str, TensorType]] = None,
|
||||||
|
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
||||||
|
input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Main preprocessing method for images/videos.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
images: Input image/video data
|
||||||
|
min_pixels: Override for minimum pixels
|
||||||
|
max_pixels: Override for maximum pixels
|
||||||
|
image_mean: Override for normalization mean
|
||||||
|
image_std: Override for normalization std
|
||||||
|
rescale_factor: Override for rescaling factor
|
||||||
|
do_rescale: Override for rescaling flag
|
||||||
|
do_normalize: Override for normalization flag
|
||||||
|
resample: Override for resampling method
|
||||||
|
return_tensors: Desired output tensor format
|
||||||
|
data_format: Output channel dimension format
|
||||||
|
input_data_format: Input channel dimension format
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
BatchFeature: Processed features containing:
|
||||||
|
- pixel_values: Preprocessed pixel data
|
||||||
|
- grid_thw: Grid dimensions [temporal, height, width]
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: For invalid image types or dimensions
|
||||||
|
"""
|
||||||
|
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
|
||||||
|
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
|
||||||
|
image_mean = image_mean if image_mean is not None else self.image_mean
|
||||||
|
image_std = image_std if image_std is not None else self.image_std
|
||||||
|
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
||||||
|
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
||||||
|
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
||||||
|
resample = resample if resample is not None else self.resample
|
||||||
|
|
||||||
|
if images is not None and not valid_images(images):
|
||||||
|
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
|
||||||
|
|
||||||
|
pixel_values, grid_thw = self._preprocess(
|
||||||
|
images,
|
||||||
|
min_pixels=min_pixels,
|
||||||
|
max_pixels=max_pixels,
|
||||||
|
image_mean=image_mean,
|
||||||
|
image_std=image_std,
|
||||||
|
rescale_factor=rescale_factor,
|
||||||
|
do_rescale=do_rescale,
|
||||||
|
do_normalize=do_normalize,
|
||||||
|
resample=resample,
|
||||||
|
data_format=data_format,
|
||||||
|
input_data_format=input_data_format,
|
||||||
|
)
|
||||||
|
data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
|
||||||
|
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||||
@@ -0,0 +1,453 @@
|
|||||||
|
"""
|
||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
"""Unified multimodal processor for all VL model types.
|
||||||
|
|
||||||
|
Consolidates the four separate VL processor wrappers (QwenVLProcessor,
|
||||||
|
Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a
|
||||||
|
single class that dispatches per ``model_type``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections.abc import Mapping
|
||||||
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from fastdeploy.input.base_processor import BaseTextProcessor
|
||||||
|
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
|
||||||
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
QWEN_VL = "qwen_vl"
|
||||||
|
QWEN3_VL = "qwen3_vl"
|
||||||
|
PADDLEOCR_VL = "paddleocr_vl"
|
||||||
|
ERNIE4_5_VL = "ernie4_5_vl"
|
||||||
|
|
||||||
|
_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL}
|
||||||
|
|
||||||
|
_QWEN_EXPECTED_KWARGS = {
|
||||||
|
"video_max_frames": int,
|
||||||
|
"video_min_frames": int,
|
||||||
|
}
|
||||||
|
|
||||||
|
_ERNIE_EXPECTED_KWARGS = {
|
||||||
|
"spatial_conv_size": int,
|
||||||
|
"temporal_conv_size": int,
|
||||||
|
"image_min_pixels": int,
|
||||||
|
"image_max_pixels": int,
|
||||||
|
"video_min_pixels": int,
|
||||||
|
"video_max_pixels": int,
|
||||||
|
"video_target_frames": int,
|
||||||
|
"video_frames_sample": str,
|
||||||
|
"video_max_frames": int,
|
||||||
|
"video_min_frames": int,
|
||||||
|
"video_fps": int,
|
||||||
|
}
|
||||||
|
|
||||||
|
_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}
|
||||||
|
|
||||||
|
_SAMPLING_EPS = 1e-5
|
||||||
|
|
||||||
|
|
||||||
|
class MultiModalProcessor(BaseTextProcessor):
|
||||||
|
"""Unified multimodal processor for all supported VL model types.
|
||||||
|
|
||||||
|
Dispatches image-processor creation, config initialisation, and
|
||||||
|
encoding logic based on ``model_type``.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model_name_or_path: str,
|
||||||
|
model_type: str,
|
||||||
|
config=None,
|
||||||
|
limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
|
||||||
|
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
|
||||||
|
reasoning_parser_obj=None,
|
||||||
|
tool_parser_obj=None,
|
||||||
|
enable_processor_cache: bool = False,
|
||||||
|
):
|
||||||
|
if model_type not in _SUPPORTED_MODEL_TYPES:
|
||||||
|
raise ValueError(
|
||||||
|
f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}."
|
||||||
|
)
|
||||||
|
self.model_type = model_type
|
||||||
|
self.config = config
|
||||||
|
self.enable_processor_cache = enable_processor_cache
|
||||||
|
|
||||||
|
tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto"
|
||||||
|
|
||||||
|
super().__init__(
|
||||||
|
model_name_or_path,
|
||||||
|
tokenizer_type=tokenizer_type,
|
||||||
|
reasoning_parser_obj=reasoning_parser_obj,
|
||||||
|
tool_parser_obj=tool_parser_obj,
|
||||||
|
)
|
||||||
|
|
||||||
|
data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
|
||||||
|
|
||||||
|
processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
|
||||||
|
self._init_mm_processor(processor_kwargs)
|
||||||
|
self._init_mm_config()
|
||||||
|
self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
|
||||||
|
|
||||||
|
def _load_tokenizer(self):
|
||||||
|
"""Load the appropriate tokenizer based on model_type."""
|
||||||
|
if self.tokenizer_type == "ernie4_5":
|
||||||
|
import os
|
||||||
|
|
||||||
|
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
|
||||||
|
|
||||||
|
vocab_file_names = ["tokenizer.model", "spm.model", "ernie_token_100k.model"]
|
||||||
|
for name in vocab_file_names:
|
||||||
|
if os.path.exists(os.path.join(self.model_name_or_path, name)):
|
||||||
|
Ernie4_5Tokenizer.resource_files_names["vocab_file"] = name
|
||||||
|
break
|
||||||
|
tokenizer = Ernie4_5Tokenizer.from_pretrained(self.model_name_or_path)
|
||||||
|
else:
|
||||||
|
from paddleformers.transformers import AutoTokenizer
|
||||||
|
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
|
||||||
|
return tokenizer
|
||||||
|
|
||||||
|
def _init_mm_processor(self, processor_kwargs: dict):
|
||||||
|
"""Create the model-type-specific internal DataProcessor."""
|
||||||
|
if self.model_type == QWEN_VL:
|
||||||
|
from fastdeploy.input.qwen_vl_processor.process import DataProcessor
|
||||||
|
|
||||||
|
tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
|
||||||
|
self.processor = DataProcessor(
|
||||||
|
model_path=self.model_name_or_path,
|
||||||
|
enable_processor_cache=self.enable_processor_cache,
|
||||||
|
tokens_per_second=tokens_per_second,
|
||||||
|
tokenizer=self.tokenizer,
|
||||||
|
**processor_kwargs,
|
||||||
|
)
|
||||||
|
elif self.model_type == QWEN3_VL:
|
||||||
|
from fastdeploy.input.qwen3_vl_processor.process import DataProcessor
|
||||||
|
|
||||||
|
self.processor = DataProcessor(
|
||||||
|
model_path=self.model_name_or_path,
|
||||||
|
enable_processor_cache=self.enable_processor_cache,
|
||||||
|
tokenizer=self.tokenizer,
|
||||||
|
**processor_kwargs,
|
||||||
|
)
|
||||||
|
elif self.model_type == PADDLEOCR_VL:
|
||||||
|
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
|
||||||
|
|
||||||
|
tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
|
||||||
|
self.processor = DataProcessor(
|
||||||
|
model_path=self.model_name_or_path,
|
||||||
|
enable_processor_cache=self.enable_processor_cache,
|
||||||
|
tokens_per_second=tokens_per_second,
|
||||||
|
tokenizer=self.tokenizer,
|
||||||
|
**processor_kwargs,
|
||||||
|
)
|
||||||
|
elif self.model_type == ERNIE4_5_VL:
|
||||||
|
from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor
|
||||||
|
|
||||||
|
self.processor = DataProcessor(
|
||||||
|
tokenizer_name=self.model_name_or_path,
|
||||||
|
image_preprocessor_name=self.model_name_or_path,
|
||||||
|
enable_processor_cache=self.enable_processor_cache,
|
||||||
|
**processor_kwargs,
|
||||||
|
)
|
||||||
|
self.processor.eval()
|
||||||
|
|
||||||
|
def _init_mm_config(self):
|
||||||
|
"""Set model-type-specific multimodal configuration attributes."""
|
||||||
|
if self.model_type in (QWEN_VL, QWEN3_VL):
|
||||||
|
self.image_patch_id = self.processor.image_token_id
|
||||||
|
elif self.model_type == PADDLEOCR_VL:
|
||||||
|
self.image_patch_id = self.processor.image_patch_id
|
||||||
|
elif self.model_type == ERNIE4_5_VL:
|
||||||
|
self.image_patch_id = self.processor.image_patch_id
|
||||||
|
self.spatial_conv_size = self.processor.spatial_conv_size
|
||||||
|
|
||||||
|
def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
|
||||||
|
"""Parse and validate multimodal processor kwargs."""
|
||||||
|
if not kwargs:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not isinstance(kwargs, dict):
|
||||||
|
raise ValueError("mm-processor-kwargs must be a dictionary")
|
||||||
|
|
||||||
|
data_processor_logger.info(f"Processing kwargs: {kwargs}")
|
||||||
|
|
||||||
|
if self.model_type == ERNIE4_5_VL:
|
||||||
|
expected_types = _ERNIE_EXPECTED_KWARGS
|
||||||
|
else:
|
||||||
|
expected_types = _QWEN_EXPECTED_KWARGS
|
||||||
|
|
||||||
|
for key, value in kwargs.items():
|
||||||
|
if key in expected_types and not isinstance(value, expected_types[key]):
|
||||||
|
raise ValueError(
|
||||||
|
f"Invalid type for {key}: expected "
|
||||||
|
f"{expected_types[key].__name__}, got {type(value).__name__}"
|
||||||
|
)
|
||||||
|
return kwargs
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def _parse_limits(self, limits: Optional[dict]) -> dict:
|
||||||
|
"""Parse multimodal input limits, merging with defaults."""
|
||||||
|
if not limits:
|
||||||
|
return dict(_DEFAULT_MM_LIMITS)
|
||||||
|
|
||||||
|
try:
|
||||||
|
if not isinstance(limits, dict):
|
||||||
|
raise ValueError("limit-mm-per-prompt must be a dictionary")
|
||||||
|
data_processor_logger.info(f"_parse_limits:{limits}")
|
||||||
|
return {**_DEFAULT_MM_LIMITS, **limits}
|
||||||
|
except Exception as e:
|
||||||
|
data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
|
||||||
|
return dict(_DEFAULT_MM_LIMITS)
|
||||||
|
|
||||||
|
def _check_mm_limits(self, item):
|
||||||
|
"""Validate multimodal inputs against configured limits."""
|
||||||
|
if isinstance(item, dict):
|
||||||
|
mm_data = item
|
||||||
|
else:
|
||||||
|
mm_data = {"image": [], "video": []}
|
||||||
|
for message in item:
|
||||||
|
if isinstance(message.get("content"), list):
|
||||||
|
for part in message["content"]:
|
||||||
|
part_type = part.get("type")
|
||||||
|
if part_type in ("image_url", "image"):
|
||||||
|
mm_data["image"].append(part)
|
||||||
|
elif part_type in ("video_url", "video"):
|
||||||
|
mm_data["video"].append(part)
|
||||||
|
|
||||||
|
for modality, data in mm_data.items():
|
||||||
|
if modality in self.limit_mm_per_prompt:
|
||||||
|
limit = self.limit_mm_per_prompt[modality]
|
||||||
|
if len(data) > limit:
|
||||||
|
raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
|
||||||
|
|
||||||
|
def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
|
||||||
|
"""Return per-modality max token counts, if available."""
|
||||||
|
if self.model_type == ERNIE4_5_VL:
|
||||||
|
return self.processor.get_mm_max_tokens_per_item(seq_len)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def process_request_dict(self, request, max_model_len=None):
|
||||||
|
"""Process a request dictionary into model inputs.
|
||||||
|
|
||||||
|
Unified template-method flow for all VL model types. Per-model
|
||||||
|
differences are handled by small conditional branches rather than
|
||||||
|
duplicating the entire pipeline.
|
||||||
|
"""
|
||||||
|
request = self._apply_default_parameters(request)
|
||||||
|
|
||||||
|
if not request.get("eos_token_ids"):
|
||||||
|
request["eos_token_ids"] = self.eos_token_ids
|
||||||
|
|
||||||
|
self._process_stop_tokens(request)
|
||||||
|
|
||||||
|
if self.model_type != PADDLEOCR_VL:
|
||||||
|
self._process_bad_words(request)
|
||||||
|
|
||||||
|
if self.model_type == ERNIE4_5_VL:
|
||||||
|
logits_processors_args = self._prepare_think_stop_sentence(
|
||||||
|
request.get("logits_processors_args") or {}, max_model_len
|
||||||
|
)
|
||||||
|
request["logits_processors_args"] = logits_processors_args
|
||||||
|
|
||||||
|
outputs = self._tokenize_request(request)
|
||||||
|
|
||||||
|
self._process_post_tokens(request, outputs)
|
||||||
|
|
||||||
|
if self.model_type in (QWEN_VL, QWEN3_VL):
|
||||||
|
request["enable_thinking"] = False
|
||||||
|
|
||||||
|
outputs = self.pack_outputs(outputs)
|
||||||
|
|
||||||
|
if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"):
|
||||||
|
pass # preserve existing prompt_token_ids
|
||||||
|
else:
|
||||||
|
request["prompt_token_ids"] = outputs["input_ids"].tolist()
|
||||||
|
request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
|
||||||
|
request["multimodal_inputs"] = outputs
|
||||||
|
|
||||||
|
if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
|
||||||
|
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
|
||||||
|
|
||||||
|
if self.model_type == ERNIE4_5_VL:
|
||||||
|
logits_processors_args = self._update_thinking_prompt_state(
|
||||||
|
request["prompt_token_ids"], request.get("logits_processors_args") or {}
|
||||||
|
)
|
||||||
|
request["logits_processors_args"] = logits_processors_args
|
||||||
|
|
||||||
|
max_tokens = max_model_len - len(request["prompt_token_ids"])
|
||||||
|
if request.get("max_tokens") is None:
|
||||||
|
request["max_tokens"] = max(1, max_tokens)
|
||||||
|
else:
|
||||||
|
request["max_tokens"] = min(max_tokens, request["max_tokens"])
|
||||||
|
|
||||||
|
if self.model_type == ERNIE4_5_VL and request.get("reasoning_max_tokens") is None:
|
||||||
|
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
|
||||||
|
|
||||||
|
if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL):
|
||||||
|
if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
|
||||||
|
request["top_p"] = _SAMPLING_EPS
|
||||||
|
request["top_k"] = 1
|
||||||
|
|
||||||
|
if self.model_type != QWEN3_VL and self.reasoning_parser:
|
||||||
|
self._apply_reasoning_parser(request)
|
||||||
|
|
||||||
|
if self.model_type == ERNIE4_5_VL:
|
||||||
|
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
||||||
|
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
||||||
|
|
||||||
|
data_processor_logger.info(f"Processed request {request}")
|
||||||
|
return request
|
||||||
|
|
||||||
|
def _process_stop_tokens(self, request):
|
||||||
|
"""Handle stop token processing based on model type."""
|
||||||
|
if self.model_type == QWEN3_VL:
|
||||||
|
stop_sequences = request.get("stop", [])
|
||||||
|
if stop_sequences:
|
||||||
|
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
|
||||||
|
request["stop_token_ids"] = stop_seqs
|
||||||
|
request["stop_seqs_len"] = stop_seqs_len
|
||||||
|
else:
|
||||||
|
process_stop_token_ids(request, self.update_stop_seq)
|
||||||
|
|
||||||
|
def _process_bad_words(self, request):
|
||||||
|
"""Process bad_words into token ids."""
|
||||||
|
bad_words = request.get("bad_words")
|
||||||
|
bad_words_token_ids = request.get("bad_words_token_ids")
|
||||||
|
if bad_words:
|
||||||
|
bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
|
||||||
|
request["bad_words_token_ids"] = bad_words_token_ids
|
||||||
|
|
||||||
|
def _tokenize_request(self, request):
|
||||||
|
"""Core tokenization dispatch: prompt_token_ids > prompt > messages."""
|
||||||
|
default_thinking = True if self.model_type == ERNIE4_5_VL else False
|
||||||
|
|
||||||
|
if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL):
|
||||||
|
messages = request.get("messages")
|
||||||
|
if messages:
|
||||||
|
self._check_mm_limits(messages)
|
||||||
|
request.setdefault("enable_thinking", default_thinking)
|
||||||
|
return self.processor.prompt_token_ids2outputs(request)
|
||||||
|
|
||||||
|
elif request.get("prompt"):
|
||||||
|
multimodal_data = request.get("multimodal_data") or {}
|
||||||
|
self._check_mm_limits(multimodal_data)
|
||||||
|
images = multimodal_data.get("image", None)
|
||||||
|
videos = multimodal_data.get("video", None)
|
||||||
|
if self.model_type == ERNIE4_5_VL:
|
||||||
|
request["prompt_tokens"] = request.get("prompt")
|
||||||
|
request.setdefault("enable_thinking", default_thinking)
|
||||||
|
return self.processor.text2ids(request["prompt"], images, videos)
|
||||||
|
|
||||||
|
elif request.get("messages"):
|
||||||
|
messages = request["messages"]
|
||||||
|
self._check_mm_limits(messages)
|
||||||
|
chat_template_kwargs = request.get("chat_template_kwargs")
|
||||||
|
if chat_template_kwargs:
|
||||||
|
if isinstance(chat_template_kwargs, dict):
|
||||||
|
for k, v in chat_template_kwargs.items():
|
||||||
|
if k not in request or request[k] is None:
|
||||||
|
request[k] = v
|
||||||
|
else:
|
||||||
|
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
|
||||||
|
request.setdefault("enable_thinking", default_thinking)
|
||||||
|
return self.processor.request2ids(request)
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
|
||||||
|
|
||||||
|
def _process_post_tokens(self, request, outputs):
|
||||||
|
"""Handle post-tokenization token appending."""
|
||||||
|
if self.model_type == PADDLEOCR_VL:
|
||||||
|
metadata = request.get("metadata")
|
||||||
|
if metadata and metadata.get("generated_token_ids"):
|
||||||
|
self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"])
|
||||||
|
else:
|
||||||
|
if request.get("completion_token_ids"):
|
||||||
|
self.append_completion_tokens(outputs, request["completion_token_ids"])
|
||||||
|
|
||||||
|
def _apply_reasoning_parser(self, request):
|
||||||
|
"""Apply reasoning parser and update model status dict."""
|
||||||
|
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
|
||||||
|
parts = request["request_id"].split("_")
|
||||||
|
if len(parts) > 1:
|
||||||
|
real_req_id = parts[0]
|
||||||
|
index = int(parts[1])
|
||||||
|
n = request.get("n", 1)
|
||||||
|
for idx in range(index * n, (index + 1) * n):
|
||||||
|
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
|
||||||
|
else:
|
||||||
|
self.model_status_dict[request["request_id"]] = model_status
|
||||||
|
request["enable_thinking"] = model_status == "think_start"
|
||||||
|
|
||||||
|
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
|
||||||
|
"""Append completion tokens to existing multimodal outputs."""
|
||||||
|
if self.model_type == ERNIE4_5_VL:
|
||||||
|
self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids)
|
||||||
|
else:
|
||||||
|
self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids)
|
||||||
|
|
||||||
|
def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids):
|
||||||
|
"""Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl."""
|
||||||
|
num_tokens = len(completion_token_ids)
|
||||||
|
multimodal_inputs["input_ids"].extend(completion_token_ids)
|
||||||
|
multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
|
||||||
|
|
||||||
|
pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
|
||||||
|
multimodal_inputs["position_ids"].append(pos_ids)
|
||||||
|
multimodal_inputs["cur_position"] += num_tokens
|
||||||
|
|
||||||
|
def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids):
|
||||||
|
"""Append completion tokens for ernie4_5_vl."""
|
||||||
|
num_tokens = len(completion_token_ids)
|
||||||
|
multimodal_inputs["input_ids"].extend(completion_token_ids)
|
||||||
|
multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
|
||||||
|
|
||||||
|
start = multimodal_inputs["cur_position"]
|
||||||
|
for i in range(num_tokens):
|
||||||
|
multimodal_inputs["position_ids"].append([start + i] * 3)
|
||||||
|
multimodal_inputs["cur_position"] += num_tokens
|
||||||
|
|
||||||
|
def pack_outputs(self, outputs):
|
||||||
|
"""Convert intermediate processing outputs to final format."""
|
||||||
|
if not outputs["images"]:
|
||||||
|
outputs["images"] = None
|
||||||
|
outputs["grid_thw"] = None
|
||||||
|
outputs["image_type_ids"] = None
|
||||||
|
else:
|
||||||
|
outputs["images"] = np.vstack(outputs["images"])
|
||||||
|
outputs["grid_thw"] = np.vstack(outputs["grid_thw"])
|
||||||
|
outputs["image_type_ids"] = np.array(outputs["image_type_ids"])
|
||||||
|
|
||||||
|
outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
|
||||||
|
outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
|
||||||
|
outputs["mm_num_token_func"] = self.processor.mm_num_tokens
|
||||||
|
|
||||||
|
if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
|
||||||
|
outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
|
||||||
|
outputs["image_patch_id"] = self.processor.image_token_id
|
||||||
|
outputs["video_patch_id"] = self.processor.video_token_id
|
||||||
|
outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
|
||||||
|
else:
|
||||||
|
outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
|
||||||
|
outputs["image_patch_id"] = self.image_patch_id
|
||||||
|
|
||||||
|
return outputs
|
||||||
@@ -14,216 +14,12 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
"""Image processor class for Keye."""
|
# Backward compatibility: this module has been migrated to
|
||||||
|
# fastdeploy.input.image_processors.paddleocr_processor
|
||||||
|
# This file will be removed in a future version.
|
||||||
|
|
||||||
# TODO: Support videos
|
from fastdeploy.input.image_processors.paddleocr_processor import ( # noqa: F401
|
||||||
|
ImageProcessor,
|
||||||
import json
|
make_batched_images,
|
||||||
from pathlib import Path
|
smart_resize,
|
||||||
from typing import Dict, List, Optional, Union
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
|
||||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
|
||||||
from paddleformers.transformers.image_utils import (
|
|
||||||
ImageInput,
|
|
||||||
is_valid_image,
|
|
||||||
make_list_of_images,
|
|
||||||
to_numpy_array,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
from fastdeploy.input.image_processors.common import (
|
|
||||||
smart_resize_paddleocr as smart_resize,
|
|
||||||
)
|
|
||||||
|
|
||||||
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
|
||||||
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
|
||||||
|
|
||||||
|
|
||||||
def make_batched_images(images) -> List[List[ImageInput]]:
|
|
||||||
"""
|
|
||||||
Accepts images in list or nested list format, and makes a list of images for preprocessing.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
|
|
||||||
The input image.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
list: A list of images.
|
|
||||||
"""
|
|
||||||
if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
|
|
||||||
return [img for img_list in images for img in img_list]
|
|
||||||
|
|
||||||
elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
|
|
||||||
return images
|
|
||||||
|
|
||||||
elif is_valid_image(images):
|
|
||||||
return [images]
|
|
||||||
|
|
||||||
raise ValueError(f"Could not make batched images from {images}")
|
|
||||||
|
|
||||||
|
|
||||||
def adjust_size(size, patch_size):
|
|
||||||
num_patches = size // patch_size
|
|
||||||
if num_patches % 2 != 0:
|
|
||||||
num_patches -= 1
|
|
||||||
return num_patches * patch_size
|
|
||||||
|
|
||||||
|
|
||||||
class ImageProcessor(BaseImageProcessor):
|
|
||||||
model_input_names = [
|
|
||||||
"pixel_values",
|
|
||||||
"image_grid_thw",
|
|
||||||
"pixel_values_videos",
|
|
||||||
"video_grid_thw",
|
|
||||||
]
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
do_resize: bool = True,
|
|
||||||
resample: int = 3,
|
|
||||||
do_rescale: bool = True,
|
|
||||||
rescale_factor: Union[int, float] = 1 / 255,
|
|
||||||
do_normalize: bool = True,
|
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
|
||||||
do_convert_rgb: bool = True,
|
|
||||||
min_pixels: int = 28 * 28 * 130,
|
|
||||||
max_pixels: int = 28 * 28 * 1280,
|
|
||||||
patch_size: int = 14,
|
|
||||||
temporal_patch_size: int = 1,
|
|
||||||
merge_size: int = 2,
|
|
||||||
**kwargs,
|
|
||||||
) -> None:
|
|
||||||
super().__init__()
|
|
||||||
self.do_resize = do_resize
|
|
||||||
self.resample = resample
|
|
||||||
self.do_rescale = do_rescale
|
|
||||||
self.rescale_factor = rescale_factor
|
|
||||||
self.do_normalize = do_normalize
|
|
||||||
self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
|
|
||||||
self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
|
|
||||||
self.min_pixels = min_pixels
|
|
||||||
self.max_pixels = max_pixels
|
|
||||||
self.patch_size = patch_size
|
|
||||||
self.temporal_patch_size = temporal_patch_size
|
|
||||||
self.merge_size = merge_size
|
|
||||||
self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} # not used
|
|
||||||
self.do_convert_rgb = do_convert_rgb
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_dir):
|
|
||||||
pretrained_model_dir = Path(pretrained_model_dir)
|
|
||||||
image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
|
|
||||||
with open(image_processor_config_path, "r", encoding="utf-8") as f:
|
|
||||||
image_processor_config = json.load(f)
|
|
||||||
return cls(**image_processor_config)
|
|
||||||
|
|
||||||
def _preprocess(
|
|
||||||
self,
|
|
||||||
images,
|
|
||||||
do_resize: Optional[bool] = None,
|
|
||||||
do_rescale: Optional[bool] = None,
|
|
||||||
rescale_factor: Optional[float] = None,
|
|
||||||
do_normalize: Optional[bool] = None,
|
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
|
||||||
do_convert_rgb: Optional[bool] = None,
|
|
||||||
):
|
|
||||||
images = make_list_of_images(images)
|
|
||||||
|
|
||||||
if do_convert_rgb:
|
|
||||||
images = [image.convert("RGB") for image in images]
|
|
||||||
|
|
||||||
width, height = images[0].size
|
|
||||||
resized_height, resized_width = height, width
|
|
||||||
processed_images = []
|
|
||||||
|
|
||||||
for image in images:
|
|
||||||
if do_resize:
|
|
||||||
resized_height, resized_width = smart_resize(
|
|
||||||
height,
|
|
||||||
width,
|
|
||||||
factor=self.patch_size * self.merge_size,
|
|
||||||
min_pixels=self.min_pixels,
|
|
||||||
max_pixels=self.max_pixels,
|
|
||||||
)
|
|
||||||
|
|
||||||
image = image.resize((resized_width, resized_height), resample=self.resample)
|
|
||||||
|
|
||||||
image = to_numpy_array(image)
|
|
||||||
|
|
||||||
if do_rescale:
|
|
||||||
image = (image * rescale_factor).astype(np.float32)
|
|
||||||
|
|
||||||
if do_normalize:
|
|
||||||
image = image.astype(np.float32)
|
|
||||||
image -= np.array(image_mean, dtype=np.float32)
|
|
||||||
image /= np.array(image_std, dtype=np.float32)
|
|
||||||
|
|
||||||
processed_images.append(image)
|
|
||||||
|
|
||||||
patches = np.array(processed_images)
|
|
||||||
patches = patches.transpose(0, 3, 1, 2)
|
|
||||||
if patches.shape[0] == 1:
|
|
||||||
patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
|
|
||||||
channel = patches.shape[1]
|
|
||||||
grid_t = patches.shape[0] // self.temporal_patch_size
|
|
||||||
grid_h, grid_w = (
|
|
||||||
resized_height // self.patch_size,
|
|
||||||
resized_width // self.patch_size,
|
|
||||||
)
|
|
||||||
|
|
||||||
patches = patches.reshape(
|
|
||||||
grid_t,
|
|
||||||
self.temporal_patch_size,
|
|
||||||
channel,
|
|
||||||
grid_h,
|
|
||||||
self.patch_size,
|
|
||||||
grid_w,
|
|
||||||
self.patch_size,
|
|
||||||
)
|
|
||||||
patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
|
|
||||||
assert self.temporal_patch_size == 1
|
|
||||||
flatten_patches = patches.reshape(grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size)
|
|
||||||
return flatten_patches, np.array([grid_t, grid_h, grid_w])
|
|
||||||
|
|
||||||
def preprocess(
|
|
||||||
self,
|
|
||||||
images,
|
|
||||||
videos=None,
|
|
||||||
do_resize: Optional[bool] = None,
|
|
||||||
size: Optional[Dict[str, int]] = None,
|
|
||||||
do_rescale: Optional[bool] = None,
|
|
||||||
rescale_factor: Optional[float] = None,
|
|
||||||
do_normalize: Optional[bool] = None,
|
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
|
||||||
do_convert_rgb: Optional[bool] = None,
|
|
||||||
return_tensors=None,
|
|
||||||
):
|
|
||||||
do_resize = do_resize if do_resize is not None else self.do_resize
|
|
||||||
size = size if size is not None else self.size
|
|
||||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
|
||||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
|
||||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
|
||||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
|
||||||
image_std = image_std if image_std is not None else self.image_std
|
|
||||||
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
|
|
||||||
|
|
||||||
if videos is not None:
|
|
||||||
raise NotImplementedError("Videos are not yet supported")
|
|
||||||
|
|
||||||
patches, image_grid_thw = self._preprocess(
|
|
||||||
images,
|
|
||||||
do_resize=do_resize,
|
|
||||||
do_rescale=do_rescale,
|
|
||||||
rescale_factor=rescale_factor,
|
|
||||||
do_normalize=do_normalize,
|
|
||||||
image_mean=image_mean,
|
|
||||||
image_std=image_std,
|
|
||||||
do_convert_rgb=do_convert_rgb,
|
|
||||||
)
|
|
||||||
pixel_values = np.array(patches)
|
|
||||||
data = {"pixel_values": pixel_values, "grid_thw": image_grid_thw}
|
|
||||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
|
||||||
|
|||||||
@@ -91,54 +91,34 @@ class InputPreprocessor:
|
|||||||
tool_parser_obj=tool_parser_obj,
|
tool_parser_obj=tool_parser_obj,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
from fastdeploy.input.multimodal_processor import (
|
||||||
|
ERNIE4_5_VL,
|
||||||
|
PADDLEOCR_VL,
|
||||||
|
QWEN3_VL,
|
||||||
|
QWEN_VL,
|
||||||
|
MultiModalProcessor,
|
||||||
|
)
|
||||||
|
|
||||||
if ErnieArchitectures.contains_ernie_arch(architecture):
|
if ErnieArchitectures.contains_ernie_arch(architecture):
|
||||||
from fastdeploy.input.ernie4_5_vl_processor import (
|
model_type = ERNIE4_5_VL
|
||||||
Ernie4_5_VLProcessor,
|
|
||||||
)
|
|
||||||
|
|
||||||
self.processor = Ernie4_5_VLProcessor(
|
|
||||||
model_name_or_path=self.model_name_or_path,
|
|
||||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
|
||||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
|
||||||
reasoning_parser_obj=reasoning_parser_obj,
|
|
||||||
tool_parser_obj=tool_parser_obj,
|
|
||||||
enable_processor_cache=self.enable_processor_cache,
|
|
||||||
)
|
|
||||||
elif "PaddleOCRVL" in architecture:
|
elif "PaddleOCRVL" in architecture:
|
||||||
from fastdeploy.input.paddleocr_vl_processor import (
|
model_type = PADDLEOCR_VL
|
||||||
PaddleOCRVLProcessor,
|
|
||||||
)
|
|
||||||
|
|
||||||
self.processor = PaddleOCRVLProcessor(
|
|
||||||
config=self.model_config,
|
|
||||||
model_name_or_path=self.model_name_or_path,
|
|
||||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
|
||||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
|
||||||
reasoning_parser_obj=reasoning_parser_obj,
|
|
||||||
)
|
|
||||||
elif "Qwen2_5_VL" in architecture:
|
elif "Qwen2_5_VL" in architecture:
|
||||||
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
|
model_type = QWEN_VL
|
||||||
|
|
||||||
self.processor = QwenVLProcessor(
|
|
||||||
config=self.model_config,
|
|
||||||
model_name_or_path=self.model_name_or_path,
|
|
||||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
|
||||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
|
||||||
reasoning_parser_obj=reasoning_parser_obj,
|
|
||||||
enable_processor_cache=self.enable_processor_cache,
|
|
||||||
)
|
|
||||||
elif "Qwen3VL" in architecture:
|
elif "Qwen3VL" in architecture:
|
||||||
from fastdeploy.input.qwen3_vl_processor import Qwen3VLProcessor
|
model_type = QWEN3_VL
|
||||||
|
|
||||||
self.processor = Qwen3VLProcessor(
|
|
||||||
config=self.model_config,
|
|
||||||
model_name_or_path=self.model_name_or_path,
|
|
||||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
|
||||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
|
||||||
reasoning_parser_obj=reasoning_parser_obj,
|
|
||||||
enable_processor_cache=self.enable_processor_cache,
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported model processor architecture: {architecture}. ")
|
raise ValueError(f"Unsupported model processor architecture: {architecture}. ")
|
||||||
|
|
||||||
|
self.processor = MultiModalProcessor(
|
||||||
|
model_name_or_path=self.model_name_or_path,
|
||||||
|
model_type=model_type,
|
||||||
|
config=self.model_config,
|
||||||
|
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||||
|
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||||
|
reasoning_parser_obj=reasoning_parser_obj,
|
||||||
|
tool_parser_obj=tool_parser_obj,
|
||||||
|
enable_processor_cache=self.enable_processor_cache,
|
||||||
|
)
|
||||||
|
|
||||||
return self.processor
|
return self.processor
|
||||||
|
|||||||
@@ -14,320 +14,10 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import List, Optional, Union
|
# Backward compatibility: this module has been migrated to
|
||||||
|
# fastdeploy.input.image_processors.qwen3_processor
|
||||||
|
# This file will be removed in a future version.
|
||||||
|
|
||||||
import numpy as np
|
from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
|
||||||
import paddle
|
ImageProcessor,
|
||||||
import PIL
|
|
||||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
|
||||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
|
||||||
from paddleformers.transformers.image_transforms import (
|
|
||||||
normalize,
|
|
||||||
rescale,
|
|
||||||
resize,
|
|
||||||
to_channel_dimension_format,
|
|
||||||
)
|
)
|
||||||
from paddleformers.transformers.image_utils import (
|
|
||||||
ChannelDimension,
|
|
||||||
ImageInput,
|
|
||||||
PILImageResampling,
|
|
||||||
get_image_size,
|
|
||||||
infer_channel_dimension_format,
|
|
||||||
make_list_of_images,
|
|
||||||
to_numpy_array,
|
|
||||||
valid_images,
|
|
||||||
)
|
|
||||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
|
||||||
from fastdeploy.utils import data_processor_logger
|
|
||||||
|
|
||||||
IMAGE_MEAN = [0.5, 0.5, 0.5]
|
|
||||||
IMAGE_STD = [0.5, 0.5, 0.5]
|
|
||||||
|
|
||||||
MIN_PIXELS = 65536
|
|
||||||
MAX_PIXELS = 16777216
|
|
||||||
|
|
||||||
|
|
||||||
VideoInput = Union[
|
|
||||||
List["PIL.Image.Image"],
|
|
||||||
"np.ndarray",
|
|
||||||
"paddle.Tensor",
|
|
||||||
List["np.ndarray"],
|
|
||||||
List["paddle.Tensor"],
|
|
||||||
List[List["PIL.Image.Image"]],
|
|
||||||
List[List["np.ndarray"]],
|
|
||||||
List[List["paddle.Tensor"]],
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class ImageProcessor(BaseImageProcessor):
|
|
||||||
"""
|
|
||||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
|
||||||
|
|
||||||
This processor handles image resizing, rescaling, normalization and format conversion.
|
|
||||||
It dynamically adjusts image dimensions based on original size and specified constraints.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
patch_size: int = 16,
|
|
||||||
merge_size: int = 2,
|
|
||||||
temporal_patch_size: int = 2,
|
|
||||||
min_pixels: int = MIN_PIXELS,
|
|
||||||
max_pixels: int = MAX_PIXELS,
|
|
||||||
image_mean: Union[float, List[float]] = IMAGE_MEAN,
|
|
||||||
image_std: Union[float, List[float]] = IMAGE_STD,
|
|
||||||
rescale_factor: float = 1 / 255,
|
|
||||||
do_rescale: bool = True,
|
|
||||||
do_normalize: bool = True,
|
|
||||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
|
||||||
**kwargs,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
Initialize image processor with configuration parameters.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
patch_size (int): Spatial patch size for vision encoder
|
|
||||||
merge_size (int): Merge size between vision and LLM encoders
|
|
||||||
temporal_patch_size (int): Temporal patch size for video processing
|
|
||||||
min_pixels (int): Minimum allowed pixels in resized image
|
|
||||||
max_pixels (int): Maximum allowed pixels in resized image
|
|
||||||
image_mean (float/list): Mean values for normalization per channel
|
|
||||||
image_std (float/list): Std values for normalization per channel
|
|
||||||
rescale_factor (float): Scaling factor for pixel values (default 1/255)
|
|
||||||
do_rescale (bool): Whether to rescale images
|
|
||||||
do_normalize (bool): Whether to normalize images
|
|
||||||
resample: Resampling method for image resizing
|
|
||||||
**kwargs: Additional base class arguments
|
|
||||||
"""
|
|
||||||
super().__init__(**kwargs)
|
|
||||||
self.patch_size = patch_size
|
|
||||||
self.merge_size = merge_size
|
|
||||||
self.temporal_patch_size = temporal_patch_size
|
|
||||||
|
|
||||||
self.min_pixels = min_pixels
|
|
||||||
self.max_pixels = max_pixels
|
|
||||||
|
|
||||||
self.image_mean = image_mean
|
|
||||||
self.image_std = image_std
|
|
||||||
self.rescale_factor = rescale_factor
|
|
||||||
self.do_rescale = do_rescale
|
|
||||||
self.do_normalize = do_normalize
|
|
||||||
|
|
||||||
self.resample = resample
|
|
||||||
|
|
||||||
def _preprocess(
|
|
||||||
self,
|
|
||||||
images: Union[ImageInput, VideoInput],
|
|
||||||
min_pixels: int,
|
|
||||||
max_pixels: int,
|
|
||||||
image_mean: Optional[Union[float, List[float]]],
|
|
||||||
image_std: Optional[Union[float, List[float]]],
|
|
||||||
rescale_factor: float,
|
|
||||||
do_rescale: bool,
|
|
||||||
do_normalize: bool,
|
|
||||||
resample: PILImageResampling,
|
|
||||||
data_format: Optional[ChannelDimension],
|
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]],
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Internal method for image preprocessing pipeline.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
images: Input image or batch of images
|
|
||||||
min_pixels: Minimum allowed pixels in output
|
|
||||||
max_pixels: Maximum allowed pixels in output
|
|
||||||
image_mean: Normalization mean values
|
|
||||||
image_std: Normalization std values
|
|
||||||
rescale_factor: Pixel value scaling factor
|
|
||||||
do_rescale: Whether to rescale pixel values
|
|
||||||
do_normalize: Whether to normalize pixel values
|
|
||||||
resample: Resampling method
|
|
||||||
data_format: Output channel format
|
|
||||||
input_data_format: Input channel format
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
tuple: (flatten_patches, grid_dimensions)
|
|
||||||
- flatten_patches: Flattened image patches
|
|
||||||
- grid_dimensions: Grid dimensions [t, h, w]
|
|
||||||
"""
|
|
||||||
images = make_list_of_images(images)
|
|
||||||
|
|
||||||
# All transformations expect numpy arrays.
|
|
||||||
images = [to_numpy_array(image) for image in images]
|
|
||||||
|
|
||||||
if is_scaled_image(images[0]) and do_rescale:
|
|
||||||
data_processor_logger.warning(
|
|
||||||
"It looks like you are trying to rescale already rescaled images. If the input"
|
|
||||||
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
|
||||||
)
|
|
||||||
if input_data_format is None:
|
|
||||||
# We assume that all images have the same channel dimension format.
|
|
||||||
input_data_format = infer_channel_dimension_format(images[0])
|
|
||||||
|
|
||||||
# Get original dimensions and calculate optimal resize dimensions
|
|
||||||
height, width = get_image_size(images[0], channel_dim=input_data_format)
|
|
||||||
resized_height, resized_width = smart_resize(
|
|
||||||
height,
|
|
||||||
width,
|
|
||||||
factor=self.patch_size * self.merge_size, # Combine patch and merge factors
|
|
||||||
min_pixels=min_pixels,
|
|
||||||
max_pixels=max_pixels,
|
|
||||||
)
|
|
||||||
|
|
||||||
processed_images = []
|
|
||||||
for image in images:
|
|
||||||
if height != resized_height or width != resized_width:
|
|
||||||
# Convert to uint8 before resizing to avoid double scaling
|
|
||||||
image = image.astype("uint8")
|
|
||||||
# Convert to PIL Image and resize
|
|
||||||
image = Image.fromarray(image)
|
|
||||||
image = resize(
|
|
||||||
image,
|
|
||||||
size=(resized_height, resized_width),
|
|
||||||
resample=resample,
|
|
||||||
data_format=input_data_format,
|
|
||||||
)
|
|
||||||
|
|
||||||
if do_rescale and do_normalize:
|
|
||||||
# Adjust mean and std for combined rescale+normalize
|
|
||||||
image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
|
|
||||||
image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
|
|
||||||
do_rescale = False # Skip separate rescale step
|
|
||||||
|
|
||||||
# mutual exclusion and upper branch
|
|
||||||
if do_rescale:
|
|
||||||
image = image.astype(np.float32)
|
|
||||||
image = rescale(image, scale=rescale_factor, data_format=input_data_format)
|
|
||||||
|
|
||||||
if do_normalize:
|
|
||||||
image = image.astype(np.float32)
|
|
||||||
image = normalize(
|
|
||||||
image=image,
|
|
||||||
mean=image_mean,
|
|
||||||
std=image_std,
|
|
||||||
data_format=input_data_format,
|
|
||||||
)
|
|
||||||
|
|
||||||
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
|
|
||||||
processed_images.append(image)
|
|
||||||
|
|
||||||
# Convert processed images to numpy array
|
|
||||||
patches = np.array(processed_images)
|
|
||||||
|
|
||||||
# Pad temporal dimension if needed
|
|
||||||
if patches.shape[0] % self.temporal_patch_size != 0:
|
|
||||||
repeats = np.repeat(
|
|
||||||
patches[-1][np.newaxis],
|
|
||||||
self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
|
|
||||||
axis=0,
|
|
||||||
)
|
|
||||||
patches = np.concatenate([patches, repeats], axis=0)
|
|
||||||
|
|
||||||
# Convert to channels-first format if needed
|
|
||||||
if data_format == ChannelDimension.LAST:
|
|
||||||
patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W]
|
|
||||||
|
|
||||||
grid_t, channel = patches.shape[:2]
|
|
||||||
grid_t = grid_t // self.temporal_patch_size
|
|
||||||
|
|
||||||
grid_h, grid_w = (
|
|
||||||
resized_height // self.patch_size,
|
|
||||||
resized_width // self.patch_size,
|
|
||||||
)
|
|
||||||
# Reshape into hierarchical patch structure
|
|
||||||
patches = patches.reshape(
|
|
||||||
[
|
|
||||||
grid_t,
|
|
||||||
self.temporal_patch_size,
|
|
||||||
channel,
|
|
||||||
grid_h // self.merge_size,
|
|
||||||
self.merge_size,
|
|
||||||
self.patch_size,
|
|
||||||
grid_w // self.merge_size,
|
|
||||||
self.merge_size,
|
|
||||||
self.patch_size,
|
|
||||||
]
|
|
||||||
)
|
|
||||||
# Reorder dimensions for better memory access pattern
|
|
||||||
# [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
|
|
||||||
patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
|
|
||||||
|
|
||||||
flatten_patches = patches.reshape(
|
|
||||||
[
|
|
||||||
grid_t * grid_h * grid_w,
|
|
||||||
channel * self.temporal_patch_size * self.patch_size * self.patch_size,
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
return flatten_patches, np.array([grid_t, grid_h, grid_w])
|
|
||||||
|
|
||||||
def preprocess(
|
|
||||||
self,
|
|
||||||
images: Union[ImageInput, VideoInput],
|
|
||||||
min_pixels: Optional[int] = None,
|
|
||||||
max_pixels: Optional[int] = None,
|
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
|
||||||
rescale_factor: Optional[float] = None,
|
|
||||||
do_rescale: Optional[bool] = None,
|
|
||||||
do_normalize: Optional[bool] = None,
|
|
||||||
resample: Optional[PILImageResampling] = None,
|
|
||||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
|
||||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Main preprocessing method for images/videos.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
images: Input image/video data
|
|
||||||
min_pixels: Override for minimum pixels
|
|
||||||
max_pixels: Override for maximum pixels
|
|
||||||
image_mean: Override for normalization mean
|
|
||||||
image_std: Override for normalization std
|
|
||||||
rescale_factor: Override for rescaling factor
|
|
||||||
do_rescale: Override for rescaling flag
|
|
||||||
do_normalize: Override for normalization flag
|
|
||||||
resample: Override for resampling method
|
|
||||||
return_tensors: Desired output tensor format
|
|
||||||
data_format: Output channel dimension format
|
|
||||||
input_data_format: Input channel dimension format
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
BatchFeature: Processed features containing:
|
|
||||||
- pixel_values: Preprocessed pixel data
|
|
||||||
- grid_thw: Grid dimensions [temporal, height, width]
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: For invalid image types or dimensions
|
|
||||||
"""
|
|
||||||
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
|
|
||||||
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
|
|
||||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
|
||||||
image_std = image_std if image_std is not None else self.image_std
|
|
||||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
|
||||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
|
||||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
|
||||||
resample = resample if resample is not None else self.resample
|
|
||||||
|
|
||||||
if images is not None and not valid_images(images):
|
|
||||||
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
|
|
||||||
|
|
||||||
pixel_values, grid_thw = self._preprocess(
|
|
||||||
images,
|
|
||||||
min_pixels=min_pixels,
|
|
||||||
max_pixels=max_pixels,
|
|
||||||
image_mean=image_mean,
|
|
||||||
image_std=image_std,
|
|
||||||
rescale_factor=rescale_factor,
|
|
||||||
do_rescale=do_rescale,
|
|
||||||
do_normalize=do_normalize,
|
|
||||||
resample=resample,
|
|
||||||
data_format=data_format,
|
|
||||||
input_data_format=input_data_format,
|
|
||||||
)
|
|
||||||
data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
|
|
||||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
|
||||||
|
|||||||
@@ -14,319 +14,10 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import List, Optional, Union
|
# Backward compatibility: this module has been migrated to
|
||||||
|
# fastdeploy.input.image_processors.qwen_processor
|
||||||
|
# This file will be removed in a future version.
|
||||||
|
|
||||||
import numpy as np
|
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
|
||||||
import paddle
|
ImageProcessor,
|
||||||
import PIL
|
|
||||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
|
||||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
|
||||||
from paddleformers.transformers.image_transforms import (
|
|
||||||
normalize,
|
|
||||||
rescale,
|
|
||||||
resize,
|
|
||||||
to_channel_dimension_format,
|
|
||||||
)
|
)
|
||||||
from paddleformers.transformers.image_utils import (
|
|
||||||
ChannelDimension,
|
|
||||||
ImageInput,
|
|
||||||
PILImageResampling,
|
|
||||||
get_image_size,
|
|
||||||
infer_channel_dimension_format,
|
|
||||||
make_list_of_images,
|
|
||||||
to_numpy_array,
|
|
||||||
valid_images,
|
|
||||||
)
|
|
||||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
|
||||||
from fastdeploy.utils import data_processor_logger
|
|
||||||
|
|
||||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
|
||||||
OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
|
||||||
|
|
||||||
MIN_PIXELS = 4 * 28 * 28
|
|
||||||
MAX_PIXELS = 16384 * 28 * 28
|
|
||||||
|
|
||||||
|
|
||||||
VideoInput = Union[
|
|
||||||
List["PIL.Image.Image"],
|
|
||||||
"np.ndarray",
|
|
||||||
"paddle.Tensor",
|
|
||||||
List["np.ndarray"],
|
|
||||||
List["paddle.Tensor"],
|
|
||||||
List[List["PIL.Image.Image"]],
|
|
||||||
List[List["np.ndarray"]],
|
|
||||||
List[List["paddle.Tensor"]],
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class ImageProcessor(BaseImageProcessor):
|
|
||||||
"""
|
|
||||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
|
||||||
|
|
||||||
This processor handles image resizing, rescaling, normalization and format conversion.
|
|
||||||
It dynamically adjusts image dimensions based on original size and specified constraints.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
patch_size: int = 14,
|
|
||||||
merge_size: int = 2,
|
|
||||||
temporal_patch_size: int = 2,
|
|
||||||
min_pixels: int = MIN_PIXELS,
|
|
||||||
max_pixels: int = MAX_PIXELS,
|
|
||||||
image_mean: Union[float, List[float]] = OPENAI_CLIP_MEAN,
|
|
||||||
image_std: Union[float, List[float]] = OPENAI_CLIP_STD,
|
|
||||||
rescale_factor: float = 1 / 255,
|
|
||||||
do_rescale: bool = True,
|
|
||||||
do_normalize: bool = True,
|
|
||||||
resample: PILImageResampling = PILImageResampling.BICUBIC,
|
|
||||||
**kwargs,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
Initialize image processor with configuration parameters.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
patch_size (int): Spatial patch size for vision encoder
|
|
||||||
merge_size (int): Merge size between vision and LLM encoders
|
|
||||||
temporal_patch_size (int): Temporal patch size for video processing
|
|
||||||
min_pixels (int): Minimum allowed pixels in resized image
|
|
||||||
max_pixels (int): Maximum allowed pixels in resized image
|
|
||||||
image_mean (float/list): Mean values for normalization per channel
|
|
||||||
image_std (float/list): Std values for normalization per channel
|
|
||||||
rescale_factor (float): Scaling factor for pixel values (default 1/255)
|
|
||||||
do_rescale (bool): Whether to rescale images
|
|
||||||
do_normalize (bool): Whether to normalize images
|
|
||||||
resample: Resampling method for image resizing
|
|
||||||
**kwargs: Additional base class arguments
|
|
||||||
"""
|
|
||||||
super().__init__(**kwargs)
|
|
||||||
self.patch_size = patch_size
|
|
||||||
self.merge_size = merge_size
|
|
||||||
self.temporal_patch_size = temporal_patch_size
|
|
||||||
|
|
||||||
self.min_pixels = min_pixels
|
|
||||||
self.max_pixels = max_pixels
|
|
||||||
|
|
||||||
self.image_mean = image_mean
|
|
||||||
self.image_std = image_std
|
|
||||||
self.rescale_factor = rescale_factor
|
|
||||||
self.do_rescale = do_rescale
|
|
||||||
self.do_normalize = do_normalize
|
|
||||||
|
|
||||||
self.resample = resample
|
|
||||||
|
|
||||||
def _preprocess(
|
|
||||||
self,
|
|
||||||
images: Union[ImageInput, VideoInput],
|
|
||||||
min_pixels: int,
|
|
||||||
max_pixels: int,
|
|
||||||
image_mean: Optional[Union[float, List[float]]],
|
|
||||||
image_std: Optional[Union[float, List[float]]],
|
|
||||||
rescale_factor: float,
|
|
||||||
do_rescale: bool,
|
|
||||||
do_normalize: bool,
|
|
||||||
resample: PILImageResampling,
|
|
||||||
data_format: Optional[ChannelDimension],
|
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]],
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Internal method for image preprocessing pipeline.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
images: Input image or batch of images
|
|
||||||
min_pixels: Minimum allowed pixels in output
|
|
||||||
max_pixels: Maximum allowed pixels in output
|
|
||||||
image_mean: Normalization mean values
|
|
||||||
image_std: Normalization std values
|
|
||||||
rescale_factor: Pixel value scaling factor
|
|
||||||
do_rescale: Whether to rescale pixel values
|
|
||||||
do_normalize: Whether to normalize pixel values
|
|
||||||
resample: Resampling method
|
|
||||||
data_format: Output channel format
|
|
||||||
input_data_format: Input channel format
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
tuple: (flatten_patches, grid_dimensions)
|
|
||||||
- flatten_patches: Flattened image patches
|
|
||||||
- grid_dimensions: Grid dimensions [t, h, w]
|
|
||||||
"""
|
|
||||||
images = make_list_of_images(images)
|
|
||||||
|
|
||||||
# All transformations expect numpy arrays.
|
|
||||||
images = [to_numpy_array(image) for image in images]
|
|
||||||
|
|
||||||
if is_scaled_image(images[0]) and do_rescale:
|
|
||||||
data_processor_logger.warning(
|
|
||||||
"It looks like you are trying to rescale already rescaled images. If the input"
|
|
||||||
" images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
|
|
||||||
)
|
|
||||||
if input_data_format is None:
|
|
||||||
# We assume that all images have the same channel dimension format.
|
|
||||||
input_data_format = infer_channel_dimension_format(images[0])
|
|
||||||
|
|
||||||
# Get original dimensions and calculate optimal resize dimensions
|
|
||||||
height, width = get_image_size(images[0], channel_dim=input_data_format)
|
|
||||||
resized_height, resized_width = smart_resize(
|
|
||||||
height,
|
|
||||||
width,
|
|
||||||
factor=self.patch_size * self.merge_size, # Combine patch and merge factors
|
|
||||||
min_pixels=min_pixels,
|
|
||||||
max_pixels=max_pixels,
|
|
||||||
)
|
|
||||||
|
|
||||||
processed_images = []
|
|
||||||
for image in images:
|
|
||||||
if height != resized_height or width != resized_width:
|
|
||||||
# Convert to uint8 before resizing to avoid double scaling
|
|
||||||
image = image.astype("uint8")
|
|
||||||
# Convert to PIL Image and resize
|
|
||||||
image = Image.fromarray(image)
|
|
||||||
image = resize(
|
|
||||||
image,
|
|
||||||
size=(resized_height, resized_width),
|
|
||||||
resample=resample,
|
|
||||||
data_format=input_data_format,
|
|
||||||
)
|
|
||||||
|
|
||||||
if do_rescale and do_normalize:
|
|
||||||
# Adjust mean and std for combined rescale+normalize
|
|
||||||
image_mean = np.array(image_mean, dtype=np.float32) * (1.0 / rescale_factor)
|
|
||||||
image_std = np.array(image_std, dtype=np.float32) * (1.0 / rescale_factor)
|
|
||||||
do_rescale = False # Skip separate rescale step
|
|
||||||
|
|
||||||
if do_rescale:
|
|
||||||
image = image.astype(np.float32)
|
|
||||||
image = rescale(image, scale=rescale_factor, data_format=input_data_format)
|
|
||||||
|
|
||||||
if do_normalize:
|
|
||||||
image = image.astype(np.float32)
|
|
||||||
image = normalize(
|
|
||||||
image=image,
|
|
||||||
mean=image_mean,
|
|
||||||
std=image_std,
|
|
||||||
data_format=input_data_format,
|
|
||||||
)
|
|
||||||
|
|
||||||
image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W]
|
|
||||||
processed_images.append(image)
|
|
||||||
|
|
||||||
# Convert processed images to numpy array
|
|
||||||
patches = np.array(processed_images)
|
|
||||||
|
|
||||||
# Pad temporal dimension if needed
|
|
||||||
if patches.shape[0] % self.temporal_patch_size != 0:
|
|
||||||
repeats = np.repeat(
|
|
||||||
patches[-1][np.newaxis],
|
|
||||||
self.temporal_patch_size - (patches.shape[0] % self.temporal_patch_size),
|
|
||||||
axis=0,
|
|
||||||
)
|
|
||||||
patches = np.concatenate([patches, repeats], axis=0)
|
|
||||||
|
|
||||||
# Convert to channels-first format if needed
|
|
||||||
if data_format == ChannelDimension.LAST:
|
|
||||||
patches = patches.transpose([0, 3, 1, 2]) # [N, H, W, C] -> [N, C, H, W]
|
|
||||||
|
|
||||||
grid_t, channel = patches.shape[:2]
|
|
||||||
grid_t = grid_t // self.temporal_patch_size
|
|
||||||
|
|
||||||
grid_h, grid_w = (
|
|
||||||
resized_height // self.patch_size,
|
|
||||||
resized_width // self.patch_size,
|
|
||||||
)
|
|
||||||
# Reshape into hierarchical patch structure
|
|
||||||
patches = patches.reshape(
|
|
||||||
[
|
|
||||||
grid_t,
|
|
||||||
self.temporal_patch_size,
|
|
||||||
channel,
|
|
||||||
grid_h // self.merge_size,
|
|
||||||
self.merge_size,
|
|
||||||
self.patch_size,
|
|
||||||
grid_w // self.merge_size,
|
|
||||||
self.merge_size,
|
|
||||||
self.patch_size,
|
|
||||||
]
|
|
||||||
)
|
|
||||||
# Reorder dimensions for better memory access pattern
|
|
||||||
# [grid_t, grid_h/merge_size, grid_w/merge_size, merge_size, merge_size, C, temporal_patch_size, psz, psz]
|
|
||||||
patches = patches.transpose([0, 3, 6, 4, 7, 2, 1, 5, 8])
|
|
||||||
|
|
||||||
flatten_patches = patches.reshape(
|
|
||||||
[
|
|
||||||
grid_t * grid_h * grid_w,
|
|
||||||
channel * self.temporal_patch_size * self.patch_size * self.patch_size,
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
return flatten_patches, np.array([grid_t, grid_h, grid_w])
|
|
||||||
|
|
||||||
def preprocess(
|
|
||||||
self,
|
|
||||||
images: Union[ImageInput, VideoInput],
|
|
||||||
min_pixels: Optional[int] = None,
|
|
||||||
max_pixels: Optional[int] = None,
|
|
||||||
image_mean: Optional[Union[float, List[float]]] = None,
|
|
||||||
image_std: Optional[Union[float, List[float]]] = None,
|
|
||||||
rescale_factor: Optional[float] = None,
|
|
||||||
do_rescale: Optional[bool] = None,
|
|
||||||
do_normalize: Optional[bool] = None,
|
|
||||||
resample: Optional[PILImageResampling] = None,
|
|
||||||
return_tensors: Optional[Union[str, TensorType]] = None,
|
|
||||||
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
|
|
||||||
input_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.LAST,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Main preprocessing method for images/videos.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
images: Input image/video data
|
|
||||||
min_pixels: Override for minimum pixels
|
|
||||||
max_pixels: Override for maximum pixels
|
|
||||||
image_mean: Override for normalization mean
|
|
||||||
image_std: Override for normalization std
|
|
||||||
rescale_factor: Override for rescaling factor
|
|
||||||
do_rescale: Override for rescaling flag
|
|
||||||
do_normalize: Override for normalization flag
|
|
||||||
resample: Override for resampling method
|
|
||||||
return_tensors: Desired output tensor format
|
|
||||||
data_format: Output channel dimension format
|
|
||||||
input_data_format: Input channel dimension format
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
BatchFeature: Processed features containing:
|
|
||||||
- pixel_values: Preprocessed pixel data
|
|
||||||
- grid_thw: Grid dimensions [temporal, height, width]
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: For invalid image types or dimensions
|
|
||||||
"""
|
|
||||||
min_pixels = min_pixels if min_pixels is not None else self.min_pixels
|
|
||||||
max_pixels = max_pixels if max_pixels is not None else self.max_pixels
|
|
||||||
image_mean = image_mean if image_mean is not None else self.image_mean
|
|
||||||
image_std = image_std if image_std is not None else self.image_std
|
|
||||||
rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
|
|
||||||
do_rescale = do_rescale if do_rescale is not None else self.do_rescale
|
|
||||||
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
|
||||||
resample = resample if resample is not None else self.resample
|
|
||||||
|
|
||||||
if images is not None and not valid_images(images):
|
|
||||||
raise ValueError("Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " "paddle.Tensor.")
|
|
||||||
|
|
||||||
pixel_values, grid_thw = self._preprocess(
|
|
||||||
images,
|
|
||||||
min_pixels=min_pixels,
|
|
||||||
max_pixels=max_pixels,
|
|
||||||
image_mean=image_mean,
|
|
||||||
image_std=image_std,
|
|
||||||
rescale_factor=rescale_factor,
|
|
||||||
do_rescale=do_rescale,
|
|
||||||
do_normalize=do_normalize,
|
|
||||||
resample=resample,
|
|
||||||
data_format=data_format,
|
|
||||||
input_data_format=input_data_format,
|
|
||||||
)
|
|
||||||
data = {"pixel_values": pixel_values, "grid_thw": grid_thw}
|
|
||||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
|
||||||
|
|||||||
@@ -340,9 +340,7 @@ class TestImagePreprocessorAdaptive(unittest.TestCase):
|
|||||||
# Create a scaled image (values between 0-1)
|
# Create a scaled image (values between 0-1)
|
||||||
img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5
|
img_array = np.random.rand(224, 224, 3).astype(np.float32) * 0.5
|
||||||
# Use patch to capture warning
|
# Use patch to capture warning
|
||||||
with patch(
|
with patch("fastdeploy.input.image_processors.adaptive_processor.data_processor_logger") as mock_logger:
|
||||||
"fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.data_processor_logger"
|
|
||||||
) as mock_logger:
|
|
||||||
# Directly call _preprocess, pass scaled image
|
# Directly call _preprocess, pass scaled image
|
||||||
self.processor._preprocess(
|
self.processor._preprocess(
|
||||||
[img_array], # Pass scaled numpy array
|
[img_array], # Pass scaled numpy array
|
||||||
@@ -356,9 +354,7 @@ class TestImagePreprocessorAdaptive(unittest.TestCase):
|
|||||||
"""Test invalid image check in preprocess (line 464)"""
|
"""Test invalid image check in preprocess (line 464)"""
|
||||||
# Test invalid image type - need to ensure valid_images returns False
|
# Test invalid image type - need to ensure valid_images returns False
|
||||||
# Use patch to make valid_images return False, but make_batched_images succeeds
|
# Use patch to make valid_images return False, but make_batched_images succeeds
|
||||||
with patch(
|
with patch("fastdeploy.input.image_processors.adaptive_processor.valid_images") as mock_valid:
|
||||||
"fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive.valid_images"
|
|
||||||
) as mock_valid:
|
|
||||||
mock_valid.return_value = False
|
mock_valid.return_value = False
|
||||||
valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False
|
valid_images_list = [Image.new("RGB", (224, 224))] # Valid image, but valid_images returns False
|
||||||
with self.assertRaises(ValueError) as context:
|
with self.assertRaises(ValueError) as context:
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user