diff --git a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py index 6dcdf3a4e9..cd81274654 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py +++ b/fastdeploy/input/ernie4_5_vl_processor/image_preprocessor/image_preprocessor_adaptive.py @@ -16,7 +16,6 @@ """image preprocessor adaptive""" -import math from typing import List, Optional, Union import numpy as np @@ -45,6 +44,8 @@ from paddleformers.transformers.image_utils import ( from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType from PIL import Image +from fastdeploy.input.image_processors.common import is_scaled_image +from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize from fastdeploy.utils import data_processor_logger OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] @@ -73,22 +74,9 @@ __all__ = [ ] -def is_scaled_image(image: np.ndarray) -> bool: - """ - Checks to see whether the pixel values have already been rescaled to [0, 1]. - """ - if image.dtype == np.uint8: - return False - - # It's possible the image has pixel values in [0, 255] but is of floating type - return np.min(image) >= 0 and np.max(image) <= 1 - - def make_batched_images(images) -> List[List[ImageInput]]: """ Accepts images in list or nested list format, and makes a list of images for preprocessing. - - Args: images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): The input image. 
@@ -521,67 +509,3 @@ class AdaptiveImageProcessor(BaseImageProcessor): } return BatchFeature(data=data, tensor_type=return_tensors) - - -def round_by_factor(number: int, factor: int) -> int: - """Returns the closest integer to 'number' that is divisible by 'factor'.""" - return round(number / factor) * factor - - -def ceil_by_factor(number: int, factor: int) -> int: - """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.""" - return math.ceil(number / factor) * factor - - -def floor_by_factor(number: int, factor: int) -> int: - """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'.""" - return math.floor(number / factor) * factor - - -def smart_resize( - height: int, - width: int, - factor: int = IMAGE_FACTOR, - min_pixels: int = MIN_PIXELS, - max_pixels: int = MAX_PIXELS, -): - """ - Rescales the image so that the following conditions are met: - - 1. Both dimensions (height and width) are divisible by 'factor'. - - 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. - - 3. The aspect ratio of the image is maintained as closely as possible. 
- """ - if max(height, width) / min(height, width) > MAX_RATIO: - if height > width: - new_width = max(factor, round_by_factor(width, factor)) - new_height = floor_by_factor(new_width * MAX_RATIO, factor) - else: - new_height = max(factor, round_by_factor(height, factor)) - new_width = floor_by_factor(new_height * MAX_RATIO, factor) - - data_processor_logger.info( - f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\ - resize to {max(new_height, new_width) / min(new_height, new_width)}" - ) - - height = new_height - width = new_width - - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = floor_by_factor(height / beta, factor) - w_bar = floor_by_factor(width / beta, factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(height * beta, factor) - w_bar = ceil_by_factor(width * beta, factor) - - if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels: - raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}") - - return h_bar, w_bar diff --git a/fastdeploy/input/image_processors/__init__.py b/fastdeploy/input/image_processors/__init__.py new file mode 100644 index 0000000000..a9cc79cc9d --- /dev/null +++ b/fastdeploy/input/image_processors/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/fastdeploy/input/image_processors/common.py b/fastdeploy/input/image_processors/common.py new file mode 100644 index 0000000000..656815fa0b --- /dev/null +++ b/fastdeploy/input/image_processors/common.py @@ -0,0 +1,208 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Shared image utility functions for all VL image processors.""" + +import math + +import numpy as np + +from fastdeploy.utils import data_processor_logger + +__all__ = [ + "round_by_factor", + "ceil_by_factor", + "floor_by_factor", + "is_scaled_image", + "smart_resize", + "smart_resize_qwen", + "smart_resize_paddleocr", +] + + +def round_by_factor(number: int, factor: int) -> int: + """Returns the closest integer to 'number' that is divisible by 'factor'.""" + return round(number / factor) * factor + + +def ceil_by_factor(number: int, factor: int) -> int: + """Returns the smallest integer >= 'number' that is divisible by 'factor'.""" + return math.ceil(number / factor) * factor + + +def floor_by_factor(number: int, factor: int) -> int: + """Returns the largest integer <= 'number' that is divisible by 'factor'.""" + return math.floor(number / factor) * factor + + +def is_scaled_image(image: np.ndarray) -> bool: + """Check if image pixel values are already normalized to [0, 1] range. + + Args: + image: Input image array. 
+ + Returns: + bool: True if image is already scaled to [0, 1]. + """ + if image.dtype == np.uint8: + return False + # It's possible the image has pixel values in [0, 255] but is of floating type + return np.min(image) >= 0 and np.max(image) <= 1 + + +def smart_resize_qwen( + height: int, + width: int, + factor: int, + min_pixels: int, + max_pixels: int, + max_ratio: int = 200, +) -> tuple: + """Smart image resizing for ERNIE / Qwen2.5 / Qwen3 models. + + Maintains aspect ratio and respects pixel constraints. When the aspect ratio + exceeds max_ratio, the image is cropped (not raised as error) to fit within + the ratio limit. + + Args: + height: Original image height. + width: Original image width. + factor: Patch size factor; both output dimensions will be multiples of this. + min_pixels: Minimum allowed total pixels. + max_pixels: Maximum allowed total pixels. + max_ratio: Maximum allowed aspect ratio (default 200). + + Returns: + tuple: (new_height, new_width) + + Raises: + ValueError: If calculated dimensions are still invalid after resizing. 
+ """ + if max(height, width) / min(height, width) > max_ratio: + if height > width: + new_width = max(factor, round_by_factor(width, factor)) + new_height = floor_by_factor(new_width * max_ratio, factor) + else: + new_height = max(factor, round_by_factor(height, factor)) + new_width = floor_by_factor(new_height * max_ratio, factor) + + data_processor_logger.info( + f"absolute aspect ratio must be smaller than {max_ratio}, " + f"got {max(height, width) / min(height, width)}, " + f"resize to {max(new_height, new_width) / min(new_height, new_width)}" + ) + height = new_height + width = new_width + + h_bar = max(factor, round_by_factor(height, factor)) + w_bar = max(factor, round_by_factor(width, factor)) + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = floor_by_factor(height / beta, factor) + w_bar = floor_by_factor(width / beta, factor) + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = ceil_by_factor(height * beta, factor) + w_bar = ceil_by_factor(width * beta, factor) + + if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels: + raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}") + + return h_bar, w_bar + + +def smart_resize_paddleocr( + height: int, + width: int, + factor: int = 28, + min_pixels: int = 28 * 28 * 130, + max_pixels: int = 28 * 28 * 1280, +) -> tuple: + """Smart image resizing for PaddleOCR-VL model. + + Similar to smart_resize_qwen but adds small-image protection: if height or + width is smaller than factor, the image is scaled up to factor first. Also, + when aspect ratio exceeds 200 this function raises ValueError (instead of + silently cropping like the qwen variant). + + Args: + height: Original image height. + width: Original image width. + factor: Patch size factor; both output dimensions will be multiples of this. + min_pixels: Minimum allowed total pixels. + max_pixels: Maximum allowed total pixels. 
+ + Returns: + tuple: (new_height, new_width) + + Raises: + ValueError: If aspect ratio exceeds 200, or calculated dimensions are invalid. + """ + if height < factor: + data_processor_logger.debug(f"smart_resize_paddleocr: height={height} < factor={factor}, reset height=factor") + width = round((width * factor) / height) + height = factor + + if width < factor: + data_processor_logger.debug(f"smart_resize_paddleocr: width={width} < factor={factor}, reset width=factor") + height = round((height * factor) / width) + width = factor + + if max(height, width) / min(height, width) > 200: + raise ValueError( + f"absolute aspect ratio must be smaller than 200, " f"got {max(height, width) / min(height, width)}" + ) + + h_bar = round(height / factor) * factor + w_bar = round(width / factor) * factor + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = math.floor(height / beta / factor) * factor + w_bar = math.floor(width / beta / factor) * factor + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = math.ceil(height * beta / factor) * factor + w_bar = math.ceil(width * beta / factor) * factor + + return h_bar, w_bar + + +def smart_resize( + height: int, + width: int, + factor: int, + min_pixels: int, + max_pixels: int, + max_ratio: int = 200, + variant: str = "qwen", +) -> tuple: + """Unified smart_resize dispatcher. + + Args: + height: Original image height. + width: Original image width. + factor: Patch size factor. + min_pixels: Minimum allowed total pixels. + max_pixels: Maximum allowed total pixels. + max_ratio: Maximum allowed aspect ratio (only used by "qwen" variant). + variant: Which algorithm variant to use. + - "qwen" (default): for ERNIE / Qwen2.5 / Qwen3. Clips extreme ratios silently. + - "paddleocr": for PaddleOCR-VL. Adds small-image protection, raises on bad ratio. 
+ + Returns: + tuple: (new_height, new_width) + """ + if variant == "paddleocr": + return smart_resize_paddleocr(height, width, factor, min_pixels, max_pixels) + return smart_resize_qwen(height, width, factor, min_pixels, max_pixels, max_ratio) diff --git a/fastdeploy/input/paddleocr_vl_processor/image_processor.py b/fastdeploy/input/paddleocr_vl_processor/image_processor.py index 8e333d5bf9..a6e318e1ed 100644 --- a/fastdeploy/input/paddleocr_vl_processor/image_processor.py +++ b/fastdeploy/input/paddleocr_vl_processor/image_processor.py @@ -19,8 +19,6 @@ # TODO: Support videos import json -import logging -import math from pathlib import Path from typing import Dict, List, Optional, Union @@ -34,6 +32,10 @@ from paddleformers.transformers.image_utils import ( to_numpy_array, ) +from fastdeploy.input.image_processors.common import ( + smart_resize_paddleocr as smart_resize, +) + _OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] _OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] @@ -68,54 +70,6 @@ def adjust_size(size, patch_size): return num_patches * patch_size -def smart_resize( - height: int, - width: int, - factor: int = 28, - min_pixels: int = 28 * 28 * 130, - max_pixels: int = 28 * 28 * 1280, -): - """Rescales the image so that the following conditions are met: - - 1. Both dimensions (height and width) are divisible by 'factor'. - - 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. - - 3. The aspect ratio of the image is maintained as closely as possible. 
- - """ - # if height < factor or width < factor: - # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}") - # if int(height < factor//4) + int(width < factor//4): - # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}") - - if height < factor: - logging.debug(f"smart_resize: height={height} < factor={factor}, reset height=factor") - width = round((width * factor) / height) - height = factor - - if width < factor: - logging.debug(f"smart_resize: width={width} < factor={factor}, reset width=factor") - height = round((height * factor) / width) - width = factor - - if max(height, width) / min(height, width) > 200: - raise ValueError( - f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}" - ) - h_bar = round(height / factor) * factor - w_bar = round(width / factor) * factor - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = math.floor(height / beta / factor) * factor - w_bar = math.floor(width / beta / factor) * factor - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = math.ceil(height * beta / factor) * factor - w_bar = math.ceil(width * beta / factor) * factor - return h_bar, w_bar - - class ImageProcessor(BaseImageProcessor): model_input_names = [ "pixel_values", diff --git a/fastdeploy/input/paddleocr_vl_processor/process.py b/fastdeploy/input/paddleocr_vl_processor/process.py index 8090abff12..58f51a9aa1 100644 --- a/fastdeploy/input/paddleocr_vl_processor/process.py +++ b/fastdeploy/input/paddleocr_vl_processor/process.py @@ -26,14 +26,14 @@ from PIL import Image from fastdeploy.engine.request import ImagePosition from fastdeploy.entrypoints.chat_utils import parse_chat_messages -from fastdeploy.input.ernie4_5_vl_processor import read_video_decord from fastdeploy.input.mm_data_processor import MMBaseDataProcessor from fastdeploy.input.utils import 
IDS_TYPE_FLAG +from fastdeploy.input.video_utils import read_video_decord +from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames from fastdeploy.multimodal.hasher import MultimodalHasher from fastdeploy.utils import data_processor_logger from .image_processor import ImageProcessor -from .process_video import sample_frames class DataProcessor(MMBaseDataProcessor): diff --git a/fastdeploy/input/paddleocr_vl_processor/process_video.py b/fastdeploy/input/paddleocr_vl_processor/process_video.py deleted file mode 100644 index c7089d26dc..0000000000 --- a/fastdeploy/input/paddleocr_vl_processor/process_video.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import math -from typing import Optional, Union - -import numpy as np - - -def sample_frames( - frame_factor: int, - min_frames: int, - max_frames: int, - metadata: Optional[dict] = None, - fps: Optional[Union[int, float]] = None, - num_frames: Optional[int] = None, -): - """ - Sample frames from video according to specified criteria. 
- - Args: - frame_factor: Ensure sampled frames are multiples of this factor - min_frames: Minimum number of frames to sample - max_frames: Maximum number of frames to sample - metadata: Video metadata containing fps information - fps: Target frames per second for sampling - num_frames: Exact number of frames to sample - - Returns: - np.ndarray: Sampled video frames - - Raises: - ValueError: If both fps and num_frames are specified, - or if required metadata is missing, - or if requested frames exceed available frames - """ - if fps > 0 and num_frames > 0: - raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!") - - total_num_frames = metadata["num_of_frame"] - - # If num_frames is not given but fps is, calculate num_frames from fps - if num_frames > 0: - num_frames = round(num_frames / frame_factor) * frame_factor - elif fps > 0: - if metadata is None: - raise ValueError( - "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. " - "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video" - ) - max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor - num_frames = total_num_frames / metadata["fps"] * fps - num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames) - num_frames = math.floor(num_frames / frame_factor) * frame_factor - if num_frames > total_num_frames: - raise ValueError( - f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. " - "Decrease `num_frames` or `fps` for sampling." 
- ) - - # Calculate frame indices based on sampling strategy - if num_frames > 0: - # Evenly spaced sampling for target frame count - indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32) - else: - # Keep all frames if no sampling requested - indices = np.arange(0, total_num_frames).astype(np.int32) - - return indices diff --git a/fastdeploy/input/qwen3_vl_processor/image_processor.py b/fastdeploy/input/qwen3_vl_processor/image_processor.py index 167f3e340d..5927a0f969 100644 --- a/fastdeploy/input/qwen3_vl_processor/image_processor.py +++ b/fastdeploy/input/qwen3_vl_processor/image_processor.py @@ -14,7 +14,6 @@ # limitations under the License. """ -import math from typing import List, Optional, Union import numpy as np @@ -41,6 +40,7 @@ from paddleformers.transformers.image_utils import ( from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType from PIL import Image +from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize from fastdeploy.utils import data_processor_logger IMAGE_MEAN = [0.5, 0.5, 0.5] @@ -62,86 +62,6 @@ VideoInput = Union[ ] -def round_by_factor(number: int, factor: int) -> int: - return round(number / factor) * factor - - -def ceil_by_factor(number: int, factor: int) -> int: - return math.ceil(number / factor) * factor - - -def floor_by_factor(number: int, factor: int) -> int: - return math.floor(number / factor) * factor - - -def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200): - """ - Smart image resizing that maintains aspect ratio and respects constraints. 
- - Args: - height: Original image height - width: Original image width - factor: Patch size factor - min_pixels: Minimum allowed pixels - max_pixels: Maximum allowed pixels - max_ratio: Maximum allowed aspect ratio - - Returns: - tuple: (new_height, new_width) - - Raises: - ValueError: If calculated dimensions are invalid - """ - if max(height, width) / min(height, width) > max_ratio: - if height > width: - new_width = max(factor, round_by_factor(width, factor)) - new_height = floor_by_factor(new_width * max_ratio, factor) - else: - new_height = max(factor, round_by_factor(height, factor)) - new_width = floor_by_factor(new_height * max_ratio, factor) - - data_processor_logger.info( - f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\ - resize to {max(new_height, new_width) / min(new_height, new_width)}" - ) - - height = new_height - width = new_width - - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / max_pixels) - h_bar = floor_by_factor(height / beta, factor) - w_bar = floor_by_factor(width / beta, factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(height * beta, factor) - w_bar = ceil_by_factor(width * beta, factor) - - if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels: - raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}") - - return h_bar, w_bar - - -def is_scaled_image(image: np.ndarray) -> bool: - """ - Check if image pixel values are already normalized to [0, 1] range. 
- - Args: - image: Input image array - - Returns: - bool: True if image is already scaled - """ - if image.dtype == np.uint8: - return False - - # It's possible the image has pixel values in [0, 255] but is of floating type - return np.min(image) >= 0 and np.max(image) <= 1 - - class ImageProcessor(BaseImageProcessor): """ Adaptive image processor for dynamic image resizing and preprocessing. diff --git a/fastdeploy/input/qwen3_vl_processor/process.py b/fastdeploy/input/qwen3_vl_processor/process.py index 350ebb57de..994ec51291 100644 --- a/fastdeploy/input/qwen3_vl_processor/process.py +++ b/fastdeploy/input/qwen3_vl_processor/process.py @@ -26,13 +26,14 @@ from PIL import Image from fastdeploy.engine.request import ImagePosition from fastdeploy.entrypoints.chat_utils import parse_chat_messages -from fastdeploy.input.ernie4_5_vl_processor import read_video_decord from fastdeploy.input.mm_data_processor import MMBaseDataProcessor from fastdeploy.input.utils import IDS_TYPE_FLAG +from fastdeploy.input.video_utils import read_video_decord +from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames from fastdeploy.multimodal.hasher import MultimodalHasher from fastdeploy.utils import data_processor_logger -from .image_processor import ImageProcessor, ceil_by_factor, floor_by_factor +from .image_processor import ImageProcessor VIDEO_MIN_PIXELS = 128 * 28 * 28 VIDEO_MAX_PIXELS = 768 * 28 * 28 @@ -42,83 +43,6 @@ FPS_MIN_FRAMES = 4 FPS_MAX_FRAMES = 768 -def sample_frames( - frame_factor: int, - min_frames: int, - max_frames: int, - metadata: Optional[dict] = None, - fps: Optional[Union[int, float]] = -1, - num_frames: Optional[int] = -1, -): - """ - Sample frames from video according to specified criteria. 
- - Args: - frame_factor: Ensure sampled frames are multiples of this factor - min_frames: Minimum number of frames to sample - max_frames: Maximum number of frames to sample - metadata: Video metadata containing fps information - fps: Target frames per second for sampling - num_frames: Exact number of frames to sample - - Returns: - np.ndarray: Sampled video frames - - Raises: - ValueError: If both fps and num_frames are specified, - or if required metadata is missing, - or if requested frames exceed available frames - """ - if fps > 0 and num_frames > 0: - raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!") - - total_num_frames = metadata["num_of_frame"] - - # If num_frames is not given but fps is, calculate num_frames from fps - if num_frames > 0: - num_frames = round(num_frames / frame_factor) * frame_factor - elif fps > 0: - if metadata is None: - raise ValueError( - "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. " - "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video" - ) - # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor - min_frames = ceil_by_factor(min_frames, frame_factor) - max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor) - - num_frames = total_num_frames / metadata["fps"] * fps - - if num_frames > total_num_frames: - data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]") - - num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames) - num_frames = floor_by_factor(num_frames, frame_factor) - - if num_frames > total_num_frames: - raise ValueError( - f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. " - "Decrease `num_frames` or `fps` for sampling." 
- ) - - # Hack code ensures that num_frames can always be divided by 4 - # due to sched/resource_manager_v1.py 中 grid_thw.extend([[2, h, w]] * (t // 2)) - if num_frames > 2 and num_frames % 4 != 0: - num_frames = (num_frames // 4) * 4 # 向下取整到 4 的倍数 - total_num_frames = (total_num_frames // 4) * 4 - num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames) - - # Calculate frame indices based on sampling strategy - if num_frames > 0: - # Evenly spaced sampling for target frame count - indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32) - else: - # Keep all frames if no sampling requested - indices = np.arange(0, total_num_frames).astype(np.int32) - - return indices - - class DataProcessor(MMBaseDataProcessor): """ Processes multimodal inputs (text, images, videos) into model-ready formats. diff --git a/fastdeploy/input/qwen_vl_processor/image_processor.py b/fastdeploy/input/qwen_vl_processor/image_processor.py index b6a1db19bc..7c3df2b69b 100644 --- a/fastdeploy/input/qwen_vl_processor/image_processor.py +++ b/fastdeploy/input/qwen_vl_processor/image_processor.py @@ -14,7 +14,6 @@ # limitations under the License. """ -import math from typing import List, Optional, Union import numpy as np @@ -41,6 +40,7 @@ from paddleformers.transformers.image_utils import ( from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType from PIL import Image +from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize from fastdeploy.utils import data_processor_logger OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] @@ -62,116 +62,6 @@ VideoInput = Union[ ] -def round_by_factor(number: int, factor: int) -> int: - """ - Round number to nearest multiple of factor. 
- - Args: - number: Input number to round - factor: Rounding factor - - Returns: - int: Rounded number - """ - return round(number / factor) * factor - - -def ceil_by_factor(number: int, factor: int) -> int: - """ - Round number up to nearest multiple of factor. - - Args: - number: Input number to round - factor: Rounding factor - - Returns: - int: Rounded number - """ - return math.ceil(number / factor) * factor - - -def floor_by_factor(number: int, factor: int) -> int: - """ - Round number down to nearest multiple of factor. - - Args: - number: Input number to round - factor: Rounding factor - - Returns: - int: Rounded number - """ - return math.floor(number / factor) * factor - - -def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200): - """ - Smart image resizing that maintains aspect ratio and respects constraints. - - Args: - height: Original image height - width: Original image width - factor: Patch size factor - min_pixels: Minimum allowed pixels - max_pixels: Maximum allowed pixels - max_ratio: Maximum allowed aspect ratio - - Returns: - tuple: (new_height, new_width) - - Raises: - ValueError: If calculated dimensions are invalid - """ - if max(height, width) / min(height, width) > max_ratio: - if height > width: - new_width = max(factor, round_by_factor(width, factor)) - new_height = floor_by_factor(new_width * max_ratio, factor) - else: - new_height = max(factor, round_by_factor(height, factor)) - new_width = floor_by_factor(new_height * max_ratio, factor) - - data_processor_logger.info( - f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\ - resize to {max(new_height, new_width) / min(new_height, new_width)}" - ) - - height = new_height - width = new_width - - h_bar = max(factor, round_by_factor(height, factor)) - w_bar = max(factor, round_by_factor(width, factor)) - if h_bar * w_bar > max_pixels: - beta = math.sqrt((height * width) / 
max_pixels) - h_bar = floor_by_factor(height / beta, factor) - w_bar = floor_by_factor(width / beta, factor) - elif h_bar * w_bar < min_pixels: - beta = math.sqrt(min_pixels / (height * width)) - h_bar = ceil_by_factor(height * beta, factor) - w_bar = ceil_by_factor(width * beta, factor) - - if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels: - raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}") - - return h_bar, w_bar - - -def is_scaled_image(image: np.ndarray) -> bool: - """ - Check if image pixel values are already normalized to [0, 1] range. - - Args: - image: Input image array - - Returns: - bool: True if image is already scaled - """ - if image.dtype == np.uint8: - return False - - # It's possible the image has pixel values in [0, 255] but is of floating type - return np.min(image) >= 0 and np.max(image) <= 1 - - class ImageProcessor(BaseImageProcessor): """ Adaptive image processor for dynamic image resizing and preprocessing. diff --git a/fastdeploy/input/qwen_vl_processor/process.py b/fastdeploy/input/qwen_vl_processor/process.py index 20b48b7f17..a84fac7854 100644 --- a/fastdeploy/input/qwen_vl_processor/process.py +++ b/fastdeploy/input/qwen_vl_processor/process.py @@ -26,14 +26,14 @@ from PIL import Image from fastdeploy.engine.request import ImagePosition from fastdeploy.entrypoints.chat_utils import parse_chat_messages -from fastdeploy.input.ernie4_5_vl_processor import read_video_decord from fastdeploy.input.mm_data_processor import MMBaseDataProcessor from fastdeploy.input.utils import IDS_TYPE_FLAG +from fastdeploy.input.video_utils import read_video_decord +from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames from fastdeploy.multimodal.hasher import MultimodalHasher from fastdeploy.utils import data_processor_logger from .image_processor import ImageProcessor -from .process_video import sample_frames FRAME_FACTOR = 2 FPS = 2.0 diff --git a/fastdeploy/input/qwen_vl_processor/process_video.py 
b/fastdeploy/input/qwen_vl_processor/process_video.py deleted file mode 100644 index 891f272033..0000000000 --- a/fastdeploy/input/qwen_vl_processor/process_video.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -from typing import Optional, Union - -import numpy as np - -from fastdeploy.utils import data_processor_logger - -from .image_processor import ceil_by_factor, floor_by_factor - - -def sample_frames( - frame_factor: int, - min_frames: int, - max_frames: int, - metadata: Optional[dict] = None, - fps: Optional[Union[int, float]] = -1, - num_frames: Optional[int] = -1, -): - """ - Sample frames from video according to specified criteria. 
- - Args: - frame_factor: Ensure sampled frames are multiples of this factor - min_frames: Minimum number of frames to sample - max_frames: Maximum number of frames to sample - metadata: Video metadata containing fps information - fps: Target frames per second for sampling - num_frames: Exact number of frames to sample - - Returns: - np.ndarray: Sampled video frames - - Raises: - ValueError: If both fps and num_frames are specified, - or if required metadata is missing, - or if requested frames exceed available frames - """ - if fps > 0 and num_frames > 0: - raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!") - - total_num_frames = metadata["num_of_frame"] - - # If num_frames is not given but fps is, calculate num_frames from fps - if num_frames > 0: - num_frames = round(num_frames / frame_factor) * frame_factor - elif fps > 0: - if metadata is None: - raise ValueError( - "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. " - "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video" - ) - # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor - min_frames = ceil_by_factor(min_frames, frame_factor) - max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor) - - num_frames = total_num_frames / metadata["fps"] * fps - - if num_frames > total_num_frames: - data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]") - - num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames) - num_frames = floor_by_factor(num_frames, frame_factor) - - if num_frames > total_num_frames: - raise ValueError( - f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. " - "Decrease `num_frames` or `fps` for sampling." 
- ) - - # Hack code ensures that num_frames can always be divided by 4 - # due to sched/resource_manager_v1.py 中 grid_thw.extend([[2, h, w]] * (t // 2)) - if num_frames > 2 and num_frames % 4 != 0: - num_frames = (num_frames // 4) * 4 # 向下取整到 4 的倍数 - total_num_frames = (total_num_frames // 4) * 4 - num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames) - - # Calculate frame indices based on sampling strategy - if num_frames > 0: - # Evenly spaced sampling for target frame count - indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32) - else: - # Keep all frames if no sampling requested - indices = np.arange(0, total_num_frames).astype(np.int32) - - return indices diff --git a/fastdeploy/input/video_utils.py b/fastdeploy/input/video_utils.py new file mode 100644 index 0000000000..a81cf3f5d6 --- /dev/null +++ b/fastdeploy/input/video_utils.py @@ -0,0 +1,272 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Shared video utilities: VideoReaderWrapper, read_video_decord, and sample_frames."""

import io
import math
import os
from tempfile import NamedTemporaryFile as ntf
from typing import Optional, Union

import numpy as np

from fastdeploy.input.image_processors.common import ceil_by_factor, floor_by_factor
from fastdeploy.utils import data_processor_logger

__all__ = [
    "VideoReaderWrapper",
    "read_video_decord",
    "sample_frames",
    "sample_frames_qwen",
    "sample_frames_paddleocr",
]


# ---------------------------------------------------------------------------
# VideoReaderWrapper
# ---------------------------------------------------------------------------


def _is_gif(data: bytes) -> bool:
    """Check if bytes represent a GIF based on magic header."""
    # Short inputs (< 6 bytes) simply fail the membership test -> False.
    return data[:6] in (b"GIF87a", b"GIF89a")


class VideoReaderWrapper:
    """decord.VideoReader wrapper that fixes a memory leak and adds GIF support.

    Reference: https://github.com/dmlc/decord/issues/208
    """

    def __init__(self, video_path, *args, **kwargs):
        # Lazy imports so this module can be imported without decord/moviepy installed.
        import decord

        try:
            # moviepy 1.0
            import moviepy.editor as mp
        except Exception:
            # moviepy 2.0
            import moviepy as mp

        with ntf(delete=True, suffix=".gif") as gif_file:
            gif_input = None
            self.original_file = None  # only set when we create a temp file

            if isinstance(video_path, str):
                if video_path.lower().endswith(".gif"):
                    gif_input = video_path
            elif isinstance(video_path, bytes):
                if _is_gif(video_path):
                    gif_file.write(video_path)
                    gif_file.flush()
                    gif_input = gif_file.name
            elif isinstance(video_path, io.BytesIO):
                # Peek at the header without consuming the stream.
                video_path.seek(0)
                tmp_bytes = video_path.read()
                video_path.seek(0)
                if _is_gif(tmp_bytes):
                    gif_file.write(tmp_bytes)
                    gif_file.flush()
                    gif_input = gif_file.name

            if gif_input is not None:
                # decord cannot decode GIFs: transcode to a temporary mp4 first.
                clip = mp.VideoFileClip(gif_input)
                mp4_file = ntf(delete=False, suffix=".mp4")
                mp4_path = mp4_file.name
                mp4_file.close()  # close before moviepy writes

                try:
                    clip.write_videofile(mp4_path, verbose=False, logger=None)
                except TypeError:
                    # moviepy >= 2.0 removed the `verbose` kwarg; the call above
                    # raises TypeError there, so retry with `logger=None` only.
                    clip.write_videofile(mp4_path, logger=None)
                clip.close()
                video_path = mp4_path
                self.original_file = video_path  # temp mp4, cleaned up in __del__

            self._reader = decord.VideoReader(video_path, *args, **kwargs)
            self._reader.seek(0)

    def __len__(self):
        return len(self._reader)

    def __getitem__(self, key):
        # Seek back to 0 after every read — part of the decord leak workaround.
        frames = self._reader[key]
        self._reader.seek(0)
        return frames

    def get_avg_fps(self):
        return self._reader.get_avg_fps()

    def seek(self, pos):
        return self._reader.seek(pos)

    def __del__(self):
        # getattr guards partially-constructed instances (e.g. __init__ raised).
        original_file = getattr(self, "original_file", None)
        if original_file:
            try:
                os.remove(original_file)
            except OSError:
                pass


# ---------------------------------------------------------------------------
# read_video_decord
# ---------------------------------------------------------------------------


def read_video_decord(video_path, save_to_disk: bool = False):
    """Load a video file and return (video_reader, video_meta, video_path).

    video_meta contains keys: "fps", "duration", "num_of_frame".

    Args:
        video_path: a path string, raw bytes, io.BytesIO, or an existing
            VideoReaderWrapper (returned unchanged).
        save_to_disk: accepted for interface compatibility; not used here.
    """
    if isinstance(video_path, VideoReaderWrapper):
        video_reader = video_path
    else:
        if isinstance(video_path, bytes):
            video_path = io.BytesIO(video_path)
        video_reader = VideoReaderWrapper(video_path, num_threads=1)

    vlen = len(video_reader)
    fps = video_reader.get_avg_fps()
    duration = vlen / float(fps)

    video_meta = {"fps": fps, "duration": duration, "num_of_frame": vlen}
    return video_reader, video_meta, video_path


# ---------------------------------------------------------------------------
# sample_frames — qwen_vl variant
# ---------------------------------------------------------------------------


def sample_frames_qwen(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
) -> np.ndarray:
    """Sample frame indices — qwen_vl variant.

    Sentinel defaults are -1. Applies ceil_by_factor on min_frames and ensures
    num_frames is divisible by 4.

    Args:
        frame_factor: sampled frame count is kept a multiple of this factor.
        min_frames: lower bound on the number of sampled frames.
        max_frames: upper bound on the number of sampled frames.
        metadata: dict with "num_of_frame" and "fps" keys (required).
        fps: target sampling rate; -1 disables fps-based sampling.
        num_frames: exact frame count; -1 disables count-based sampling.

    Returns:
        np.ndarray of int32 frame indices into the video.

    Raises:
        ValueError: if both fps and num_frames are given, metadata is None,
            or the inferred frame count exceeds the total frame count.
    """
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

    if metadata is None:
        raise ValueError("metadata is required for sample_frames_qwen")

    total_num_frames = metadata["num_of_frame"]

    if num_frames > 0:
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)

        num_frames = total_num_frames / metadata["fps"] * fps

        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")

        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    # num_frames must be divisible by 4
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)

    if num_frames > 0:
        # Evenly spaced sampling for the target frame count.
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # Neither fps nor num_frames requested: keep every frame.
        indices = np.arange(0, total_num_frames).astype(np.int32)

    return indices


# ---------------------------------------------------------------------------
# sample_frames — paddleocr_vl / ernie4_5_vl variant
# ---------------------------------------------------------------------------


def sample_frames_paddleocr(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
) -> np.ndarray:
    """Sample frame indices — paddleocr_vl / ernie4_5_vl variant.

    Sentinel defaults are None. Uses plain math.floor/ceil; no %4 correction.

    Args:
        frame_factor: sampled frame count is kept a multiple of this factor.
        min_frames: lower bound on the number of sampled frames.
        max_frames: upper bound on the number of sampled frames.
        metadata: dict with "num_of_frame" and "fps" keys (required).
        fps: target sampling rate; None/0 disables fps-based sampling.
        num_frames: exact frame count; None/0 disables count-based sampling.

    Returns:
        np.ndarray of int32 frame indices into the video.

    Raises:
        ValueError: if both fps and num_frames are given, metadata is None,
            or the requested frame count exceeds the total frame count.
    """
    # Normalize None sentinels to 0 so the numeric comparisons below are safe.
    fps = fps or 0
    num_frames = num_frames or 0
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

    if metadata is None:
        raise ValueError("metadata is required for sample_frames_paddleocr")

    total_num_frames = metadata["num_of_frame"]

    if num_frames > 0:
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        num_frames = total_num_frames / metadata["fps"] * fps
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = math.floor(num_frames / frame_factor) * frame_factor

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    if num_frames > 0:
        # Evenly spaced sampling for the target frame count.
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # Neither fps nor num_frames requested: keep every frame.
        indices = np.arange(0, total_num_frames).astype(np.int32)

    return indices


def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
    variant: str = "paddleocr",
) -> np.ndarray:
    """Dispatch to sample_frames_qwen or sample_frames_paddleocr based on variant."""
    if variant == "qwen":
        # The qwen variant uses -1 sentinels; translate None accordingly.
        _fps = fps if fps is not None else -1
        _num_frames = num_frames if num_frames is not None else -1
        return sample_frames_qwen(frame_factor, min_frames, max_frames, metadata, _fps, _num_frames)
    if variant == "paddleocr":
        return sample_frames_paddleocr(frame_factor, min_frames, max_frames, metadata, fps, num_frames)
    raise ValueError(f"Unknown variant {variant!r}. Expected 'paddleocr' or 'qwen'.")
a/tests/input/test_paddleocr_vl_processor.py +++ b/tests/input/test_paddleocr_vl_processor.py @@ -30,7 +30,7 @@ from fastdeploy.input.paddleocr_vl_processor.paddleocr_vl_processor import ( PaddleOCRVLProcessor, ) from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor -from fastdeploy.input.paddleocr_vl_processor.process_video import sample_frames +from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames MODULE_PATH = "fastdeploy.input.paddleocr_vl_processor.process" @@ -86,7 +86,7 @@ class TestProcessVideo(unittest.TestCase): def test_error_fps_without_metadata(self): """新增:测试 fps > 0 但 metadata 为 None""" - with self.assertRaises(TypeError) as context: + with self.assertRaises(ValueError) as context: sample_frames( frame_factor=self.frame_factor, min_frames=self.min_frames, @@ -95,8 +95,7 @@ class TestProcessVideo(unittest.TestCase): fps=10, metadata=None, # 缺失 ) - # 验证是预期的 TypeError - self.assertIn("'NoneType' object is not subscriptable", str(context.exception)) + self.assertIn("metadata is required", str(context.exception)) def test_num_frames_rounding(self): """新增:测试 num_frames 向 frame_factor 舍入""" diff --git a/tests/input/test_qwen_vl_processor.py b/tests/input/test_qwen_vl_processor.py index eba0bfc084..dd5e890958 100644 --- a/tests/input/test_qwen_vl_processor.py +++ b/tests/input/test_qwen_vl_processor.py @@ -21,7 +21,7 @@ import numpy as np from PIL import Image from fastdeploy.input.qwen_vl_processor import QwenVLProcessor -from fastdeploy.input.qwen_vl_processor.process_video import sample_frames +from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames def mock_pil_image(height, width): diff --git a/tests/input/test_video_utils.py b/tests/input/test_video_utils.py new file mode 100644 index 0000000000..28e6f97e3d --- /dev/null +++ b/tests/input/test_video_utils.py @@ -0,0 +1,365 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Unit tests for fastdeploy.input.video_utils.

decord and moviepy are never imported for real: every test mocks them via
sys.modules patching or by replacing VideoReaderWrapper on the module.
"""

import io
import unittest
from unittest.mock import MagicMock, patch

import numpy as np

from fastdeploy.input.video_utils import (
    _is_gif,
    read_video_decord,
    sample_frames,
    sample_frames_paddleocr,
    sample_frames_qwen,
)

# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------

# Byte fixtures: two valid GIF magic headers and one non-GIF header, padded.
GIF87_HEADER = b"GIF87a" + b"\x00" * 10
GIF89_HEADER = b"GIF89a" + b"\x00" * 10
NOT_GIF = b"NOTGIF" + b"\x00" * 10


def _make_mock_reader(num_frames=100, fps=25.0):
    """Return a mock that mimics decord.VideoReader."""
    reader = MagicMock()
    reader.__len__ = MagicMock(return_value=num_frames)
    reader.get_avg_fps = MagicMock(return_value=fps)
    reader.seek = MagicMock(return_value=None)
    frame = MagicMock()
    frame.asnumpy = MagicMock(return_value=np.zeros((480, 640, 3), dtype=np.uint8))
    reader.__getitem__ = MagicMock(return_value=frame)
    return reader


# ---------------------------------------------------------------------------
# _is_gif
# ---------------------------------------------------------------------------


class TestIsGif(unittest.TestCase):
    # Covers both GIF magic variants, a non-GIF header, and a truncated input.
    def test_gif87a(self):
        self.assertTrue(_is_gif(GIF87_HEADER))

    def test_gif89a(self):
        self.assertTrue(_is_gif(GIF89_HEADER))

    def test_not_gif(self):
        self.assertFalse(_is_gif(NOT_GIF))

    def test_short_bytes(self):
        self.assertFalse(_is_gif(b"GIF"))


# ---------------------------------------------------------------------------
# VideoReaderWrapper (mock decord + moviepy)
# ---------------------------------------------------------------------------


class TestVideoReaderWrapper(unittest.TestCase):
    def _make_wrapper(self, video_path, mock_reader=None):
        """Construct a VideoReaderWrapper with decord mocked out."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        if mock_reader is None:
            mock_reader = _make_mock_reader()

        mock_decord = MagicMock()
        mock_decord.VideoReader.return_value = mock_reader

        # VideoReaderWrapper imports decord/moviepy lazily inside __init__,
        # so patching sys.modules here is enough to intercept them.
        with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
            wrapper = VideoReaderWrapper(video_path)

        wrapper._reader = mock_reader
        return wrapper

    def test_len(self):
        reader = _make_mock_reader(num_frames=42)
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        self.assertEqual(len(wrapper), 42)

    def test_getitem_resets_seek(self):
        reader = _make_mock_reader()
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        _ = wrapper[0]
        reader.seek.assert_called_with(0)

    def test_get_avg_fps(self):
        reader = _make_mock_reader(fps=30.0)
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        self.assertEqual(wrapper.get_avg_fps(), 30.0)

    def test_seek(self):
        reader = _make_mock_reader()
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        wrapper.seek(5)
        reader.seek.assert_called_with(5)

    def test_del_no_original_file(self):
        """__del__ should be a no-op when original_file is None."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        # object.__new__ bypasses __init__ (and its decord/moviepy imports).
        wrapper = object.__new__(VideoReaderWrapper)
        wrapper.original_file = None
        wrapper._reader = _make_mock_reader()
        # Should not raise
        wrapper.__del__()

    def test_del_removes_temp_file(self):
        """__del__ removes the file only when original_file is set."""
        import os
        import tempfile

        from fastdeploy.input.video_utils import VideoReaderWrapper

        with tempfile.NamedTemporaryFile(delete=False) as f:
            tmp_path = f.name

        wrapper = object.__new__(VideoReaderWrapper)
        wrapper.original_file = tmp_path
        wrapper._reader = _make_mock_reader()
        wrapper.__del__()
        self.assertFalse(os.path.exists(tmp_path))

    def test_non_gif_string_path_does_not_set_original_file(self):
        """Passing a non-GIF string path must NOT set original_file (bug fix)."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        mock_reader = _make_mock_reader()
        mock_decord = MagicMock()
        mock_decord.VideoReader.return_value = mock_reader

        with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
            wrapper = VideoReaderWrapper("/fake/video.mp4")

        self.assertIsNone(wrapper.original_file)

    def test_bytesio_non_gif_path_does_not_set_original_file(self):
        """Passing a BytesIO that is NOT a GIF must not set original_file."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        mock_reader = _make_mock_reader()
        mock_decord = MagicMock()
        mock_decord.VideoReader.return_value = mock_reader

        bio = io.BytesIO(NOT_GIF)
        with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
            wrapper = VideoReaderWrapper(bio)

        self.assertIsNone(wrapper.original_file)


# ---------------------------------------------------------------------------
# read_video_decord
# ---------------------------------------------------------------------------


class TestReadVideoDecord(unittest.TestCase):
    def _patch_wrapper(self, num_frames=100, fps=25.0):
        """Return a context manager that replaces VideoReaderWrapper with a mock."""
        from fastdeploy.input import video_utils

        mock_wrapper = MagicMock()
        mock_wrapper.__len__ = MagicMock(return_value=num_frames)
        mock_wrapper.get_avg_fps = MagicMock(return_value=fps)
        return patch.object(video_utils, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper

    def test_existing_wrapper_passthrough(self):
        """Already-wrapped reader is returned as-is."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        # spec= makes isinstance(mock, VideoReaderWrapper) True inside the API.
        mock_wrapper = MagicMock(spec=VideoReaderWrapper)
        mock_wrapper.__len__ = MagicMock(return_value=50)
        mock_wrapper.get_avg_fps = MagicMock(return_value=10.0)

        reader, meta, path = read_video_decord(mock_wrapper)

        self.assertIs(reader, mock_wrapper)
        self.assertEqual(meta["num_of_frame"], 50)
        self.assertAlmostEqual(meta["fps"], 10.0)
        self.assertAlmostEqual(meta["duration"], 5.0)

    def test_bytes_input_converted_to_bytesio(self):
        """bytes input is converted to BytesIO before creating VideoReaderWrapper."""
        from fastdeploy.input import video_utils

        captured = []

        class FakeWrapper:
            def __init__(self, path, *args, **kwargs):
                captured.append(path)

            def __len__(self):
                return 30

            def get_avg_fps(self):
                return 10.0

        with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
            reader, meta, path = read_video_decord(b"fake_video_bytes")

        self.assertIsInstance(captured[0], io.BytesIO)

    def test_string_path_input(self):
        """String path is passed through to VideoReaderWrapper."""
        from fastdeploy.input import video_utils

        class FakeWrapper:
            def __init__(self, path, *args, **kwargs):
                pass

            def __len__(self):
                return 60

            def get_avg_fps(self):
                return 30.0

        with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
            reader, meta, path = read_video_decord("/fake/path.mp4")

        self.assertEqual(meta["num_of_frame"], 60)
        self.assertAlmostEqual(meta["duration"], 2.0)
        self.assertEqual(path, "/fake/path.mp4")


# ---------------------------------------------------------------------------
# sample_frames_qwen
# ---------------------------------------------------------------------------


class TestSampleFramesQwen(unittest.TestCase):
    # 100 frames at 25 fps -> 4-second clip.
    META = {"num_of_frame": 100, "fps": 25.0}

    def test_num_frames_basic(self):
        indices = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
        self.assertEqual(len(indices), 8)

    def test_fps_basic(self):
        indices = sample_frames_qwen(2, 4, 100, self.META, fps=2.0)
        self.assertGreater(len(indices), 0)
        self.assertEqual(len(indices) % 2, 0)

    def test_fps_and_num_frames_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_qwen(2, 4, 100, self.META, fps=2.0, num_frames=10)

    def test_num_frames_exceeds_total_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_qwen(2, 4, 100, self.META, num_frames=200)

    def test_fps_warning_when_nframes_exceeds_total(self):
        """fps so high that computed num_frames > total → warning logged."""
        with self.assertLogs(level="WARNING"):
            sample_frames_qwen(2, 4, 100, {"num_of_frame": 10, "fps": 1.0}, fps=100.0)

    def test_divisible_by_4_correction(self):
        """Result must be divisible by 4 when num_frames > 2."""
        indices = sample_frames_qwen(2, 4, 100, self.META, fps=1.5)
        if len(indices) > 2:
            self.assertEqual(len(indices) % 4, 0)

    def test_no_sampling_returns_all_frames(self):
        """Both fps and num_frames at sentinel → return all frames."""
        indices = sample_frames_qwen(2, 4, 100, self.META)
        self.assertEqual(len(indices), 100)

    def test_indices_dtype(self):
        indices = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
        self.assertEqual(indices.dtype, np.int32)


# ---------------------------------------------------------------------------
# sample_frames_paddleocr
# ---------------------------------------------------------------------------


class TestSampleFramesPaddleocr(unittest.TestCase):
    META = {"num_of_frame": 100, "fps": 25.0}

    def test_num_frames_basic(self):
        indices = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=10)
        self.assertEqual(len(indices), 10)

    def test_fps_basic(self):
        indices = sample_frames_paddleocr(1, 4, 100, self.META, fps=2.0)
        self.assertGreater(len(indices), 0)

    def test_fps_and_num_frames_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_paddleocr(1, 4, 100, self.META, fps=2.0, num_frames=10)

    def test_num_frames_exceeds_total_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_paddleocr(1, 4, 100, self.META, num_frames=200)

    def test_none_sentinels_no_sampling(self):
        """fps=None, num_frames=None → return all frames."""
        indices = sample_frames_paddleocr(1, 4, 100, self.META)
        self.assertEqual(len(indices), 100)

    def test_no_4_correction(self):
        """paddleocr variant does NOT apply %4 correction."""
        # 6 frames is not divisible by 4; paddleocr should keep it
        meta = {"num_of_frame": 100, "fps": 25.0}
        indices = sample_frames_paddleocr(1, 1, 100, meta, num_frames=6)
        self.assertEqual(len(indices), 6)

    def test_indices_dtype(self):
        indices = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=8)
        self.assertEqual(indices.dtype, np.int32)


# ---------------------------------------------------------------------------
# sample_frames dispatcher
# ---------------------------------------------------------------------------


class TestSampleFramesDispatcher(unittest.TestCase):
    META = {"num_of_frame": 100, "fps": 25.0}

    def test_default_variant_is_paddleocr(self):
        # wraps= keeps the real behavior while recording the call.
        with patch("fastdeploy.input.video_utils.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
            sample_frames(1, 4, 100, self.META, num_frames=8)
            mock_fn.assert_called_once()

    def test_qwen_variant_dispatched(self):
        with patch("fastdeploy.input.video_utils.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
            sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
            mock_fn.assert_called_once()

    def test_qwen_none_fps_converted_to_sentinel(self):
        """None fps/num_frames → converted to -1 before calling sample_frames_qwen."""
        with patch("fastdeploy.input.video_utils.sample_frames_qwen", return_value=np.array([])) as mock_fn:
            sample_frames(2, 4, 100, self.META, fps=None, num_frames=None, variant="qwen")
            args = mock_fn.call_args[0]
            self.assertEqual(args[4], -1)  # fps sentinel
            self.assertEqual(args[5], -1)  # num_frames sentinel

    def test_paddleocr_variant_result_consistent(self):
        direct = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=8)
        via_dispatcher = sample_frames(1, 4, 100, self.META, num_frames=8, variant="paddleocr")
        np.testing.assert_array_equal(direct, via_dispatcher)

    def test_qwen_variant_result_consistent(self):
        direct = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
        via_dispatcher = sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
        np.testing.assert_array_equal(direct, via_dispatcher)


if __name__ == "__main__":
    unittest.main()