[Optimization] Deduplicate shared image/video utilities across VL processors (#6988)

* step1~3

* fix import path

* Remove duplicated code

* Remove duplicated code

* Remove duplicated code

* fix import path

* update

* fix import path

* add unit test

* fix

* update

* fix unit test
This commit is contained in:
luukunn
2026-03-26 09:49:33 +08:00
committed by GitHub
parent 1502b6f43e
commit d5cb2767d7
16 changed files with 882 additions and 593 deletions
@@ -16,7 +16,6 @@
"""image preprocessor adaptive"""
import math
from typing import List, Optional, Union
import numpy as np
@@ -45,6 +44,8 @@ from paddleformers.transformers.image_utils import (
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
@@ -73,22 +74,9 @@ __all__ = [
]
def is_scaled_image(image: np.ndarray) -> bool:
    """
    Checks to see whether the pixel values have already been rescaled to [0, 1].
    """
    # uint8 data is raw 0-255 pixel values by definition, never pre-scaled.
    if image.dtype == np.uint8:
        return False
    # Floating-point images may still hold values in [0, 255]; inspect the range.
    lo = np.min(image)
    hi = np.max(image)
    return lo >= 0 and hi <= 1
def make_batched_images(images) -> List[List[ImageInput]]:
"""
Accepts images in list or nested list format, and makes a list of images for preprocessing.
Args:
images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
The input image.
@@ -521,67 +509,3 @@ class AdaptiveImageProcessor(BaseImageProcessor):
}
return BatchFeature(data=data, tensor_type=return_tensors)
def round_by_factor(number: int, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    quotient = round(number / factor)
    return quotient * factor
def ceil_by_factor(number: int, factor: int) -> int:
    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
    return factor * math.ceil(number / factor)
def floor_by_factor(number: int, factor: int) -> int:
    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
    return factor * math.floor(number / factor)
def smart_resize(
    height: int,
    width: int,
    factor: int = IMAGE_FACTOR,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
):
    """
    Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.
    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
    3. The aspect ratio of the image is maintained as closely as possible.

    Args:
        height: Original image height in pixels.
        width: Original image width in pixels.
        factor: Patch-size factor; both returned dimensions are multiples of it.
        min_pixels: Minimum allowed total pixel count.
        max_pixels: Maximum allowed total pixel count.

    Returns:
        tuple: (h_bar, w_bar), the resized height and width.

    Raises:
        ValueError: If the adjusted dimensions still violate the pixel bounds.
    """
    # Extreme aspect ratios are clipped (not rejected): the longer side is
    # shrunk so the ratio equals MAX_RATIO, keeping both sides factor-aligned.
    if max(height, width) / min(height, width) > MAX_RATIO:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * MAX_RATIO, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * MAX_RATIO, factor)
        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\
resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )
        height = new_height
        width = new_width
    # Snap each side to the nearest multiple of `factor`, never below `factor`.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too many pixels: scale down by beta, rounding down to stay <= max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: scale up by beta, rounding up to stay >= min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
    return h_bar, w_bar
@@ -0,0 +1,13 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+208
View File
@@ -0,0 +1,208 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared image utility functions for all VL image processors."""
import math
import numpy as np
from fastdeploy.utils import data_processor_logger
__all__ = [
"round_by_factor",
"ceil_by_factor",
"floor_by_factor",
"is_scaled_image",
"smart_resize",
"smart_resize_qwen",
"smart_resize_paddleocr",
]
def round_by_factor(number: int, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    units = round(number / factor)
    return units * factor
def ceil_by_factor(number: int, factor: int) -> int:
    """Returns the smallest integer >= 'number' that is divisible by 'factor'."""
    units = math.ceil(number / factor)
    return units * factor
def floor_by_factor(number: int, factor: int) -> int:
    """Returns the largest integer <= 'number' that is divisible by 'factor'."""
    units = math.floor(number / factor)
    return units * factor
def is_scaled_image(image: np.ndarray) -> bool:
    """Check if image pixel values are already normalized to [0, 1] range.

    Args:
        image: Input image array.

    Returns:
        bool: True if image is already scaled to [0, 1].
    """
    # Raw uint8 pixels live in 0-255 and are never pre-scaled.
    if image.dtype == np.uint8:
        return False
    # A float image may still contain 0-255 values; verify the actual range.
    low_ok = np.min(image) >= 0
    high_ok = np.max(image) <= 1
    return low_ok and high_ok
def smart_resize_qwen(
    height: int,
    width: int,
    factor: int,
    min_pixels: int,
    max_pixels: int,
    max_ratio: int = 200,
) -> tuple:
    """Smart image resizing for ERNIE / Qwen2.5 / Qwen3 models.

    Maintains aspect ratio and respects pixel constraints. When the aspect ratio
    exceeds max_ratio, the image is cropped (not raised as error) to fit within
    the ratio limit.

    Args:
        height: Original image height.
        width: Original image width.
        factor: Patch size factor; both output dimensions will be multiples of this.
        min_pixels: Minimum allowed total pixels.
        max_pixels: Maximum allowed total pixels.
        max_ratio: Maximum allowed aspect ratio (default 200).

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If calculated dimensions are still invalid after resizing.
    """
    # Clip extreme aspect ratios: shrink the longer side so the ratio equals
    # max_ratio while keeping both sides aligned to `factor`.
    if max(height, width) / min(height, width) > max_ratio:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * max_ratio, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * max_ratio, factor)
        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {max_ratio}, "
            f"got {max(height, width) / min(height, width)}, "
            f"resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )
        height = new_height
        width = new_width
    # Snap each side to the nearest multiple of `factor`, never below `factor`.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too many pixels: scale both sides down by `beta`, rounding down so
        # the result stays at or below max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: scale both sides up by `beta`, rounding up so the
        # result stays at or above min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
    return h_bar, w_bar
def smart_resize_paddleocr(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 28 * 28 * 130,
    max_pixels: int = 28 * 28 * 1280,
) -> tuple:
    """Smart image resizing for PaddleOCR-VL model.

    Produces factor-aligned dimensions whose product falls within
    [min_pixels, max_pixels]. Unlike the qwen variant, a side smaller than
    `factor` is first scaled up to `factor`, and an aspect ratio above 200
    raises ValueError instead of being clipped.

    Args:
        height: Original image height.
        width: Original image width.
        factor: Patch size factor; both output dimensions will be multiples of this.
        min_pixels: Minimum allowed total pixels.
        max_pixels: Maximum allowed total pixels.

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If the aspect ratio exceeds 200.
    """
    # Small-image protection: bring the short side up to `factor` first.
    if height < factor:
        data_processor_logger.debug(f"smart_resize_paddleocr: height={height} < factor={factor}, reset height=factor")
        width = round((width * factor) / height)
        height = factor
    if width < factor:
        data_processor_logger.debug(f"smart_resize_paddleocr: width={width} < factor={factor}, reset width=factor")
        height = round((height * factor) / width)
        width = factor
    ratio = max(height, width) / min(height, width)
    if ratio > 200:
        raise ValueError(f"absolute aspect ratio must be smaller than 200, got {ratio}")
    # Snap each side to the nearest multiple of `factor`.
    h_bar = factor * round(height / factor)
    w_bar = factor * round(width / factor)
    area = h_bar * w_bar
    if area > max_pixels:
        # Scale down, rounding each side down to a multiple of `factor`.
        shrink = math.sqrt((height * width) / max_pixels)
        h_bar = factor * math.floor(height / shrink / factor)
        w_bar = factor * math.floor(width / shrink / factor)
    elif area < min_pixels:
        # Scale up, rounding each side up to a multiple of `factor`.
        grow = math.sqrt(min_pixels / (height * width))
        h_bar = factor * math.ceil(height * grow / factor)
        w_bar = factor * math.ceil(width * grow / factor)
    return h_bar, w_bar
def smart_resize(
    height: int,
    width: int,
    factor: int,
    min_pixels: int,
    max_pixels: int,
    max_ratio: int = 200,
    variant: str = "qwen",
) -> tuple:
    """Dispatch to the variant-specific smart_resize implementation.

    Args:
        height: Original image height.
        width: Original image width.
        factor: Patch size factor.
        min_pixels: Minimum allowed total pixels.
        max_pixels: Maximum allowed total pixels.
        max_ratio: Maximum allowed aspect ratio (only used by "qwen" variant).
        variant: "qwen" (default, ERNIE / Qwen2.5 / Qwen3 — clips extreme
            ratios silently) or "paddleocr" (small-image protection, raises on
            bad ratio).

    Returns:
        tuple: (new_height, new_width)
    """
    if variant != "paddleocr":
        # Any value other than "paddleocr" falls back to the qwen algorithm.
        return smart_resize_qwen(height, width, factor, min_pixels, max_pixels, max_ratio)
    return smart_resize_paddleocr(height, width, factor, min_pixels, max_pixels)
@@ -19,8 +19,6 @@
# TODO: Support videos
import json
import logging
import math
from pathlib import Path
from typing import Dict, List, Optional, Union
@@ -34,6 +32,10 @@ from paddleformers.transformers.image_utils import (
to_numpy_array,
)
from fastdeploy.input.image_processors.common import (
smart_resize_paddleocr as smart_resize,
)
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
@@ -68,54 +70,6 @@ def adjust_size(size, patch_size):
return num_patches * patch_size
def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 28 * 28 * 130,
    max_pixels: int = 28 * 28 * 1280,
):
    """Rescale (height, width) so that both dimensions are multiples of
    `factor`, the total pixel count lies within [min_pixels, max_pixels], and
    the aspect ratio is preserved as closely as possible. A side below
    `factor` is scaled up to `factor` first; an aspect ratio above 200 raises
    ValueError.
    """
    # Small-image protection: lift the short side to `factor`.
    if height < factor:
        logging.debug(f"smart_resize: height={height} < factor={factor}, reset height=factor")
        width = round((width * factor) / height)
        height = factor
    if width < factor:
        logging.debug(f"smart_resize: width={width} < factor={factor}, reset width=factor")
        height = round((height * factor) / width)
        width = factor
    ratio = max(height, width) / min(height, width)
    if ratio > 200:
        raise ValueError(f"absolute aspect ratio must be smaller than 200, got {ratio}")
    # Snap each side to the nearest multiple of `factor`.
    h_bar = factor * round(height / factor)
    w_bar = factor * round(width / factor)
    if h_bar * w_bar > max_pixels:
        # Scale down, rounding down to keep the area within max_pixels.
        scale = math.sqrt((height * width) / max_pixels)
        h_bar = factor * math.floor(height / scale / factor)
        w_bar = factor * math.floor(width / scale / factor)
    elif h_bar * w_bar < min_pixels:
        # Scale up, rounding up to reach at least min_pixels.
        scale = math.sqrt(min_pixels / (height * width))
        h_bar = factor * math.ceil(height * scale / factor)
        w_bar = factor * math.ceil(width * scale / factor)
    return h_bar, w_bar
class ImageProcessor(BaseImageProcessor):
model_input_names = [
"pixel_values",
@@ -26,14 +26,14 @@ from PIL import Image
from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
from .image_processor import ImageProcessor
from .process_video import sample_frames
class DataProcessor(MMBaseDataProcessor):
@@ -1,82 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import math
from typing import Optional, Union
import numpy as np
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
):
    """
    Sample frames from video according to specified criteria.

    Args:
        frame_factor: Ensure sampled frames are multiples of this factor
        min_frames: Minimum number of frames to sample
        max_frames: Maximum number of frames to sample
        metadata: Video metadata; must contain "num_of_frame" (and "fps" when
            sampling by `fps`)
        fps: Target frames per second for sampling
        num_frames: Exact number of frames to sample

    Returns:
        np.ndarray: Indices of the sampled frames (int32)

    Raises:
        ValueError: If both fps and num_frames are specified,
            or if required metadata is missing,
            or if requested frames exceed available frames
    """
    # Normalize optional arguments: the documented defaults are None, and
    # comparing None with `> 0` would raise TypeError, so map None -> -1
    # (the "not given" sentinel used by the comparisons below).
    fps = -1 if fps is None else fps
    num_frames = -1 if num_frames is None else num_frames
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    # Metadata is always needed for the total frame count; check it before
    # dereferencing so callers get a ValueError instead of a TypeError.
    if metadata is None:
        raise ValueError(
            "No video metadata was provided; `metadata` containing `num_of_frame` is required for sampling."
        )
    total_num_frames = metadata["num_of_frame"]
    # If num_frames is not given but fps is, calculate num_frames from fps
    if num_frames > 0:
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        num_frames = total_num_frames / metadata["fps"] * fps
        # Clamp to [min_frames, max_frames] and the actual frame count, then
        # round down to a multiple of frame_factor.
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = math.floor(num_frames / frame_factor) * frame_factor
    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )
    # Calculate frame indices based on sampling strategy
    if num_frames > 0:
        # Evenly spaced sampling for target frame count
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # Keep all frames if no sampling requested
        indices = np.arange(0, total_num_frames).astype(np.int32)
    return indices
@@ -14,7 +14,6 @@
# limitations under the License.
"""
import math
from typing import List, Optional, Union
import numpy as np
@@ -41,6 +40,7 @@ from paddleformers.transformers.image_utils import (
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
IMAGE_MEAN = [0.5, 0.5, 0.5]
@@ -62,86 +62,6 @@ VideoInput = Union[
]
def round_by_factor(number: int, factor: int) -> int:
    """Return the multiple of `factor` closest to `number`."""
    multiple = round(number / factor)
    return multiple * factor
def ceil_by_factor(number: int, factor: int) -> int:
    """Return the smallest multiple of `factor` that is >= `number`."""
    return factor * math.ceil(number / factor)
def floor_by_factor(number: int, factor: int) -> int:
    """Return the largest multiple of `factor` that is <= `number`."""
    return factor * math.floor(number / factor)
def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200):
    """
    Smart image resizing that maintains aspect ratio and respects constraints.

    Args:
        height: Original image height
        width: Original image width
        factor: Patch size factor
        min_pixels: Minimum allowed pixels
        max_pixels: Maximum allowed pixels
        max_ratio: Maximum allowed aspect ratio

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If calculated dimensions are invalid
    """
    # Extreme aspect ratios are clipped (not rejected): the longer side is
    # shrunk so the ratio equals max_ratio, keeping both sides factor-aligned.
    if max(height, width) / min(height, width) > max_ratio:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * max_ratio, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * max_ratio, factor)
        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\
resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )
        height = new_height
        width = new_width
    # Snap each side to the nearest multiple of `factor`, never below `factor`.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too many pixels: scale down by beta, rounding down to stay <= max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: scale up by beta, rounding up to stay >= min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
    return h_bar, w_bar
def is_scaled_image(image: np.ndarray) -> bool:
    """
    Check if image pixel values are already normalized to [0, 1] range.

    Args:
        image: Input image array

    Returns:
        bool: True if image is already scaled
    """
    # uint8 always carries raw 0-255 values.
    if image.dtype == np.uint8:
        return False
    # Floating-point images can still be in [0, 255]; test the observed range.
    within_lower = np.min(image) >= 0
    within_upper = np.max(image) <= 1
    return within_lower and within_upper
class ImageProcessor(BaseImageProcessor):
"""
Adaptive image processor for dynamic image resizing and preprocessing.
+3 -79
View File
@@ -26,13 +26,14 @@ from PIL import Image
from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
from .image_processor import ImageProcessor, ceil_by_factor, floor_by_factor
from .image_processor import ImageProcessor
VIDEO_MIN_PIXELS = 128 * 28 * 28
VIDEO_MAX_PIXELS = 768 * 28 * 28
@@ -42,83 +43,6 @@ FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 768
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
):
    """
    Sample frames from video according to specified criteria.

    Args:
        frame_factor: Ensure sampled frames are multiples of this factor
        min_frames: Minimum number of frames to sample
        max_frames: Maximum number of frames to sample
        metadata: Video metadata containing fps information; must provide
            "num_of_frame" (and "fps" when sampling by `fps`)
        fps: Target frames per second for sampling (-1 means "not given")
        num_frames: Exact number of frames to sample (-1 means "not given")

    Returns:
        np.ndarray: int32 indices of the frames to keep

    Raises:
        ValueError: If both fps and num_frames are specified,
            or if required metadata is missing,
            or if requested frames exceed available frames
    """
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    # NOTE(review): metadata is dereferenced here before the None check in the
    # fps branch below, so metadata=None raises TypeError — confirm intended.
    total_num_frames = metadata["num_of_frame"]
    # If num_frames is not given but fps is, calculate num_frames from fps
    if num_frames > 0:
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        if metadata is None:
            raise ValueError(
                "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
                "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
            )
        # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
        num_frames = total_num_frames / metadata["fps"] * fps
        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
        # Clamp to [min_frames, max_frames] and the real frame count, then
        # round down to a multiple of frame_factor.
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)
    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )
    # Hack code ensures that num_frames can always be divided by 4,
    # because sched/resource_manager_v1.py expands grid_thw as
    # grid_thw.extend([[2, h, w]] * (t // 2))
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4  # round down to a multiple of 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
    # Calculate frame indices based on sampling strategy
    if num_frames > 0:
        # Evenly spaced sampling for target frame count
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # Keep all frames if no sampling requested
        indices = np.arange(0, total_num_frames).astype(np.int32)
    return indices
class DataProcessor(MMBaseDataProcessor):
"""
Processes multimodal inputs (text, images, videos) into model-ready formats.
@@ -14,7 +14,6 @@
# limitations under the License.
"""
import math
from typing import List, Optional, Union
import numpy as np
@@ -41,6 +40,7 @@ from paddleformers.transformers.image_utils import (
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
@@ -62,116 +62,6 @@ VideoInput = Union[
]
def round_by_factor(number: int, factor: int) -> int:
    """
    Round number to nearest multiple of factor.

    Args:
        number: Input number to round
        factor: Rounding factor

    Returns:
        int: Rounded number
    """
    nearest_units = round(number / factor)
    return nearest_units * factor
def ceil_by_factor(number: int, factor: int) -> int:
    """
    Round number up to nearest multiple of factor.

    Args:
        number: Input number to round
        factor: Rounding factor

    Returns:
        int: Rounded number
    """
    upper_units = math.ceil(number / factor)
    return upper_units * factor
def floor_by_factor(number: int, factor: int) -> int:
    """
    Round number down to nearest multiple of factor.

    Args:
        number: Input number to round
        factor: Rounding factor

    Returns:
        int: Rounded number
    """
    lower_units = math.floor(number / factor)
    return lower_units * factor
def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200):
    """
    Smart image resizing that maintains aspect ratio and respects constraints.

    Args:
        height: Original image height
        width: Original image width
        factor: Patch size factor
        min_pixels: Minimum allowed pixels
        max_pixels: Maximum allowed pixels
        max_ratio: Maximum allowed aspect ratio

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If calculated dimensions are invalid
    """
    # Extreme aspect ratios are clipped (not rejected): the longer side is
    # shrunk so the ratio equals max_ratio, keeping both sides factor-aligned.
    if max(height, width) / min(height, width) > max_ratio:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * max_ratio, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * max_ratio, factor)
        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\
resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )
        height = new_height
        width = new_width
    # Snap each side to the nearest multiple of `factor`, never below `factor`.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too many pixels: scale down by beta, rounding down to stay <= max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: scale up by beta, rounding up to stay >= min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
    return h_bar, w_bar
def is_scaled_image(image: np.ndarray) -> bool:
    """
    Check if image pixel values are already normalized to [0, 1] range.

    Args:
        image: Input image array

    Returns:
        bool: True if image is already scaled
    """
    if image.dtype != np.uint8:
        # Floating images may still carry 0-255 values; inspect the range.
        return np.min(image) >= 0 and np.max(image) <= 1
    # uint8 is raw 0-255 pixel data by definition.
    return False
class ImageProcessor(BaseImageProcessor):
"""
Adaptive image processor for dynamic image resizing and preprocessing.
@@ -26,14 +26,14 @@ from PIL import Image
from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
from .image_processor import ImageProcessor
from .process_video import sample_frames
FRAME_FACTOR = 2
FPS = 2.0
@@ -1,100 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional, Union
import numpy as np
from fastdeploy.utils import data_processor_logger
from .image_processor import ceil_by_factor, floor_by_factor
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
):
    """
    Sample frames from video according to specified criteria.

    Args:
        frame_factor: Ensure sampled frames are multiples of this factor
        min_frames: Minimum number of frames to sample
        max_frames: Maximum number of frames to sample
        metadata: Video metadata containing fps information; must provide
            "num_of_frame" (and "fps" when sampling by `fps`)
        fps: Target frames per second for sampling (-1 means "not given")
        num_frames: Exact number of frames to sample (-1 means "not given")

    Returns:
        np.ndarray: int32 indices of the frames to keep

    Raises:
        ValueError: If both fps and num_frames are specified,
            or if required metadata is missing,
            or if requested frames exceed available frames
    """
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    # NOTE(review): metadata is dereferenced here before the None check in the
    # fps branch below, so metadata=None raises TypeError — confirm intended.
    total_num_frames = metadata["num_of_frame"]
    # If num_frames is not given but fps is, calculate num_frames from fps
    if num_frames > 0:
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        if metadata is None:
            raise ValueError(
                "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
                "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
            )
        # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
        num_frames = total_num_frames / metadata["fps"] * fps
        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
        # Clamp to [min_frames, max_frames] and the real frame count, then
        # round down to a multiple of frame_factor.
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)
    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )
    # Hack code ensures that num_frames can always be divided by 4,
    # because sched/resource_manager_v1.py expands grid_thw as
    # grid_thw.extend([[2, h, w]] * (t // 2))
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4  # round down to a multiple of 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
    # Calculate frame indices based on sampling strategy
    if num_frames > 0:
        # Evenly spaced sampling for target frame count
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # Keep all frames if no sampling requested
        indices = np.arange(0, total_num_frames).astype(np.int32)
    return indices
+272
View File
@@ -0,0 +1,272 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared video utilities: VideoReaderWrapper, read_video_decord, and sample_frames."""
import io
import math
import os
from tempfile import NamedTemporaryFile as ntf
from typing import Optional, Union
import numpy as np
from fastdeploy.input.image_processors.common import ceil_by_factor, floor_by_factor
from fastdeploy.utils import data_processor_logger
# Public API of this module — star-imports and API docs pick these up.
__all__ = [
    "VideoReaderWrapper",
    "read_video_decord",
    "sample_frames",
    "sample_frames_qwen",
    "sample_frames_paddleocr",
]
# ---------------------------------------------------------------------------
# VideoReaderWrapper
# ---------------------------------------------------------------------------
def _is_gif(data: bytes) -> bool:
"""Check if bytes represent a GIF based on magic header."""
return data[:6] in (b"GIF87a", b"GIF89a")
class VideoReaderWrapper:
    """decord.VideoReader wrapper that fixes a memory leak and adds GIF support.

    Reference: https://github.com/dmlc/decord/issues/208
    """

    def __init__(self, video_path, *args, **kwargs):
        # video_path may be a filesystem path (str), raw bytes, or io.BytesIO.
        # decord/moviepy are imported lazily so importing this module does not
        # require these heavy optional dependencies.
        import decord

        try:
            # moviepy 1.0
            import moviepy.editor as mp
        except Exception:
            # moviepy 2.0
            import moviepy as mp

        # decord cannot read GIFs directly, so GIF input (detected by file
        # suffix or magic header) is first converted to a temporary mp4.
        with ntf(delete=True, suffix=".gif") as gif_file:
            gif_input = None
            self.original_file = None  # only set when we create a temp file
            if isinstance(video_path, str):
                if video_path.lower().endswith(".gif"):
                    gif_input = video_path
            elif isinstance(video_path, bytes):
                if _is_gif(video_path):
                    gif_file.write(video_path)
                    gif_file.flush()
                    gif_input = gif_file.name
            elif isinstance(video_path, io.BytesIO):
                # Peek at the buffer without disturbing the caller's position.
                video_path.seek(0)
                tmp_bytes = video_path.read()
                video_path.seek(0)
                if _is_gif(tmp_bytes):
                    gif_file.write(tmp_bytes)
                    gif_file.flush()
                    gif_input = gif_file.name
            if gif_input is not None:
                clip = mp.VideoFileClip(gif_input)
                mp4_file = ntf(delete=False, suffix=".mp4")
                mp4_path = mp4_file.name
                mp4_file.close()  # close before moviepy writes
                clip.write_videofile(mp4_path, verbose=False, logger=None)
                clip.close()
                video_path = mp4_path
                self.original_file = video_path  # temp mp4, cleaned up in __del__
            self._reader = decord.VideoReader(video_path, *args, **kwargs)
            self._reader.seek(0)

    def __len__(self):
        # Number of frames in the video.
        return len(self._reader)

    def __getitem__(self, key):
        # Reset the internal cursor after every read; leaving it mid-stream is
        # what triggers the decord memory leak (see class docstring).
        frames = self._reader[key]
        self._reader.seek(0)
        return frames

    def get_avg_fps(self):
        """Average frames-per-second reported by the underlying reader."""
        return self._reader.get_avg_fps()

    def seek(self, pos):
        """Move the underlying reader's cursor to frame ``pos``."""
        return self._reader.seek(pos)

    def __del__(self):
        # Best-effort removal of the temporary mp4 produced from a GIF input.
        # getattr guards against __init__ failing before the attribute is set.
        original_file = getattr(self, "original_file", None)
        if original_file:
            try:
                os.remove(original_file)
            except OSError:
                pass
# ---------------------------------------------------------------------------
# read_video_decord
# ---------------------------------------------------------------------------
def read_video_decord(video_path, save_to_disk: bool = False):
    """Open a video and return ``(video_reader, video_meta, video_path)``.

    ``video_meta`` carries "fps", "duration" (seconds) and "num_of_frame".
    ``save_to_disk`` is accepted for interface compatibility.
    """
    if isinstance(video_path, VideoReaderWrapper):
        # Already wrapped: reuse it as-is.
        video_reader = video_path
    else:
        if isinstance(video_path, bytes):
            # decord reads file-like objects, not raw bytes.
            video_path = io.BytesIO(video_path)
        video_reader = VideoReaderWrapper(video_path, num_threads=1)
    frame_count = len(video_reader)
    avg_fps = video_reader.get_avg_fps()
    video_meta = {
        "fps": avg_fps,
        "duration": frame_count / float(avg_fps),
        "num_of_frame": frame_count,
    }
    return video_reader, video_meta, video_path
# ---------------------------------------------------------------------------
# sample_frames — qwen_vl variant
# ---------------------------------------------------------------------------
def sample_frames_qwen(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
) -> np.ndarray:
    """Pick frame indices for the qwen_vl pipeline.

    Sentinel defaults are -1 (meaning "unset"). With ``num_frames`` the count
    is rounded to a multiple of ``frame_factor``; with ``fps`` it is derived
    from the source frame rate and clamped into ``[min_frames, max_frames]``
    (both snapped to ``frame_factor``). Counts above 2 are additionally
    snapped down to a multiple of 4. Returns an int32 index array; all frames
    when neither argument is set.
    """
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    if metadata is None:
        raise ValueError("metadata is required for sample_frames_qwen")

    total_num_frames = metadata["num_of_frame"]
    if num_frames > 0:
        # Snap the explicit request to the nearest multiple of frame_factor.
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
        num_frames = total_num_frames / metadata["fps"] * fps
        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
        num_frames = min(max(num_frames, min_frames), max_frames, total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    # num_frames must be divisible by 4
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(max(num_frames, min_frames), max_frames, total_num_frames)

    if num_frames <= 0:
        # Neither fps nor num_frames requested: keep every frame.
        return np.arange(0, total_num_frames).astype(np.int32)
    # Evenly spaced sampling across the (possibly truncated) frame range.
    return np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
# ---------------------------------------------------------------------------
# sample_frames — paddleocr_vl / ernie4_5_vl variant
# ---------------------------------------------------------------------------
def sample_frames_paddleocr(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
) -> np.ndarray:
    """Pick frame indices for the paddleocr_vl / ernie4_5_vl pipeline.

    ``fps`` and ``num_frames`` default to None (unset) and are mutually
    exclusive. With ``num_frames`` the count is rounded to a multiple of
    ``frame_factor``; with ``fps`` it is derived from the source frame rate
    and clamped into ``[min_frames, max_frames]``. Unlike the qwen variant
    there is no divisible-by-4 correction. Returns an int32 index array;
    all frames when neither argument is set.
    """
    fps = fps or 0
    num_frames = num_frames or 0
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    if metadata is None:
        raise ValueError("metadata is required for sample_frames_paddleocr")

    total_num_frames = metadata["num_of_frame"]
    if num_frames > 0:
        # Snap the explicit request to the nearest multiple of frame_factor.
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        # Clamp the fps-derived count and snap it down to frame_factor.
        max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        num_frames = total_num_frames / metadata["fps"] * fps
        num_frames = min(max(num_frames, min_frames), max_frames, total_num_frames)
        num_frames = math.floor(num_frames / frame_factor) * frame_factor

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    if num_frames <= 0:
        # Neither fps nor num_frames requested: keep every frame.
        return np.arange(0, total_num_frames).astype(np.int32)
    step = total_num_frames / num_frames
    return np.arange(0, total_num_frames, step).astype(np.int32)
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
    variant: str = "paddleocr",
) -> np.ndarray:
    """Route to the variant-specific sampler ('paddleocr' by default)."""
    if variant == "paddleocr":
        return sample_frames_paddleocr(frame_factor, min_frames, max_frames, metadata, fps, num_frames)
    if variant == "qwen":
        # The qwen variant uses -1 sentinels rather than None.
        qwen_fps = -1 if fps is None else fps
        qwen_num_frames = -1 if num_frames is None else num_frames
        return sample_frames_qwen(frame_factor, min_frames, max_frames, metadata, qwen_fps, qwen_num_frames)
    raise ValueError(f"Unknown variant {variant!r}. Expected 'paddleocr' or 'qwen'.")
@@ -22,14 +22,16 @@ from PIL import Image
from fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive import (
AdaptiveImageProcessor,
make_batched_images,
make_batched_videos,
)
from fastdeploy.input.image_processors.common import (
ceil_by_factor,
floor_by_factor,
is_scaled_image,
make_batched_images,
make_batched_videos,
round_by_factor,
smart_resize,
)
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
class TestImagePreprocessorAdaptive(unittest.TestCase):
+3 -4
View File
@@ -30,7 +30,7 @@ from fastdeploy.input.paddleocr_vl_processor.paddleocr_vl_processor import (
PaddleOCRVLProcessor,
)
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
from fastdeploy.input.paddleocr_vl_processor.process_video import sample_frames
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
MODULE_PATH = "fastdeploy.input.paddleocr_vl_processor.process"
@@ -86,7 +86,7 @@ class TestProcessVideo(unittest.TestCase):
def test_error_fps_without_metadata(self):
"""新增:测试 fps > 0 但 metadata 为 None"""
with self.assertRaises(TypeError) as context:
with self.assertRaises(ValueError) as context:
sample_frames(
frame_factor=self.frame_factor,
min_frames=self.min_frames,
@@ -95,8 +95,7 @@ class TestProcessVideo(unittest.TestCase):
fps=10,
metadata=None, # 缺失
)
# 验证是预期的 TypeError
self.assertIn("'NoneType' object is not subscriptable", str(context.exception))
self.assertIn("metadata is required", str(context.exception))
def test_num_frames_rounding(self):
"""新增:测试 num_frames 向 frame_factor 舍入"""
+1 -1
View File
@@ -21,7 +21,7 @@ import numpy as np
from PIL import Image
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
from fastdeploy.input.qwen_vl_processor.process_video import sample_frames
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
def mock_pil_image(height, width):
+365
View File
@@ -0,0 +1,365 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import unittest
from unittest.mock import MagicMock, patch
import numpy as np
from fastdeploy.input.video_utils import (
_is_gif,
read_video_decord,
sample_frames,
sample_frames_paddleocr,
sample_frames_qwen,
)
# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
# Byte fixtures: the two 6-byte GIF magic headers (plus padding) and one
# same-length blob that must not be detected as a GIF.
GIF87_HEADER = b"GIF87a" + b"\x00" * 10
GIF89_HEADER = b"GIF89a" + b"\x00" * 10
NOT_GIF = b"NOTGIF" + b"\x00" * 10
def _make_mock_reader(num_frames=100, fps=25.0):
"""Return a mock that mimics decord.VideoReader."""
reader = MagicMock()
reader.__len__ = MagicMock(return_value=num_frames)
reader.get_avg_fps = MagicMock(return_value=fps)
reader.seek = MagicMock(return_value=None)
frame = MagicMock()
frame.asnumpy = MagicMock(return_value=np.zeros((480, 640, 3), dtype=np.uint8))
reader.__getitem__ = MagicMock(return_value=frame)
return reader
# ---------------------------------------------------------------------------
# _is_gif
# ---------------------------------------------------------------------------
class TestIsGif(unittest.TestCase):
    """GIF magic-header detection on raw byte blobs."""

    def test_gif87a(self):
        self.assertIs(_is_gif(GIF87_HEADER), True)

    def test_gif89a(self):
        self.assertIs(_is_gif(GIF89_HEADER), True)

    def test_not_gif(self):
        self.assertIs(_is_gif(NOT_GIF), False)

    def test_short_bytes(self):
        # Fewer than 6 bytes can never match a 6-byte magic header.
        self.assertIs(_is_gif(b"GIF"), False)
# ---------------------------------------------------------------------------
# VideoReaderWrapper (mock decord + moviepy)
# ---------------------------------------------------------------------------
class TestVideoReaderWrapper(unittest.TestCase):
    """VideoReaderWrapper behaviour with decord and moviepy mocked out."""

    def _make_wrapper(self, video_path, mock_reader=None):
        """Construct a VideoReaderWrapper with decord mocked out."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        if mock_reader is None:
            mock_reader = _make_mock_reader()
        mock_decord = MagicMock()
        mock_decord.VideoReader.return_value = mock_reader
        # Patch sys.modules so the lazy `import decord` / `import moviepy`
        # inside __init__ resolve to mocks instead of the real libraries.
        with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
            wrapper = VideoReaderWrapper(video_path)
        wrapper._reader = mock_reader
        return wrapper

    def test_len(self):
        reader = _make_mock_reader(num_frames=42)
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        self.assertEqual(len(wrapper), 42)

    def test_getitem_resets_seek(self):
        # Indexing must rewind the underlying reader (decord leak workaround).
        reader = _make_mock_reader()
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        _ = wrapper[0]
        reader.seek.assert_called_with(0)

    def test_get_avg_fps(self):
        reader = _make_mock_reader(fps=30.0)
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        self.assertEqual(wrapper.get_avg_fps(), 30.0)

    def test_seek(self):
        reader = _make_mock_reader()
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        wrapper.seek(5)
        reader.seek.assert_called_with(5)

    def test_del_no_original_file(self):
        """__del__ should be a no-op when original_file is None."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        # object.__new__ bypasses __init__ (and its decord/moviepy imports).
        wrapper = object.__new__(VideoReaderWrapper)
        wrapper.original_file = None
        wrapper._reader = _make_mock_reader()
        # Should not raise
        wrapper.__del__()

    def test_del_removes_temp_file(self):
        """__del__ removes the file only when original_file is set."""
        import os
        import tempfile

        from fastdeploy.input.video_utils import VideoReaderWrapper

        with tempfile.NamedTemporaryFile(delete=False) as f:
            tmp_path = f.name
        wrapper = object.__new__(VideoReaderWrapper)
        wrapper.original_file = tmp_path
        wrapper._reader = _make_mock_reader()
        wrapper.__del__()
        self.assertFalse(os.path.exists(tmp_path))

    def test_non_gif_string_path_does_not_set_original_file(self):
        """Passing a non-GIF string path must NOT set original_file (bug fix)."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        mock_reader = _make_mock_reader()
        mock_decord = MagicMock()
        mock_decord.VideoReader.return_value = mock_reader
        with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
            wrapper = VideoReaderWrapper("/fake/video.mp4")
        self.assertIsNone(wrapper.original_file)

    def test_bytesio_non_gif_path_does_not_set_original_file(self):
        """Passing a BytesIO that is NOT a GIF must not set original_file."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        mock_reader = _make_mock_reader()
        mock_decord = MagicMock()
        mock_decord.VideoReader.return_value = mock_reader
        bio = io.BytesIO(NOT_GIF)
        with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
            wrapper = VideoReaderWrapper(bio)
        self.assertIsNone(wrapper.original_file)
# ---------------------------------------------------------------------------
# read_video_decord
# ---------------------------------------------------------------------------
class TestReadVideoDecord(unittest.TestCase):
    """read_video_decord: input-type handling and metadata computation."""

    def _patch_wrapper(self, num_frames=100, fps=25.0):
        """Return a context manager that replaces VideoReaderWrapper with a mock."""
        from fastdeploy.input import video_utils

        mock_wrapper = MagicMock()
        mock_wrapper.__len__ = MagicMock(return_value=num_frames)
        mock_wrapper.get_avg_fps = MagicMock(return_value=fps)
        return patch.object(video_utils, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper

    def test_existing_wrapper_passthrough(self):
        """Already-wrapped reader is returned as-is."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        # spec= makes the isinstance check in read_video_decord succeed.
        mock_wrapper = MagicMock(spec=VideoReaderWrapper)
        mock_wrapper.__len__ = MagicMock(return_value=50)
        mock_wrapper.get_avg_fps = MagicMock(return_value=10.0)
        reader, meta, path = read_video_decord(mock_wrapper)
        self.assertIs(reader, mock_wrapper)
        self.assertEqual(meta["num_of_frame"], 50)
        self.assertAlmostEqual(meta["fps"], 10.0)
        self.assertAlmostEqual(meta["duration"], 5.0)  # 50 frames / 10 fps

    def test_bytes_input_converted_to_bytesio(self):
        """bytes input is converted to BytesIO before creating VideoReaderWrapper."""
        from fastdeploy.input import video_utils

        captured = []  # records the path argument the fake wrapper receives

        class FakeWrapper:
            def __init__(self, path, *args, **kwargs):
                captured.append(path)

            def __len__(self):
                return 30

            def get_avg_fps(self):
                return 10.0

        with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
            reader, meta, path = read_video_decord(b"fake_video_bytes")
        self.assertIsInstance(captured[0], io.BytesIO)

    def test_string_path_input(self):
        """String path is passed through to VideoReaderWrapper."""
        from fastdeploy.input import video_utils

        class FakeWrapper:
            def __init__(self, path, *args, **kwargs):
                pass

            def __len__(self):
                return 60

            def get_avg_fps(self):
                return 30.0

        with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
            reader, meta, path = read_video_decord("/fake/path.mp4")
        self.assertEqual(meta["num_of_frame"], 60)
        self.assertAlmostEqual(meta["duration"], 2.0)  # 60 frames / 30 fps
        self.assertEqual(path, "/fake/path.mp4")
# ---------------------------------------------------------------------------
# sample_frames_qwen
# ---------------------------------------------------------------------------
class TestSampleFramesQwen(unittest.TestCase):
    """qwen_vl frame sampling: -1 sentinels and divisible-by-4 snapping."""

    META = {"num_of_frame": 100, "fps": 25.0}

    def test_num_frames_basic(self):
        out = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
        self.assertEqual(out.size, 8)

    def test_fps_basic(self):
        out = sample_frames_qwen(2, 4, 100, self.META, fps=2.0)
        self.assertTrue(out.size > 0)
        self.assertEqual(out.size % 2, 0)

    def test_fps_and_num_frames_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_qwen(2, 4, 100, self.META, fps=2.0, num_frames=10)

    def test_num_frames_exceeds_total_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_qwen(2, 4, 100, self.META, num_frames=200)

    def test_fps_warning_when_nframes_exceeds_total(self):
        """fps so high that computed num_frames > total → warning logged."""
        short_meta = {"num_of_frame": 10, "fps": 1.0}
        with self.assertLogs(level="WARNING"):
            sample_frames_qwen(2, 4, 100, short_meta, fps=100.0)

    def test_divisible_by_4_correction(self):
        """Result must be divisible by 4 when num_frames > 2."""
        out = sample_frames_qwen(2, 4, 100, self.META, fps=1.5)
        if out.size > 2:
            self.assertEqual(out.size % 4, 0)

    def test_no_sampling_returns_all_frames(self):
        """Both fps and num_frames at sentinel → return all frames."""
        out = sample_frames_qwen(2, 4, 100, self.META)
        self.assertEqual(out.size, 100)

    def test_indices_dtype(self):
        out = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
        self.assertEqual(out.dtype, np.int32)
# ---------------------------------------------------------------------------
# sample_frames_paddleocr
# ---------------------------------------------------------------------------
class TestSampleFramesPaddleocr(unittest.TestCase):
    """paddleocr_vl frame sampling: None sentinels, no %4 snapping."""

    META = {"num_of_frame": 100, "fps": 25.0}

    def test_num_frames_basic(self):
        out = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=10)
        self.assertEqual(out.size, 10)

    def test_fps_basic(self):
        out = sample_frames_paddleocr(1, 4, 100, self.META, fps=2.0)
        self.assertTrue(out.size > 0)

    def test_fps_and_num_frames_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_paddleocr(1, 4, 100, self.META, fps=2.0, num_frames=10)

    def test_num_frames_exceeds_total_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_paddleocr(1, 4, 100, self.META, num_frames=200)

    def test_none_sentinels_no_sampling(self):
        """fps=None, num_frames=None → return all frames."""
        out = sample_frames_paddleocr(1, 4, 100, self.META)
        self.assertEqual(out.size, 100)

    def test_no_4_correction(self):
        """paddleocr variant does NOT apply %4 correction."""
        # 6 frames is not divisible by 4; paddleocr should keep it
        meta = {"num_of_frame": 100, "fps": 25.0}
        out = sample_frames_paddleocr(1, 1, 100, meta, num_frames=6)
        self.assertEqual(out.size, 6)

    def test_indices_dtype(self):
        out = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=8)
        self.assertEqual(out.dtype, np.int32)
# ---------------------------------------------------------------------------
# sample_frames dispatcher
# ---------------------------------------------------------------------------
class TestSampleFramesDispatcher(unittest.TestCase):
    """sample_frames(): variant routing and sentinel conversion."""

    META = {"num_of_frame": 100, "fps": 25.0}

    def test_default_variant_is_paddleocr(self):
        # wraps= keeps the real behaviour while recording the call.
        with patch("fastdeploy.input.video_utils.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
            sample_frames(1, 4, 100, self.META, num_frames=8)
            mock_fn.assert_called_once()

    def test_qwen_variant_dispatched(self):
        with patch("fastdeploy.input.video_utils.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
            sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
            mock_fn.assert_called_once()

    def test_qwen_none_fps_converted_to_sentinel(self):
        """None fps/num_frames → converted to -1 before calling sample_frames_qwen."""
        with patch("fastdeploy.input.video_utils.sample_frames_qwen", return_value=np.array([])) as mock_fn:
            sample_frames(2, 4, 100, self.META, fps=None, num_frames=None, variant="qwen")
            # NOTE(review): assumes the dispatcher passes fps/num_frames
            # positionally — confirm against the dispatcher's call site.
            args = mock_fn.call_args[0]
            self.assertEqual(args[4], -1)  # fps sentinel
            self.assertEqual(args[5], -1)  # num_frames sentinel

    def test_paddleocr_variant_result_consistent(self):
        # Dispatcher output must match a direct paddleocr call.
        direct = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=8)
        via_dispatcher = sample_frames(1, 4, 100, self.META, num_frames=8, variant="paddleocr")
        np.testing.assert_array_equal(direct, via_dispatcher)

    def test_qwen_variant_result_consistent(self):
        # Dispatcher output must match a direct qwen call.
        direct = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
        via_dispatcher = sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
        np.testing.assert_array_equal(direct, via_dispatcher)
# Allow running this test module directly: `python test_video_utils.py`.
if __name__ == "__main__":
    unittest.main()