mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Optimization] Deduplicate shared image/video utilities across VL processors (#6988)
* step1~3 * fix import path * 删除重复代码 * 删除重复代码 * 删除重复代码 * fix import path * update * fix import path * add unit test * fix * update * fix unit test
This commit is contained in:
+2
-78
@@ -16,7 +16,6 @@
|
||||
|
||||
"""image preprocessor adaptive"""
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
@@ -45,6 +44,8 @@ from paddleformers.transformers.image_utils import (
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image
|
||||
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
@@ -73,22 +74,9 @@ __all__ = [
|
||||
]
|
||||
|
||||
|
||||
def is_scaled_image(image: np.ndarray) -> bool:
    """
    Checks to see whether the pixel values have already been rescaled to [0, 1].
    """
    # uint8 images are by definition raw [0, 255] data.
    if image.dtype == np.uint8:
        return False

    # It's possible the image has pixel values in [0, 255] but is of floating type
    lowest = np.min(image)
    highest = np.max(image)
    return lowest >= 0 and highest <= 1
|
||||
|
||||
|
||||
def make_batched_images(images) -> List[List[ImageInput]]:
|
||||
"""
|
||||
Accepts images in list or nested list format, and makes a list of images for preprocessing.
|
||||
|
||||
Args:
|
||||
images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
|
||||
The input image.
|
||||
|
||||
@@ -521,67 +509,3 @@ class AdaptiveImageProcessor(BaseImageProcessor):
|
||||
}
|
||||
|
||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
||||
|
||||
|
||||
def round_by_factor(number: int, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    multiples = round(number / factor)
    return multiples * factor
|
||||
|
||||
|
||||
def ceil_by_factor(number: int, factor: int) -> int:
    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
    multiples = math.ceil(number / factor)
    return multiples * factor
|
||||
|
||||
|
||||
def floor_by_factor(number: int, factor: int) -> int:
    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
    multiples = math.floor(number / factor)
    return multiples * factor
|
||||
|
||||
|
||||
def smart_resize(
    height: int,
    width: int,
    factor: int = IMAGE_FACTOR,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
):
    """
    Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.

    Returns:
        tuple: (h_bar, w_bar), the adjusted (height, width).

    Raises:
        ValueError: if the adjusted area still falls outside
            ['min_pixels', 'max_pixels'].
    """
    # NOTE(review): an extreme aspect ratio is clipped (long side shortened to
    # fit MAX_RATIO) and logged, not rejected.
    if max(height, width) / min(height, width) > MAX_RATIO:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * MAX_RATIO, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * MAX_RATIO, factor)

        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\
            resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )

        height = new_height
        width = new_width

    # Snap both sides to the nearest multiple of `factor` (never below `factor`).
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too many pixels: scale both sides down uniformly, flooring so the
        # result stays at or under max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: scale both sides up uniformly, ceiling so the
        # result reaches min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)

    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")

    return h_bar, w_bar
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
@@ -0,0 +1,208 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Shared image utility functions for all VL image processors."""
|
||||
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
__all__ = [
|
||||
"round_by_factor",
|
||||
"ceil_by_factor",
|
||||
"floor_by_factor",
|
||||
"is_scaled_image",
|
||||
"smart_resize",
|
||||
"smart_resize_qwen",
|
||||
"smart_resize_paddleocr",
|
||||
]
|
||||
|
||||
|
||||
def round_by_factor(number: int, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    return factor * round(number / factor)
|
||||
|
||||
|
||||
def ceil_by_factor(number: int, factor: int) -> int:
    """Returns the smallest integer >= 'number' that is divisible by 'factor'."""
    return factor * math.ceil(number / factor)
|
||||
|
||||
|
||||
def floor_by_factor(number: int, factor: int) -> int:
    """Returns the largest integer <= 'number' that is divisible by 'factor'."""
    return factor * math.floor(number / factor)
|
||||
|
||||
|
||||
def is_scaled_image(image: np.ndarray) -> bool:
    """Check if image pixel values are already normalized to [0, 1] range.

    Args:
        image: Input image array.

    Returns:
        bool: True if image is already scaled to [0, 1].
    """
    # uint8 data is always raw [0, 255] pixels.
    if image.dtype == np.uint8:
        return False
    # It's possible the image has pixel values in [0, 255] but is of floating type
    smallest = np.min(image)
    largest = np.max(image)
    return smallest >= 0 and largest <= 1
|
||||
|
||||
|
||||
def smart_resize_qwen(
    height: int,
    width: int,
    factor: int,
    min_pixels: int,
    max_pixels: int,
    max_ratio: int = 200,
) -> tuple:
    """Smart image resizing for ERNIE / Qwen2.5 / Qwen3 models.

    Maintains aspect ratio and respects pixel constraints. When the aspect ratio
    exceeds max_ratio, the image is cropped (not raised as error) to fit within
    the ratio limit.

    Args:
        height: Original image height.
        width: Original image width.
        factor: Patch size factor; both output dimensions will be multiples of this.
        min_pixels: Minimum allowed total pixels.
        max_pixels: Maximum allowed total pixels.
        max_ratio: Maximum allowed aspect ratio (default 200).

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If calculated dimensions are still invalid after resizing.
    """
    if max(height, width) / min(height, width) > max_ratio:
        # Shorten the long side so the ratio fits, logging the adjustment.
        if width >= height:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * max_ratio, factor)
        else:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * max_ratio, factor)

        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {max_ratio}, "
            f"got {max(height, width) / min(height, width)}, "
            f"resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )
        height, width = new_height, new_width

    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    area = h_bar * w_bar
    if area > max_pixels:
        # Over budget: shrink both sides by the same ratio, flooring to factor.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif area < min_pixels:
        # Under budget: grow both sides by the same ratio, ceiling to factor.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)

    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")

    return h_bar, w_bar
|
||||
|
||||
|
||||
def smart_resize_paddleocr(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 28 * 28 * 130,
    max_pixels: int = 28 * 28 * 1280,
) -> tuple:
    """Smart image resizing for PaddleOCR-VL model.

    Similar to smart_resize_qwen but adds small-image protection: if height or
    width is smaller than factor, the image is scaled up to factor first. Also,
    when aspect ratio exceeds 200 this function raises ValueError (instead of
    silently cropping like the qwen variant).

    Args:
        height: Original image height.
        width: Original image width.
        factor: Patch size factor; both output dimensions will be multiples of this.
        min_pixels: Minimum allowed total pixels.
        max_pixels: Maximum allowed total pixels.

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If aspect ratio exceeds 200, or calculated dimensions are invalid.
    """
    # Small-image protection: scale the short side up to `factor`, keeping ratio.
    if height < factor:
        data_processor_logger.debug(f"smart_resize_paddleocr: height={height} < factor={factor}, reset height=factor")
        width = round((width * factor) / height)
        height = factor

    if width < factor:
        data_processor_logger.debug(f"smart_resize_paddleocr: width={width} < factor={factor}, reset width=factor")
        height = round((height * factor) / width)
        width = factor

    aspect = max(height, width) / min(height, width)
    if aspect > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, " f"got {max(height, width) / min(height, width)}"
        )

    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    area = h_bar * w_bar
    if area > max_pixels:
        # Over budget: shrink both sides uniformly, flooring to factor multiples.
        scale = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / scale / factor) * factor
        w_bar = math.floor(width / scale / factor) * factor
    elif area < min_pixels:
        # Under budget: grow both sides uniformly, ceiling to factor multiples.
        scale = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * scale / factor) * factor
        w_bar = math.ceil(width * scale / factor) * factor

    return h_bar, w_bar
|
||||
|
||||
|
||||
def smart_resize(
    height: int,
    width: int,
    factor: int,
    min_pixels: int,
    max_pixels: int,
    max_ratio: int = 200,
    variant: str = "qwen",
) -> tuple:
    """Unified smart_resize dispatcher.

    Args:
        height: Original image height.
        width: Original image width.
        factor: Patch size factor.
        min_pixels: Minimum allowed total pixels.
        max_pixels: Maximum allowed total pixels.
        max_ratio: Maximum allowed aspect ratio (only used by "qwen" variant).
        variant: Which algorithm variant to use.
            - "qwen" (default): for ERNIE / Qwen2.5 / Qwen3. Clips extreme ratios silently.
            - "paddleocr": for PaddleOCR-VL. Adds small-image protection, raises on bad ratio.

    Returns:
        tuple: (new_height, new_width)
    """
    # Any variant other than "paddleocr" falls through to the qwen algorithm.
    if variant != "paddleocr":
        return smart_resize_qwen(height, width, factor, min_pixels, max_pixels, max_ratio)
    return smart_resize_paddleocr(height, width, factor, min_pixels, max_pixels)
|
||||
@@ -19,8 +19,6 @@
|
||||
# TODO: Support videos
|
||||
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
@@ -34,6 +32,10 @@ from paddleformers.transformers.image_utils import (
|
||||
to_numpy_array,
|
||||
)
|
||||
|
||||
from fastdeploy.input.image_processors.common import (
|
||||
smart_resize_paddleocr as smart_resize,
|
||||
)
|
||||
|
||||
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||
|
||||
@@ -68,54 +70,6 @@ def adjust_size(size, patch_size):
|
||||
return num_patches * patch_size
|
||||
|
||||
|
||||
def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 28 * 28 * 130,
    max_pixels: int = 28 * 28 * 1280,
):
    """Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.

    """
    # Small-image protection: bring the short side up to `factor`, keeping ratio.
    if height < factor:
        logging.debug(f"smart_resize: height={height} < factor={factor}, reset height=factor")
        width = round((width * factor) / height)
        height = factor

    if width < factor:
        logging.debug(f"smart_resize: width={width} < factor={factor}, reset width=factor")
        height = round((height * factor) / width)
        width = factor

    aspect = max(height, width) / min(height, width)
    if aspect > 200:
        raise ValueError(
            f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
        )
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    area = h_bar * w_bar
    if area > max_pixels:
        # Over budget: shrink uniformly, flooring to factor multiples.
        scale = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / scale / factor) * factor
        w_bar = math.floor(width / scale / factor) * factor
    elif area < min_pixels:
        # Under budget: grow uniformly, ceiling to factor multiples.
        scale = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * scale / factor) * factor
        w_bar = math.ceil(width * scale / factor) * factor
    return h_bar, w_bar
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
|
||||
model_input_names = [
|
||||
"pixel_values",
|
||||
|
||||
@@ -26,14 +26,14 @@ from PIL import Image
|
||||
|
||||
from fastdeploy.engine.request import ImagePosition
|
||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
|
||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||
from fastdeploy.input.video_utils import read_video_decord
|
||||
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
|
||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
from .image_processor import ImageProcessor
|
||||
from .process_video import sample_frames
|
||||
|
||||
|
||||
class DataProcessor(MMBaseDataProcessor):
|
||||
|
||||
@@ -1,82 +0,0 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import math
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
):
    """
    Sample frames from video according to specified criteria.

    Args:
        frame_factor: Ensure sampled frames are multiples of this factor
        min_frames: Minimum number of frames to sample
        max_frames: Maximum number of frames to sample
        metadata: Video metadata containing fps information; must contain
            "num_of_frame" and, when sampling by ``fps``, the source "fps"
        fps: Target frames per second for sampling
        num_frames: Exact number of frames to sample

    Returns:
        np.ndarray: Indices of the sampled frames (dtype int32)

    Raises:
        ValueError: If both fps and num_frames are specified,
                    or if required metadata is missing,
                    or if requested frames exceed available frames
    """
    # Fix: the defaults are None, but every comparison below needs a number —
    # `None > 0` raises TypeError on Python 3. Normalize None to the -1 sentinel.
    fps = -1 if fps is None else fps
    num_frames = -1 if num_frames is None else num_frames

    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

    # Fix: metadata is dereferenced on every path, so validate it before
    # indexing instead of crashing with a TypeError (the original None-check
    # sat inside the fps branch, after the dereference, and was unreachable).
    if metadata is None:
        raise ValueError(
            "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
            "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
        )
    total_num_frames = metadata["num_of_frame"]

    # If num_frames is not given but fps is, calculate num_frames from fps
    if num_frames > 0:
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        # Clamp to [min_frames, max_frames] and to the number of real frames,
        # then snap down to a multiple of frame_factor.
        max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        num_frames = total_num_frames / metadata["fps"] * fps
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = math.floor(num_frames / frame_factor) * frame_factor

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    # Calculate frame indices based on sampling strategy
    if num_frames > 0:
        # Evenly spaced sampling for target frame count
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # Keep all frames if no sampling requested
        indices = np.arange(0, total_num_frames).astype(np.int32)

    return indices
|
||||
@@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
@@ -41,6 +40,7 @@ from paddleformers.transformers.image_utils import (
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
IMAGE_MEAN = [0.5, 0.5, 0.5]
|
||||
@@ -62,86 +62,6 @@ VideoInput = Union[
|
||||
]
|
||||
|
||||
|
||||
def round_by_factor(number: int, factor: int) -> int:
    # Nearest multiple of `factor` to `number`.
    return factor * round(number / factor)
|
||||
|
||||
|
||||
def ceil_by_factor(number: int, factor: int) -> int:
    # Smallest multiple of `factor` that is >= `number`.
    return factor * math.ceil(number / factor)
|
||||
|
||||
|
||||
def floor_by_factor(number: int, factor: int) -> int:
    # Largest multiple of `factor` that is <= `number`.
    return factor * math.floor(number / factor)
|
||||
|
||||
|
||||
def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200):
    """
    Smart image resizing that maintains aspect ratio and respects constraints.

    Args:
        height: Original image height
        width: Original image width
        factor: Patch size factor
        min_pixels: Minimum allowed pixels
        max_pixels: Maximum allowed pixels
        max_ratio: Maximum allowed aspect ratio

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If calculated dimensions are invalid
    """
    # NOTE(review): an extreme aspect ratio is clipped (long side shortened to
    # fit max_ratio) and logged, not rejected.
    if max(height, width) / min(height, width) > max_ratio:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * max_ratio, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * max_ratio, factor)

        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\
            resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )

        height = new_height
        width = new_width

    # Snap both sides to the nearest multiple of `factor` (never below `factor`).
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Over budget: shrink both sides uniformly, flooring to factor.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Under budget: grow both sides uniformly, ceiling to factor.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)

    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")

    return h_bar, w_bar
|
||||
|
||||
|
||||
def is_scaled_image(image: np.ndarray) -> bool:
    """
    Check if image pixel values are already normalized to [0, 1] range.

    Args:
        image: Input image array

    Returns:
        bool: True if image is already scaled
    """
    # uint8 pixel data is always raw [0, 255].
    if image.dtype == np.uint8:
        return False

    # It's possible the image has pixel values in [0, 255] but is of floating type
    low = np.min(image)
    high = np.max(image)
    return low >= 0 and high <= 1
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||
|
||||
@@ -26,13 +26,14 @@ from PIL import Image
|
||||
|
||||
from fastdeploy.engine.request import ImagePosition
|
||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
|
||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||
from fastdeploy.input.video_utils import read_video_decord
|
||||
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
|
||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
from .image_processor import ImageProcessor, ceil_by_factor, floor_by_factor
|
||||
from .image_processor import ImageProcessor
|
||||
|
||||
VIDEO_MIN_PIXELS = 128 * 28 * 28
|
||||
VIDEO_MAX_PIXELS = 768 * 28 * 28
|
||||
@@ -42,83 +43,6 @@ FPS_MIN_FRAMES = 4
|
||||
FPS_MAX_FRAMES = 768
|
||||
|
||||
|
||||
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
):
    """
    Sample frames from video according to specified criteria.

    Args:
        frame_factor: Ensure sampled frames are multiples of this factor
        min_frames: Minimum number of frames to sample
        max_frames: Maximum number of frames to sample
        metadata: Video metadata; must contain "num_of_frame" and, when
            sampling by ``fps``, the source "fps"
        fps: Target frames per second for sampling (-1 means unspecified)
        num_frames: Exact number of frames to sample (-1 means unspecified)

    Returns:
        np.ndarray: Indices of the sampled frames (dtype int32)

    Raises:
        ValueError: If both fps and num_frames are specified,
                    or if required metadata is missing,
                    or if requested frames exceed available frames
    """
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

    # Fix: metadata is indexed on every path, so validate it before the
    # dereference. The original None-check sat inside the fps branch, after
    # `metadata["num_of_frame"]` had already run, making it unreachable
    # (callers saw a TypeError instead of the intended ValueError).
    if metadata is None:
        raise ValueError(
            "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
            "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
        )
    total_num_frames = metadata["num_of_frame"]

    # If num_frames is not given but fps is, calculate num_frames from fps
    if num_frames > 0:
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)

        num_frames = total_num_frames / metadata["fps"] * fps

        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")

        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    # HACK: ensure num_frames is always divisible by 4, because
    # sched/resource_manager_v1.py expands grid_thw as [[2, h, w]] * (t // 2)
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4  # round down to a multiple of 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)

    # Calculate frame indices based on sampling strategy
    if num_frames > 0:
        # Evenly spaced sampling for target frame count
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # Keep all frames if no sampling requested
        indices = np.arange(0, total_num_frames).astype(np.int32)

    return indices
|
||||
|
||||
|
||||
class DataProcessor(MMBaseDataProcessor):
|
||||
"""
|
||||
Processes multimodal inputs (text, images, videos) into model-ready formats.
|
||||
|
||||
@@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
@@ -41,6 +40,7 @@ from paddleformers.transformers.image_utils import (
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
@@ -62,116 +62,6 @@ VideoInput = Union[
|
||||
]
|
||||
|
||||
|
||||
def round_by_factor(number: int, factor: int) -> int:
    """
    Round number to nearest multiple of factor.

    Args:
        number: Input number to round
        factor: Rounding factor

    Returns:
        int: Rounded number
    """
    multiples = round(number / factor)
    return multiples * factor
|
||||
|
||||
|
||||
def ceil_by_factor(number: int, factor: int) -> int:
    """
    Round number up to nearest multiple of factor.

    Args:
        number: Input number to round
        factor: Rounding factor

    Returns:
        int: Rounded number
    """
    multiples = math.ceil(number / factor)
    return multiples * factor
|
||||
|
||||
|
||||
def floor_by_factor(number: int, factor: int) -> int:
    """
    Round number down to nearest multiple of factor.

    Args:
        number: Input number to round
        factor: Rounding factor

    Returns:
        int: Rounded number
    """
    multiples = math.floor(number / factor)
    return multiples * factor
|
||||
|
||||
|
||||
def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200):
    """
    Smart image resizing that maintains aspect ratio and respects constraints.

    Args:
        height: Original image height
        width: Original image width
        factor: Patch size factor
        min_pixels: Minimum allowed pixels
        max_pixels: Maximum allowed pixels
        max_ratio: Maximum allowed aspect ratio

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If calculated dimensions are invalid
    """
    # NOTE(review): an extreme aspect ratio is clipped (long side shortened to
    # fit max_ratio) and logged, not rejected.
    if max(height, width) / min(height, width) > max_ratio:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * max_ratio, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * max_ratio, factor)

        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\
            resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )

        height = new_height
        width = new_width

    # Snap both sides to the nearest multiple of `factor` (never below `factor`).
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Over budget: shrink both sides uniformly, flooring to factor.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Under budget: grow both sides uniformly, ceiling to factor.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)

    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")

    return h_bar, w_bar
|
||||
|
||||
|
||||
def is_scaled_image(image: np.ndarray) -> bool:
|
||||
"""
|
||||
Check if image pixel values are already normalized to [0, 1] range.
|
||||
|
||||
Args:
|
||||
image: Input image array
|
||||
|
||||
Returns:
|
||||
bool: True if image is already scaled
|
||||
"""
|
||||
if image.dtype == np.uint8:
|
||||
return False
|
||||
|
||||
# It's possible the image has pixel values in [0, 255] but is of floating type
|
||||
return np.min(image) >= 0 and np.max(image) <= 1
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||
|
||||
@@ -26,14 +26,14 @@ from PIL import Image
|
||||
|
||||
from fastdeploy.engine.request import ImagePosition
|
||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
|
||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||
from fastdeploy.input.video_utils import read_video_decord
|
||||
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
|
||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
from .image_processor import ImageProcessor
|
||||
from .process_video import sample_frames
|
||||
|
||||
FRAME_FACTOR = 2
|
||||
FPS = 2.0
|
||||
|
||||
@@ -1,100 +0,0 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
from .image_processor import ceil_by_factor, floor_by_factor
|
||||
|
||||
|
||||
def sample_frames(
|
||||
frame_factor: int,
|
||||
min_frames: int,
|
||||
max_frames: int,
|
||||
metadata: Optional[dict] = None,
|
||||
fps: Optional[Union[int, float]] = -1,
|
||||
num_frames: Optional[int] = -1,
|
||||
):
|
||||
"""
|
||||
Sample frames from video according to specified criteria.
|
||||
|
||||
Args:
|
||||
frame_factor: Ensure sampled frames are multiples of this factor
|
||||
min_frames: Minimum number of frames to sample
|
||||
max_frames: Maximum number of frames to sample
|
||||
metadata: Video metadata containing fps information
|
||||
fps: Target frames per second for sampling
|
||||
num_frames: Exact number of frames to sample
|
||||
|
||||
Returns:
|
||||
np.ndarray: Sampled video frames
|
||||
|
||||
Raises:
|
||||
ValueError: If both fps and num_frames are specified,
|
||||
or if required metadata is missing,
|
||||
or if requested frames exceed available frames
|
||||
"""
|
||||
if fps > 0 and num_frames > 0:
|
||||
raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
|
||||
|
||||
total_num_frames = metadata["num_of_frame"]
|
||||
|
||||
# If num_frames is not given but fps is, calculate num_frames from fps
|
||||
if num_frames > 0:
|
||||
num_frames = round(num_frames / frame_factor) * frame_factor
|
||||
elif fps > 0:
|
||||
if metadata is None:
|
||||
raise ValueError(
|
||||
"Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
|
||||
"Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
|
||||
)
|
||||
# max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
|
||||
min_frames = ceil_by_factor(min_frames, frame_factor)
|
||||
max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
|
||||
|
||||
num_frames = total_num_frames / metadata["fps"] * fps
|
||||
|
||||
if num_frames > total_num_frames:
|
||||
data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
|
||||
|
||||
num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
|
||||
num_frames = floor_by_factor(num_frames, frame_factor)
|
||||
|
||||
if num_frames > total_num_frames:
|
||||
raise ValueError(
|
||||
f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
|
||||
"Decrease `num_frames` or `fps` for sampling."
|
||||
)
|
||||
|
||||
# Hack code ensures that num_frames can always be divided by 4
|
||||
# due to sched/resource_manager_v1.py 中 grid_thw.extend([[2, h, w]] * (t // 2))
|
||||
if num_frames > 2 and num_frames % 4 != 0:
|
||||
num_frames = (num_frames // 4) * 4 # 向下取整到 4 的倍数
|
||||
total_num_frames = (total_num_frames // 4) * 4
|
||||
num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
|
||||
|
||||
# Calculate frame indices based on sampling strategy
|
||||
if num_frames > 0:
|
||||
# Evenly spaced sampling for target frame count
|
||||
indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
|
||||
else:
|
||||
# Keep all frames if no sampling requested
|
||||
indices = np.arange(0, total_num_frames).astype(np.int32)
|
||||
|
||||
return indices
|
||||
@@ -0,0 +1,272 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Shared video utilities: VideoReaderWrapper, read_video_decord, and sample_frames."""
|
||||
|
||||
import io
|
||||
import math
|
||||
import os
|
||||
from tempfile import NamedTemporaryFile as ntf
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from fastdeploy.input.image_processors.common import ceil_by_factor, floor_by_factor
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
__all__ = [
|
||||
"VideoReaderWrapper",
|
||||
"read_video_decord",
|
||||
"sample_frames",
|
||||
"sample_frames_qwen",
|
||||
"sample_frames_paddleocr",
|
||||
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# VideoReaderWrapper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _is_gif(data: bytes) -> bool:
|
||||
"""Check if bytes represent a GIF based on magic header."""
|
||||
return data[:6] in (b"GIF87a", b"GIF89a")
|
||||
|
||||
|
||||
class VideoReaderWrapper:
|
||||
"""decord.VideoReader wrapper that fixes a memory leak and adds GIF support.
|
||||
|
||||
Reference: https://github.com/dmlc/decord/issues/208
|
||||
"""
|
||||
|
||||
def __init__(self, video_path, *args, **kwargs):
|
||||
import decord
|
||||
|
||||
try:
|
||||
# moviepy 1.0
|
||||
import moviepy.editor as mp
|
||||
except Exception:
|
||||
# moviepy 2.0
|
||||
import moviepy as mp
|
||||
|
||||
with ntf(delete=True, suffix=".gif") as gif_file:
|
||||
gif_input = None
|
||||
self.original_file = None # only set when we create a temp file
|
||||
|
||||
if isinstance(video_path, str):
|
||||
if video_path.lower().endswith(".gif"):
|
||||
gif_input = video_path
|
||||
elif isinstance(video_path, bytes):
|
||||
if _is_gif(video_path):
|
||||
gif_file.write(video_path)
|
||||
gif_file.flush()
|
||||
gif_input = gif_file.name
|
||||
elif isinstance(video_path, io.BytesIO):
|
||||
video_path.seek(0)
|
||||
tmp_bytes = video_path.read()
|
||||
video_path.seek(0)
|
||||
if _is_gif(tmp_bytes):
|
||||
gif_file.write(tmp_bytes)
|
||||
gif_file.flush()
|
||||
gif_input = gif_file.name
|
||||
|
||||
if gif_input is not None:
|
||||
clip = mp.VideoFileClip(gif_input)
|
||||
mp4_file = ntf(delete=False, suffix=".mp4")
|
||||
mp4_path = mp4_file.name
|
||||
mp4_file.close() # close before moviepy writes
|
||||
clip.write_videofile(mp4_path, verbose=False, logger=None)
|
||||
clip.close()
|
||||
video_path = mp4_path
|
||||
self.original_file = video_path # temp mp4, cleaned up in __del__
|
||||
|
||||
self._reader = decord.VideoReader(video_path, *args, **kwargs)
|
||||
self._reader.seek(0)
|
||||
|
||||
def __len__(self):
|
||||
return len(self._reader)
|
||||
|
||||
def __getitem__(self, key):
|
||||
frames = self._reader[key]
|
||||
self._reader.seek(0)
|
||||
return frames
|
||||
|
||||
def get_avg_fps(self):
|
||||
return self._reader.get_avg_fps()
|
||||
|
||||
def seek(self, pos):
|
||||
return self._reader.seek(pos)
|
||||
|
||||
def __del__(self):
|
||||
original_file = getattr(self, "original_file", None)
|
||||
if original_file:
|
||||
try:
|
||||
os.remove(original_file)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# read_video_decord
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def read_video_decord(video_path, save_to_disk: bool = False):
|
||||
"""Load a video file and return (video_reader, video_meta, video_path).
|
||||
|
||||
video_meta contains keys: "fps", "duration", "num_of_frame".
|
||||
"""
|
||||
if isinstance(video_path, VideoReaderWrapper):
|
||||
video_reader = video_path
|
||||
else:
|
||||
if isinstance(video_path, bytes):
|
||||
video_path = io.BytesIO(video_path)
|
||||
video_reader = VideoReaderWrapper(video_path, num_threads=1)
|
||||
|
||||
vlen = len(video_reader)
|
||||
fps = video_reader.get_avg_fps()
|
||||
duration = vlen / float(fps)
|
||||
|
||||
video_meta = {"fps": fps, "duration": duration, "num_of_frame": vlen}
|
||||
return video_reader, video_meta, video_path
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# sample_frames — qwen_vl variant
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def sample_frames_qwen(
|
||||
frame_factor: int,
|
||||
min_frames: int,
|
||||
max_frames: int,
|
||||
metadata: Optional[dict] = None,
|
||||
fps: Optional[Union[int, float]] = -1,
|
||||
num_frames: Optional[int] = -1,
|
||||
) -> np.ndarray:
|
||||
"""Sample frame indices — qwen_vl variant.
|
||||
|
||||
Sentinel defaults are -1. Applies ceil_by_factor on min_frames and ensures
|
||||
num_frames is divisible by 4.
|
||||
"""
|
||||
if fps > 0 and num_frames > 0:
|
||||
raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
|
||||
|
||||
if metadata is None:
|
||||
raise ValueError("metadata is required for sample_frames_qwen")
|
||||
|
||||
total_num_frames = metadata["num_of_frame"]
|
||||
|
||||
if num_frames > 0:
|
||||
num_frames = round(num_frames / frame_factor) * frame_factor
|
||||
elif fps > 0:
|
||||
min_frames = ceil_by_factor(min_frames, frame_factor)
|
||||
max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
|
||||
|
||||
num_frames = total_num_frames / metadata["fps"] * fps
|
||||
|
||||
if num_frames > total_num_frames:
|
||||
data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
|
||||
|
||||
num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
|
||||
num_frames = floor_by_factor(num_frames, frame_factor)
|
||||
|
||||
if num_frames > total_num_frames:
|
||||
raise ValueError(
|
||||
f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
|
||||
f"`total_num_frames={total_num_frames}`. "
|
||||
"Decrease `num_frames` or `fps` for sampling."
|
||||
)
|
||||
|
||||
# num_frames must be divisible by 4
|
||||
if num_frames > 2 and num_frames % 4 != 0:
|
||||
num_frames = (num_frames // 4) * 4
|
||||
total_num_frames = (total_num_frames // 4) * 4
|
||||
num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
|
||||
|
||||
if num_frames > 0:
|
||||
indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
|
||||
else:
|
||||
indices = np.arange(0, total_num_frames).astype(np.int32)
|
||||
|
||||
return indices
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# sample_frames — paddleocr_vl / ernie4_5_vl variant
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def sample_frames_paddleocr(
|
||||
frame_factor: int,
|
||||
min_frames: int,
|
||||
max_frames: int,
|
||||
metadata: Optional[dict] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
num_frames: Optional[int] = None,
|
||||
) -> np.ndarray:
|
||||
"""Sample frame indices — paddleocr_vl / ernie4_5_vl variant.
|
||||
|
||||
Sentinel defaults are None. Uses plain math.floor/ceil; no %4 correction.
|
||||
"""
|
||||
fps = fps or 0
|
||||
num_frames = num_frames or 0
|
||||
if fps > 0 and num_frames > 0:
|
||||
raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
|
||||
|
||||
if metadata is None:
|
||||
raise ValueError("metadata is required for sample_frames_paddleocr")
|
||||
|
||||
total_num_frames = metadata["num_of_frame"]
|
||||
|
||||
if num_frames > 0:
|
||||
num_frames = round(num_frames / frame_factor) * frame_factor
|
||||
elif fps > 0:
|
||||
max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
|
||||
num_frames = total_num_frames / metadata["fps"] * fps
|
||||
num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
|
||||
num_frames = math.floor(num_frames / frame_factor) * frame_factor
|
||||
|
||||
if num_frames > total_num_frames:
|
||||
raise ValueError(
|
||||
f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
|
||||
f"`total_num_frames={total_num_frames}`. "
|
||||
"Decrease `num_frames` or `fps` for sampling."
|
||||
)
|
||||
|
||||
if num_frames > 0:
|
||||
indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
|
||||
else:
|
||||
indices = np.arange(0, total_num_frames).astype(np.int32)
|
||||
|
||||
return indices
|
||||
|
||||
|
||||
def sample_frames(
|
||||
frame_factor: int,
|
||||
min_frames: int,
|
||||
max_frames: int,
|
||||
metadata: Optional[dict] = None,
|
||||
fps: Optional[Union[int, float]] = None,
|
||||
num_frames: Optional[int] = None,
|
||||
variant: str = "paddleocr",
|
||||
) -> np.ndarray:
|
||||
"""Dispatch to sample_frames_qwen or sample_frames_paddleocr based on variant."""
|
||||
if variant == "qwen":
|
||||
_fps = fps if fps is not None else -1
|
||||
_num_frames = num_frames if num_frames is not None else -1
|
||||
return sample_frames_qwen(frame_factor, min_frames, max_frames, metadata, _fps, _num_frames)
|
||||
if variant == "paddleocr":
|
||||
return sample_frames_paddleocr(frame_factor, min_frames, max_frames, metadata, fps, num_frames)
|
||||
raise ValueError(f"Unknown variant {variant!r}. Expected 'paddleocr' or 'qwen'.")
|
||||
@@ -22,14 +22,16 @@ from PIL import Image
|
||||
|
||||
from fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive import (
|
||||
AdaptiveImageProcessor,
|
||||
make_batched_images,
|
||||
make_batched_videos,
|
||||
)
|
||||
from fastdeploy.input.image_processors.common import (
|
||||
ceil_by_factor,
|
||||
floor_by_factor,
|
||||
is_scaled_image,
|
||||
make_batched_images,
|
||||
make_batched_videos,
|
||||
round_by_factor,
|
||||
smart_resize,
|
||||
)
|
||||
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
||||
|
||||
|
||||
class TestImagePreprocessorAdaptive(unittest.TestCase):
|
||||
|
||||
@@ -30,7 +30,7 @@ from fastdeploy.input.paddleocr_vl_processor.paddleocr_vl_processor import (
|
||||
PaddleOCRVLProcessor,
|
||||
)
|
||||
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
|
||||
from fastdeploy.input.paddleocr_vl_processor.process_video import sample_frames
|
||||
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
|
||||
|
||||
MODULE_PATH = "fastdeploy.input.paddleocr_vl_processor.process"
|
||||
|
||||
@@ -86,7 +86,7 @@ class TestProcessVideo(unittest.TestCase):
|
||||
|
||||
def test_error_fps_without_metadata(self):
|
||||
"""新增:测试 fps > 0 但 metadata 为 None"""
|
||||
with self.assertRaises(TypeError) as context:
|
||||
with self.assertRaises(ValueError) as context:
|
||||
sample_frames(
|
||||
frame_factor=self.frame_factor,
|
||||
min_frames=self.min_frames,
|
||||
@@ -95,8 +95,7 @@ class TestProcessVideo(unittest.TestCase):
|
||||
fps=10,
|
||||
metadata=None, # 缺失
|
||||
)
|
||||
# 验证是预期的 TypeError
|
||||
self.assertIn("'NoneType' object is not subscriptable", str(context.exception))
|
||||
self.assertIn("metadata is required", str(context.exception))
|
||||
|
||||
def test_num_frames_rounding(self):
|
||||
"""新增:测试 num_frames 向 frame_factor 舍入"""
|
||||
|
||||
@@ -21,7 +21,7 @@ import numpy as np
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
|
||||
from fastdeploy.input.qwen_vl_processor.process_video import sample_frames
|
||||
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
|
||||
|
||||
|
||||
def mock_pil_image(height, width):
|
||||
|
||||
@@ -0,0 +1,365 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import io
|
||||
import unittest
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import numpy as np
|
||||
|
||||
from fastdeploy.input.video_utils import (
|
||||
_is_gif,
|
||||
read_video_decord,
|
||||
sample_frames,
|
||||
sample_frames_paddleocr,
|
||||
sample_frames_qwen,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
GIF87_HEADER = b"GIF87a" + b"\x00" * 10
|
||||
GIF89_HEADER = b"GIF89a" + b"\x00" * 10
|
||||
NOT_GIF = b"NOTGIF" + b"\x00" * 10
|
||||
|
||||
|
||||
def _make_mock_reader(num_frames=100, fps=25.0):
|
||||
"""Return a mock that mimics decord.VideoReader."""
|
||||
reader = MagicMock()
|
||||
reader.__len__ = MagicMock(return_value=num_frames)
|
||||
reader.get_avg_fps = MagicMock(return_value=fps)
|
||||
reader.seek = MagicMock(return_value=None)
|
||||
frame = MagicMock()
|
||||
frame.asnumpy = MagicMock(return_value=np.zeros((480, 640, 3), dtype=np.uint8))
|
||||
reader.__getitem__ = MagicMock(return_value=frame)
|
||||
return reader
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _is_gif
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestIsGif(unittest.TestCase):
|
||||
def test_gif87a(self):
|
||||
self.assertTrue(_is_gif(GIF87_HEADER))
|
||||
|
||||
def test_gif89a(self):
|
||||
self.assertTrue(_is_gif(GIF89_HEADER))
|
||||
|
||||
def test_not_gif(self):
|
||||
self.assertFalse(_is_gif(NOT_GIF))
|
||||
|
||||
def test_short_bytes(self):
|
||||
self.assertFalse(_is_gif(b"GIF"))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# VideoReaderWrapper (mock decord + moviepy)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestVideoReaderWrapper(unittest.TestCase):
|
||||
def _make_wrapper(self, video_path, mock_reader=None):
|
||||
"""Construct a VideoReaderWrapper with decord mocked out."""
|
||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
||||
|
||||
if mock_reader is None:
|
||||
mock_reader = _make_mock_reader()
|
||||
|
||||
mock_decord = MagicMock()
|
||||
mock_decord.VideoReader.return_value = mock_reader
|
||||
|
||||
with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
|
||||
wrapper = VideoReaderWrapper(video_path)
|
||||
|
||||
wrapper._reader = mock_reader
|
||||
return wrapper
|
||||
|
||||
def test_len(self):
|
||||
reader = _make_mock_reader(num_frames=42)
|
||||
wrapper = self._make_wrapper("/fake/video.mp4", reader)
|
||||
self.assertEqual(len(wrapper), 42)
|
||||
|
||||
def test_getitem_resets_seek(self):
|
||||
reader = _make_mock_reader()
|
||||
wrapper = self._make_wrapper("/fake/video.mp4", reader)
|
||||
_ = wrapper[0]
|
||||
reader.seek.assert_called_with(0)
|
||||
|
||||
def test_get_avg_fps(self):
|
||||
reader = _make_mock_reader(fps=30.0)
|
||||
wrapper = self._make_wrapper("/fake/video.mp4", reader)
|
||||
self.assertEqual(wrapper.get_avg_fps(), 30.0)
|
||||
|
||||
def test_seek(self):
|
||||
reader = _make_mock_reader()
|
||||
wrapper = self._make_wrapper("/fake/video.mp4", reader)
|
||||
wrapper.seek(5)
|
||||
reader.seek.assert_called_with(5)
|
||||
|
||||
def test_del_no_original_file(self):
|
||||
"""__del__ should be a no-op when original_file is None."""
|
||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
||||
|
||||
wrapper = object.__new__(VideoReaderWrapper)
|
||||
wrapper.original_file = None
|
||||
wrapper._reader = _make_mock_reader()
|
||||
# Should not raise
|
||||
wrapper.__del__()
|
||||
|
||||
def test_del_removes_temp_file(self):
|
||||
"""__del__ removes the file only when original_file is set."""
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
||||
|
||||
with tempfile.NamedTemporaryFile(delete=False) as f:
|
||||
tmp_path = f.name
|
||||
|
||||
wrapper = object.__new__(VideoReaderWrapper)
|
||||
wrapper.original_file = tmp_path
|
||||
wrapper._reader = _make_mock_reader()
|
||||
wrapper.__del__()
|
||||
self.assertFalse(os.path.exists(tmp_path))
|
||||
|
||||
def test_non_gif_string_path_does_not_set_original_file(self):
|
||||
"""Passing a non-GIF string path must NOT set original_file (bug fix)."""
|
||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
||||
|
||||
mock_reader = _make_mock_reader()
|
||||
mock_decord = MagicMock()
|
||||
mock_decord.VideoReader.return_value = mock_reader
|
||||
|
||||
with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
|
||||
wrapper = VideoReaderWrapper("/fake/video.mp4")
|
||||
|
||||
self.assertIsNone(wrapper.original_file)
|
||||
|
||||
def test_bytesio_non_gif_path_does_not_set_original_file(self):
|
||||
"""Passing a BytesIO that is NOT a GIF must not set original_file."""
|
||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
||||
|
||||
mock_reader = _make_mock_reader()
|
||||
mock_decord = MagicMock()
|
||||
mock_decord.VideoReader.return_value = mock_reader
|
||||
|
||||
bio = io.BytesIO(NOT_GIF)
|
||||
with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
|
||||
wrapper = VideoReaderWrapper(bio)
|
||||
|
||||
self.assertIsNone(wrapper.original_file)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# read_video_decord
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestReadVideoDecord(unittest.TestCase):
|
||||
def _patch_wrapper(self, num_frames=100, fps=25.0):
|
||||
"""Return a context manager that replaces VideoReaderWrapper with a mock."""
|
||||
from fastdeploy.input import video_utils
|
||||
|
||||
mock_wrapper = MagicMock()
|
||||
mock_wrapper.__len__ = MagicMock(return_value=num_frames)
|
||||
mock_wrapper.get_avg_fps = MagicMock(return_value=fps)
|
||||
return patch.object(video_utils, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper
|
||||
|
||||
def test_existing_wrapper_passthrough(self):
|
||||
"""Already-wrapped reader is returned as-is."""
|
||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
||||
|
||||
mock_wrapper = MagicMock(spec=VideoReaderWrapper)
|
||||
mock_wrapper.__len__ = MagicMock(return_value=50)
|
||||
mock_wrapper.get_avg_fps = MagicMock(return_value=10.0)
|
||||
|
||||
reader, meta, path = read_video_decord(mock_wrapper)
|
||||
|
||||
self.assertIs(reader, mock_wrapper)
|
||||
self.assertEqual(meta["num_of_frame"], 50)
|
||||
self.assertAlmostEqual(meta["fps"], 10.0)
|
||||
self.assertAlmostEqual(meta["duration"], 5.0)
|
||||
|
||||
def test_bytes_input_converted_to_bytesio(self):
|
||||
"""bytes input is converted to BytesIO before creating VideoReaderWrapper."""
|
||||
from fastdeploy.input import video_utils
|
||||
|
||||
captured = []
|
||||
|
||||
class FakeWrapper:
|
||||
def __init__(self, path, *args, **kwargs):
|
||||
captured.append(path)
|
||||
|
||||
def __len__(self):
|
||||
return 30
|
||||
|
||||
def get_avg_fps(self):
|
||||
return 10.0
|
||||
|
||||
with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
|
||||
reader, meta, path = read_video_decord(b"fake_video_bytes")
|
||||
|
||||
self.assertIsInstance(captured[0], io.BytesIO)
|
||||
|
||||
def test_string_path_input(self):
|
||||
"""String path is passed through to VideoReaderWrapper."""
|
||||
from fastdeploy.input import video_utils
|
||||
|
||||
class FakeWrapper:
|
||||
def __init__(self, path, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def __len__(self):
|
||||
return 60
|
||||
|
||||
def get_avg_fps(self):
|
||||
return 30.0
|
||||
|
||||
with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
|
||||
reader, meta, path = read_video_decord("/fake/path.mp4")
|
||||
|
||||
self.assertEqual(meta["num_of_frame"], 60)
|
||||
self.assertAlmostEqual(meta["duration"], 2.0)
|
||||
self.assertEqual(path, "/fake/path.mp4")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# sample_frames_qwen
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSampleFramesQwen(unittest.TestCase):
|
||||
META = {"num_of_frame": 100, "fps": 25.0}
|
||||
|
||||
def test_num_frames_basic(self):
|
||||
indices = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
|
||||
self.assertEqual(len(indices), 8)
|
||||
|
||||
def test_fps_basic(self):
|
||||
indices = sample_frames_qwen(2, 4, 100, self.META, fps=2.0)
|
||||
self.assertGreater(len(indices), 0)
|
||||
self.assertEqual(len(indices) % 2, 0)
|
||||
|
||||
def test_fps_and_num_frames_raises(self):
|
||||
with self.assertRaises(ValueError):
|
||||
sample_frames_qwen(2, 4, 100, self.META, fps=2.0, num_frames=10)
|
||||
|
||||
def test_num_frames_exceeds_total_raises(self):
|
||||
with self.assertRaises(ValueError):
|
||||
sample_frames_qwen(2, 4, 100, self.META, num_frames=200)
|
||||
|
||||
def test_fps_warning_when_nframes_exceeds_total(self):
|
||||
"""fps so high that computed num_frames > total → warning logged."""
|
||||
with self.assertLogs(level="WARNING"):
|
||||
sample_frames_qwen(2, 4, 100, {"num_of_frame": 10, "fps": 1.0}, fps=100.0)
|
||||
|
||||
def test_divisible_by_4_correction(self):
|
||||
"""Result must be divisible by 4 when num_frames > 2."""
|
||||
indices = sample_frames_qwen(2, 4, 100, self.META, fps=1.5)
|
||||
if len(indices) > 2:
|
||||
self.assertEqual(len(indices) % 4, 0)
|
||||
|
||||
def test_no_sampling_returns_all_frames(self):
|
||||
"""Both fps and num_frames at sentinel → return all frames."""
|
||||
indices = sample_frames_qwen(2, 4, 100, self.META)
|
||||
self.assertEqual(len(indices), 100)
|
||||
|
||||
def test_indices_dtype(self):
|
||||
indices = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
|
||||
self.assertEqual(indices.dtype, np.int32)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# sample_frames_paddleocr
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSampleFramesPaddleocr(unittest.TestCase):
|
||||
META = {"num_of_frame": 100, "fps": 25.0}
|
||||
|
||||
def test_num_frames_basic(self):
|
||||
indices = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=10)
|
||||
self.assertEqual(len(indices), 10)
|
||||
|
||||
def test_fps_basic(self):
|
||||
indices = sample_frames_paddleocr(1, 4, 100, self.META, fps=2.0)
|
||||
self.assertGreater(len(indices), 0)
|
||||
|
||||
def test_fps_and_num_frames_raises(self):
|
||||
with self.assertRaises(ValueError):
|
||||
sample_frames_paddleocr(1, 4, 100, self.META, fps=2.0, num_frames=10)
|
||||
|
||||
def test_num_frames_exceeds_total_raises(self):
|
||||
with self.assertRaises(ValueError):
|
||||
sample_frames_paddleocr(1, 4, 100, self.META, num_frames=200)
|
||||
|
||||
def test_none_sentinels_no_sampling(self):
|
||||
"""fps=None, num_frames=None → return all frames."""
|
||||
indices = sample_frames_paddleocr(1, 4, 100, self.META)
|
||||
self.assertEqual(len(indices), 100)
|
||||
|
||||
def test_no_4_correction(self):
|
||||
"""paddleocr variant does NOT apply %4 correction."""
|
||||
# 6 frames is not divisible by 4; paddleocr should keep it
|
||||
meta = {"num_of_frame": 100, "fps": 25.0}
|
||||
indices = sample_frames_paddleocr(1, 1, 100, meta, num_frames=6)
|
||||
self.assertEqual(len(indices), 6)
|
||||
|
||||
def test_indices_dtype(self):
|
||||
indices = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=8)
|
||||
self.assertEqual(indices.dtype, np.int32)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# sample_frames dispatcher
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestSampleFramesDispatcher(unittest.TestCase):
|
||||
META = {"num_of_frame": 100, "fps": 25.0}
|
||||
|
||||
def test_default_variant_is_paddleocr(self):
|
||||
with patch("fastdeploy.input.video_utils.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
|
||||
sample_frames(1, 4, 100, self.META, num_frames=8)
|
||||
mock_fn.assert_called_once()
|
||||
|
||||
def test_qwen_variant_dispatched(self):
|
||||
with patch("fastdeploy.input.video_utils.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
|
||||
sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
|
||||
mock_fn.assert_called_once()
|
||||
|
||||
def test_qwen_none_fps_converted_to_sentinel(self):
|
||||
"""None fps/num_frames → converted to -1 before calling sample_frames_qwen."""
|
||||
with patch("fastdeploy.input.video_utils.sample_frames_qwen", return_value=np.array([])) as mock_fn:
|
||||
sample_frames(2, 4, 100, self.META, fps=None, num_frames=None, variant="qwen")
|
||||
args = mock_fn.call_args[0]
|
||||
self.assertEqual(args[4], -1) # fps sentinel
|
||||
self.assertEqual(args[5], -1) # num_frames sentinel
|
||||
|
||||
def test_paddleocr_variant_result_consistent(self):
|
||||
direct = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=8)
|
||||
via_dispatcher = sample_frames(1, 4, 100, self.META, num_frames=8, variant="paddleocr")
|
||||
np.testing.assert_array_equal(direct, via_dispatcher)
|
||||
|
||||
def test_qwen_variant_result_consistent(self):
|
||||
direct = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
|
||||
via_dispatcher = sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
|
||||
np.testing.assert_array_equal(direct, via_dispatcher)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user