mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 08:21:53 +08:00
[Optimization] Deduplicate shared image/video utilities across VL processors (#6988)
* step1~3 * fix import path * 删除重复代码 * 删除重复代码 * 删除重复代码 * fix import path * update * fix import path * add unit test * fix * update * fix unit test
This commit is contained in:
@@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
@@ -41,6 +40,7 @@ from paddleformers.transformers.image_utils import (
|
||||
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||
@@ -62,116 +62,6 @@ VideoInput = Union[
|
||||
]
|
||||
|
||||
|
||||
def round_by_factor(number: int, factor: int) -> int:
    """
    Snap a number to the closest multiple of a factor.

    Ties follow the behavior of Python's built-in ``round`` (round half
    to even), e.g. ``round_by_factor(10, 4)`` gives 8, not 12.

    Args:
        number: Value to snap
        factor: Step size the result must be a multiple of

    Returns:
        int: The multiple of ``factor`` closest to ``number``
    """
    multiples = round(number / factor)
    return multiples * factor
|
||||
|
||||
|
||||
def ceil_by_factor(number: int, factor: int) -> int:
    """
    Snap a number upward to the next multiple of a factor.

    Args:
        number: Value to snap
        factor: Step size the result must be a multiple of

    Returns:
        int: The smallest multiple of ``factor`` that is >= ``number``
    """
    multiples = math.ceil(number / factor)
    return multiples * factor
|
||||
|
||||
|
||||
def floor_by_factor(number: int, factor: int) -> int:
    """
    Snap a number downward to the previous multiple of a factor.

    Args:
        number: Value to snap
        factor: Step size the result must be a multiple of

    Returns:
        int: The largest multiple of ``factor`` that is <= ``number``
    """
    multiples = math.floor(number / factor)
    return multiples * factor
|
||||
|
||||
|
||||
def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200):
    """
    Compute a target size that keeps the aspect ratio (approximately),
    is a multiple of ``factor`` on both sides, and has a pixel count
    within ``[min_pixels, max_pixels]``.

    Args:
        height: Original image height, must be positive
        width: Original image width, must be positive
        factor: Patch size factor; both returned sides are multiples of it
        min_pixels: Minimum allowed pixels (height * width)
        max_pixels: Maximum allowed pixels (height * width)
        max_ratio: Maximum allowed aspect ratio (long side / short side)

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If height or width is not positive, or if the
            calculated dimensions fall outside the pixel budget
    """
    # Reject degenerate sizes up front: the aspect-ratio check below divides
    # by the shorter side and would otherwise fail with ZeroDivisionError.
    if height <= 0 or width <= 0:
        raise ValueError(f"height and width must be positive, got height: {height}, width: {width}")

    aspect_ratio = max(height, width) / min(height, width)
    if aspect_ratio > max_ratio:
        # Too elongated: keep the short side (snapped to the factor) and
        # clamp the long side to max_ratio times the short side.
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * max_ratio, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * max_ratio, factor)

        # Two adjacent f-strings instead of a backslash continuation, so the
        # source indentation does not leak into the log message.
        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {max_ratio}, got {aspect_ratio}, "
            f"resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )

        height = new_height
        width = new_width

    # Snap both sides to the factor, never going below one factor.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Shrink uniformly; floor so the result stays under the budget.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Grow uniformly; ceil so the result reaches the minimum.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)

    # The factor snapping can still push the size outside the budget.
    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")

    return h_bar, w_bar
|
||||
|
||||
|
||||
def is_scaled_image(image: np.ndarray) -> bool:
    """
    Tell whether an image's pixel values already lie in the [0, 1] range.

    Args:
        image: Input image array

    Returns:
        bool: False for uint8 images; otherwise True only when every
        value is between 0 and 1 inclusive
    """
    if image.dtype != np.uint8:
        # A floating-point image may still hold values in [0, 255],
        # so the actual value range has to be inspected.
        lowest = np.min(image)
        return lowest >= 0 and np.max(image) <= 1

    return False
|
||||
|
||||
|
||||
class ImageProcessor(BaseImageProcessor):
|
||||
"""
|
||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||
|
||||
@@ -26,14 +26,14 @@ from PIL import Image
|
||||
|
||||
from fastdeploy.engine.request import ImagePosition
|
||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
|
||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||
from fastdeploy.input.video_utils import read_video_decord
|
||||
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
|
||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
from .image_processor import ImageProcessor
|
||||
from .process_video import sample_frames
|
||||
|
||||
FRAME_FACTOR = 2
|
||||
FPS = 2.0
|
||||
|
||||
@@ -1,100 +0,0 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
from .image_processor import ceil_by_factor, floor_by_factor
|
||||
|
||||
|
||||
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
):
    """
    Compute the indices of the frames to sample from a video.

    Exactly one of ``fps`` / ``num_frames`` may be given (a value > 0).
    When neither is given, the indices of all frames are returned.

    Args:
        frame_factor: Ensure the number of sampled frames is a multiple of this factor
        min_frames: Minimum number of frames to sample
        max_frames: Maximum number of frames to sample
        metadata: Video metadata; ``metadata["num_of_frame"]`` is always read,
            ``metadata["fps"]`` is read when sampling by ``fps``
        fps: Target frames per second for sampling; ``None`` or <= 0 means unset
        num_frames: Exact number of frames to sample; ``None`` or <= 0 means unset

    Returns:
        np.ndarray: int32 array with the indices of the frames to keep

    Raises:
        ValueError: If both fps and num_frames are specified,
            or if required metadata is missing,
            or if requested frames exceed available frames
    """
    # The parameters are annotated Optional: treat None like the -1 default
    # ("not specified") instead of failing on the `> 0` comparisons.
    if fps is None:
        fps = -1
    if num_frames is None:
        num_frames = -1

    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

    # The total frame count comes from the metadata in every mode, so the
    # metadata must be validated before it is first indexed.
    if metadata is None:
        if fps > 0:
            raise ValueError(
                "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
                "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
            )
        raise ValueError("Video metadata with `num_of_frame` is required to sample frames, but none was provided.")

    total_num_frames = metadata["num_of_frame"]

    if num_frames > 0:
        # An explicit frame count is only snapped to the frame factor.
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        # Derive the frame count from the requested fps, then clamp it to
        # [min_frames, max_frames] (both snapped to the frame factor).
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)

        num_frames = total_num_frames / metadata["fps"] * fps

        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")

        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    # Hack: make sure num_frames is always divisible by 4, because
    # sched/resource_manager_v1.py does grid_thw.extend([[2, h, w]] * (t // 2)).
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4  # round down to a multiple of 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)

    # Calculate frame indices based on sampling strategy
    if num_frames > 0:
        # Evenly spaced sampling for target frame count
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # Keep all frames if no sampling requested
        indices = np.arange(0, total_num_frames).astype(np.int32)

    return indices
|
||||
Reference in New Issue
Block a user