[Optimization] Deduplicate shared image/video utilities across VL processors (#6988)

* step1~3

* fix import path

* Remove duplicated code

* Remove duplicated code

* Remove duplicated code

* fix import path

* update

* fix import path

* add unit test

* fix

* update

* fix unit test
This commit is contained in:
luukunn
2026-03-26 09:49:33 +08:00
committed by GitHub
parent 1502b6f43e
commit d5cb2767d7
16 changed files with 882 additions and 593 deletions
@@ -16,7 +16,6 @@
"""image preprocessor adaptive"""
import math
from typing import List, Optional, Union
import numpy as np
@@ -45,6 +44,8 @@ from paddleformers.transformers.image_utils import (
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
@@ -73,22 +74,9 @@ __all__ = [
]
def is_scaled_image(image: np.ndarray) -> bool:
    """
    Checks to see whether the pixel values have already been rescaled to [0, 1].
    """
    # uint8 data is raw 0-255 pixel values by definition, never pre-scaled.
    if image.dtype == np.uint8:
        return False
    # Floating-point images may still hold values in [0, 255]; inspect the range.
    lo = np.min(image)
    hi = np.max(image)
    return lo >= 0 and hi <= 1
def make_batched_images(images) -> List[List[ImageInput]]:
"""
Accepts images in list or nested list format, and makes a list of images for preprocessing.
Args:
images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
The input image.
@@ -521,67 +509,3 @@ class AdaptiveImageProcessor(BaseImageProcessor):
}
return BatchFeature(data=data, tensor_type=return_tensors)
def round_by_factor(number: int, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    quotient = round(number / factor)
    return quotient * factor
def ceil_by_factor(number: int, factor: int) -> int:
    """Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'."""
    return factor * math.ceil(number / factor)
def floor_by_factor(number: int, factor: int) -> int:
    """Returns the largest integer less than or equal to 'number' that is divisible by 'factor'."""
    return factor * math.floor(number / factor)
def smart_resize(
    height: int,
    width: int,
    factor: int = IMAGE_FACTOR,
    min_pixels: int = MIN_PIXELS,
    max_pixels: int = MAX_PIXELS,
):
    """
    Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.
    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
    3. The aspect ratio of the image is maintained as closely as possible.

    Args:
        height: Original image height in pixels.
        width: Original image width in pixels.
        factor: Patch-size factor; both returned dimensions are multiples of it.
        min_pixels: Minimum allowed total pixel count.
        max_pixels: Maximum allowed total pixel count.

    Returns:
        tuple: (h_bar, w_bar), the resized height and width.

    Raises:
        ValueError: If the adjusted dimensions still violate the pixel bounds.
    """
    # Extreme aspect ratios are clipped (not rejected): the longer side is
    # shrunk so the ratio equals MAX_RATIO, keeping both sides factor-aligned.
    if max(height, width) / min(height, width) > MAX_RATIO:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * MAX_RATIO, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * MAX_RATIO, factor)
        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {MAX_RATIO}, got {max(height, width) / min(height, width)},\
resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )
        height = new_height
        width = new_width
    # Snap each side to the nearest multiple of `factor`, never below `factor`.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too many pixels: scale down by beta, rounding down to stay <= max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: scale up by beta, rounding up to stay >= min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
    return h_bar, w_bar
@@ -0,0 +1,13 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+208
View File
@@ -0,0 +1,208 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared image utility functions for all VL image processors."""
import math
import numpy as np
from fastdeploy.utils import data_processor_logger
__all__ = [
"round_by_factor",
"ceil_by_factor",
"floor_by_factor",
"is_scaled_image",
"smart_resize",
"smart_resize_qwen",
"smart_resize_paddleocr",
]
def round_by_factor(number: int, factor: int) -> int:
    """Returns the closest integer to 'number' that is divisible by 'factor'."""
    units = round(number / factor)
    return units * factor
def ceil_by_factor(number: int, factor: int) -> int:
    """Returns the smallest integer >= 'number' that is divisible by 'factor'."""
    units = math.ceil(number / factor)
    return units * factor
def floor_by_factor(number: int, factor: int) -> int:
    """Returns the largest integer <= 'number' that is divisible by 'factor'."""
    units = math.floor(number / factor)
    return units * factor
def is_scaled_image(image: np.ndarray) -> bool:
    """Check if image pixel values are already normalized to [0, 1] range.

    Args:
        image: Input image array.

    Returns:
        bool: True if image is already scaled to [0, 1].
    """
    # Raw uint8 pixels live in 0-255 and are never pre-scaled.
    if image.dtype == np.uint8:
        return False
    # A float image may still contain 0-255 values; verify the actual range.
    low_ok = np.min(image) >= 0
    high_ok = np.max(image) <= 1
    return low_ok and high_ok
def smart_resize_qwen(
    height: int,
    width: int,
    factor: int,
    min_pixels: int,
    max_pixels: int,
    max_ratio: int = 200,
) -> tuple:
    """Smart image resizing for ERNIE / Qwen2.5 / Qwen3 models.

    Maintains aspect ratio and respects pixel constraints. When the aspect ratio
    exceeds max_ratio, the image is cropped (not raised as error) to fit within
    the ratio limit.

    Args:
        height: Original image height.
        width: Original image width.
        factor: Patch size factor; both output dimensions will be multiples of this.
        min_pixels: Minimum allowed total pixels.
        max_pixels: Maximum allowed total pixels.
        max_ratio: Maximum allowed aspect ratio (default 200).

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If calculated dimensions are still invalid after resizing.
    """
    # Clip extreme aspect ratios: shrink the longer side so the ratio equals
    # max_ratio while keeping both sides aligned to `factor`.
    if max(height, width) / min(height, width) > max_ratio:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * max_ratio, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * max_ratio, factor)
        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {max_ratio}, "
            f"got {max(height, width) / min(height, width)}, "
            f"resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )
        height = new_height
        width = new_width
    # Snap each side to the nearest multiple of `factor`, never below `factor`.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too many pixels: scale both sides down by `beta`, rounding down so
        # the result stays at or below max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: scale both sides up by `beta`, rounding up so the
        # result stays at or above min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
    return h_bar, w_bar
def smart_resize_paddleocr(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 28 * 28 * 130,
    max_pixels: int = 28 * 28 * 1280,
) -> tuple:
    """Smart image resizing for PaddleOCR-VL model.

    Produces factor-aligned dimensions whose product falls within
    [min_pixels, max_pixels]. Unlike the qwen variant, a side smaller than
    `factor` is first scaled up to `factor`, and an aspect ratio above 200
    raises ValueError instead of being clipped.

    Args:
        height: Original image height.
        width: Original image width.
        factor: Patch size factor; both output dimensions will be multiples of this.
        min_pixels: Minimum allowed total pixels.
        max_pixels: Maximum allowed total pixels.

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If the aspect ratio exceeds 200.
    """
    # Small-image protection: bring the short side up to `factor` first.
    if height < factor:
        data_processor_logger.debug(f"smart_resize_paddleocr: height={height} < factor={factor}, reset height=factor")
        width = round((width * factor) / height)
        height = factor
    if width < factor:
        data_processor_logger.debug(f"smart_resize_paddleocr: width={width} < factor={factor}, reset width=factor")
        height = round((height * factor) / width)
        width = factor
    ratio = max(height, width) / min(height, width)
    if ratio > 200:
        raise ValueError(f"absolute aspect ratio must be smaller than 200, got {ratio}")
    # Snap each side to the nearest multiple of `factor`.
    h_bar = factor * round(height / factor)
    w_bar = factor * round(width / factor)
    area = h_bar * w_bar
    if area > max_pixels:
        # Scale down, rounding each side down to a multiple of `factor`.
        shrink = math.sqrt((height * width) / max_pixels)
        h_bar = factor * math.floor(height / shrink / factor)
        w_bar = factor * math.floor(width / shrink / factor)
    elif area < min_pixels:
        # Scale up, rounding each side up to a multiple of `factor`.
        grow = math.sqrt(min_pixels / (height * width))
        h_bar = factor * math.ceil(height * grow / factor)
        w_bar = factor * math.ceil(width * grow / factor)
    return h_bar, w_bar
def smart_resize(
    height: int,
    width: int,
    factor: int,
    min_pixels: int,
    max_pixels: int,
    max_ratio: int = 200,
    variant: str = "qwen",
) -> tuple:
    """Dispatch to the variant-specific smart_resize implementation.

    Args:
        height: Original image height.
        width: Original image width.
        factor: Patch size factor.
        min_pixels: Minimum allowed total pixels.
        max_pixels: Maximum allowed total pixels.
        max_ratio: Maximum allowed aspect ratio (only used by "qwen" variant).
        variant: "qwen" (default, ERNIE / Qwen2.5 / Qwen3 — clips extreme
            ratios silently) or "paddleocr" (small-image protection, raises on
            bad ratio).

    Returns:
        tuple: (new_height, new_width)
    """
    if variant != "paddleocr":
        # Any value other than "paddleocr" falls back to the qwen algorithm.
        return smart_resize_qwen(height, width, factor, min_pixels, max_pixels, max_ratio)
    return smart_resize_paddleocr(height, width, factor, min_pixels, max_pixels)
@@ -19,8 +19,6 @@
# TODO: Support videos
import json
import logging
import math
from pathlib import Path
from typing import Dict, List, Optional, Union
@@ -34,6 +32,10 @@ from paddleformers.transformers.image_utils import (
to_numpy_array,
)
from fastdeploy.input.image_processors.common import (
smart_resize_paddleocr as smart_resize,
)
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
@@ -68,54 +70,6 @@ def adjust_size(size, patch_size):
return num_patches * patch_size
def smart_resize(
    height: int,
    width: int,
    factor: int = 28,
    min_pixels: int = 28 * 28 * 130,
    max_pixels: int = 28 * 28 * 1280,
):
    """Rescale (height, width) so that both dimensions are multiples of
    `factor`, the total pixel count lies within [min_pixels, max_pixels], and
    the aspect ratio is preserved as closely as possible. A side below
    `factor` is scaled up to `factor` first; an aspect ratio above 200 raises
    ValueError.
    """
    # Small-image protection: lift the short side to `factor`.
    if height < factor:
        logging.debug(f"smart_resize: height={height} < factor={factor}, reset height=factor")
        width = round((width * factor) / height)
        height = factor
    if width < factor:
        logging.debug(f"smart_resize: width={width} < factor={factor}, reset width=factor")
        height = round((height * factor) / width)
        width = factor
    ratio = max(height, width) / min(height, width)
    if ratio > 200:
        raise ValueError(f"absolute aspect ratio must be smaller than 200, got {ratio}")
    # Snap each side to the nearest multiple of `factor`.
    h_bar = factor * round(height / factor)
    w_bar = factor * round(width / factor)
    if h_bar * w_bar > max_pixels:
        # Scale down, rounding down to keep the area within max_pixels.
        scale = math.sqrt((height * width) / max_pixels)
        h_bar = factor * math.floor(height / scale / factor)
        w_bar = factor * math.floor(width / scale / factor)
    elif h_bar * w_bar < min_pixels:
        # Scale up, rounding up to reach at least min_pixels.
        scale = math.sqrt(min_pixels / (height * width))
        h_bar = factor * math.ceil(height * scale / factor)
        w_bar = factor * math.ceil(width * scale / factor)
    return h_bar, w_bar
class ImageProcessor(BaseImageProcessor):
model_input_names = [
"pixel_values",
@@ -26,14 +26,14 @@ from PIL import Image
from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
from .image_processor import ImageProcessor
from .process_video import sample_frames
class DataProcessor(MMBaseDataProcessor):
@@ -1,82 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import math
from typing import Optional, Union
import numpy as np
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
):
    """
    Sample frames from video according to specified criteria.

    Args:
        frame_factor: Ensure sampled frames are multiples of this factor
        min_frames: Minimum number of frames to sample
        max_frames: Maximum number of frames to sample
        metadata: Video metadata; must contain "num_of_frame" (and "fps" when
            sampling by `fps`)
        fps: Target frames per second for sampling
        num_frames: Exact number of frames to sample

    Returns:
        np.ndarray: Indices of the sampled frames (int32)

    Raises:
        ValueError: If both fps and num_frames are specified,
            or if required metadata is missing,
            or if requested frames exceed available frames
    """
    # Normalize optional arguments: the documented defaults are None, and
    # comparing None with `> 0` would raise TypeError, so map None -> -1
    # (the "not given" sentinel used by the comparisons below).
    fps = -1 if fps is None else fps
    num_frames = -1 if num_frames is None else num_frames
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    # Metadata is always needed for the total frame count; check it before
    # dereferencing so callers get a ValueError instead of a TypeError.
    if metadata is None:
        raise ValueError(
            "No video metadata was provided; `metadata` containing `num_of_frame` is required for sampling."
        )
    total_num_frames = metadata["num_of_frame"]
    # If num_frames is not given but fps is, calculate num_frames from fps
    if num_frames > 0:
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        num_frames = total_num_frames / metadata["fps"] * fps
        # Clamp to [min_frames, max_frames] and the actual frame count, then
        # round down to a multiple of frame_factor.
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = math.floor(num_frames / frame_factor) * frame_factor
    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )
    # Calculate frame indices based on sampling strategy
    if num_frames > 0:
        # Evenly spaced sampling for target frame count
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # Keep all frames if no sampling requested
        indices = np.arange(0, total_num_frames).astype(np.int32)
    return indices
@@ -14,7 +14,6 @@
# limitations under the License.
"""
import math
from typing import List, Optional, Union
import numpy as np
@@ -41,6 +40,7 @@ from paddleformers.transformers.image_utils import (
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
IMAGE_MEAN = [0.5, 0.5, 0.5]
@@ -62,86 +62,6 @@ VideoInput = Union[
]
def round_by_factor(number: int, factor: int) -> int:
    """Return the multiple of `factor` closest to `number`."""
    multiple = round(number / factor)
    return multiple * factor
def ceil_by_factor(number: int, factor: int) -> int:
    """Return the smallest multiple of `factor` that is >= `number`."""
    return factor * math.ceil(number / factor)
def floor_by_factor(number: int, factor: int) -> int:
    """Return the largest multiple of `factor` that is <= `number`."""
    return factor * math.floor(number / factor)
def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200):
    """
    Smart image resizing that maintains aspect ratio and respects constraints.

    Args:
        height: Original image height
        width: Original image width
        factor: Patch size factor
        min_pixels: Minimum allowed pixels
        max_pixels: Maximum allowed pixels
        max_ratio: Maximum allowed aspect ratio

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If calculated dimensions are invalid
    """
    # Extreme aspect ratios are clipped (not rejected): the longer side is
    # shrunk so the ratio equals max_ratio, keeping both sides factor-aligned.
    if max(height, width) / min(height, width) > max_ratio:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * max_ratio, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * max_ratio, factor)
        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\
resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )
        height = new_height
        width = new_width
    # Snap each side to the nearest multiple of `factor`, never below `factor`.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too many pixels: scale down by beta, rounding down to stay <= max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: scale up by beta, rounding up to stay >= min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
    return h_bar, w_bar
def is_scaled_image(image: np.ndarray) -> bool:
    """
    Check if image pixel values are already normalized to [0, 1] range.

    Args:
        image: Input image array

    Returns:
        bool: True if image is already scaled
    """
    # uint8 always carries raw 0-255 values.
    if image.dtype == np.uint8:
        return False
    # Floating-point images can still be in [0, 255]; test the observed range.
    within_lower = np.min(image) >= 0
    within_upper = np.max(image) <= 1
    return within_lower and within_upper
class ImageProcessor(BaseImageProcessor):
"""
Adaptive image processor for dynamic image resizing and preprocessing.
+3 -79
View File
@@ -26,13 +26,14 @@ from PIL import Image
from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
from .image_processor import ImageProcessor, ceil_by_factor, floor_by_factor
from .image_processor import ImageProcessor
VIDEO_MIN_PIXELS = 128 * 28 * 28
VIDEO_MAX_PIXELS = 768 * 28 * 28
@@ -42,83 +43,6 @@ FPS_MIN_FRAMES = 4
FPS_MAX_FRAMES = 768
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
):
    """
    Sample frames from video according to specified criteria.

    Args:
        frame_factor: Ensure sampled frames are multiples of this factor
        min_frames: Minimum number of frames to sample
        max_frames: Maximum number of frames to sample
        metadata: Video metadata containing fps information; must provide
            "num_of_frame" (and "fps" when sampling by `fps`)
        fps: Target frames per second for sampling (-1 means "not given")
        num_frames: Exact number of frames to sample (-1 means "not given")

    Returns:
        np.ndarray: int32 indices of the frames to keep

    Raises:
        ValueError: If both fps and num_frames are specified,
            or if required metadata is missing,
            or if requested frames exceed available frames
    """
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    # NOTE(review): metadata is dereferenced here before the None check in the
    # fps branch below, so metadata=None raises TypeError — confirm intended.
    total_num_frames = metadata["num_of_frame"]
    # If num_frames is not given but fps is, calculate num_frames from fps
    if num_frames > 0:
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        if metadata is None:
            raise ValueError(
                "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
                "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
            )
        # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
        num_frames = total_num_frames / metadata["fps"] * fps
        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
        # Clamp to [min_frames, max_frames] and the real frame count, then
        # round down to a multiple of frame_factor.
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)
    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )
    # Hack code ensures that num_frames can always be divided by 4,
    # because sched/resource_manager_v1.py expands grid_thw as
    # grid_thw.extend([[2, h, w]] * (t // 2))
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4  # round down to a multiple of 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
    # Calculate frame indices based on sampling strategy
    if num_frames > 0:
        # Evenly spaced sampling for target frame count
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # Keep all frames if no sampling requested
        indices = np.arange(0, total_num_frames).astype(np.int32)
    return indices
class DataProcessor(MMBaseDataProcessor):
"""
Processes multimodal inputs (text, images, videos) into model-ready formats.
@@ -14,7 +14,6 @@
# limitations under the License.
"""
import math
from typing import List, Optional, Union
import numpy as np
@@ -41,6 +40,7 @@ from paddleformers.transformers.image_utils import (
from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
@@ -62,116 +62,6 @@ VideoInput = Union[
]
def round_by_factor(number: int, factor: int) -> int:
    """
    Round number to nearest multiple of factor.

    Args:
        number: Input number to round
        factor: Rounding factor

    Returns:
        int: Rounded number
    """
    nearest_units = round(number / factor)
    return nearest_units * factor
def ceil_by_factor(number: int, factor: int) -> int:
    """
    Round number up to nearest multiple of factor.

    Args:
        number: Input number to round
        factor: Rounding factor

    Returns:
        int: Rounded number
    """
    upper_units = math.ceil(number / factor)
    return upper_units * factor
def floor_by_factor(number: int, factor: int) -> int:
    """
    Round number down to nearest multiple of factor.

    Args:
        number: Input number to round
        factor: Rounding factor

    Returns:
        int: Rounded number
    """
    lower_units = math.floor(number / factor)
    return lower_units * factor
def smart_resize(height: int, width: int, factor: int, min_pixels: int, max_pixels: int, max_ratio: int = 200):
    """
    Smart image resizing that maintains aspect ratio and respects constraints.

    Args:
        height: Original image height
        width: Original image width
        factor: Patch size factor
        min_pixels: Minimum allowed pixels
        max_pixels: Maximum allowed pixels
        max_ratio: Maximum allowed aspect ratio

    Returns:
        tuple: (new_height, new_width)

    Raises:
        ValueError: If calculated dimensions are invalid
    """
    # Extreme aspect ratios are clipped (not rejected): the longer side is
    # shrunk so the ratio equals max_ratio, keeping both sides factor-aligned.
    if max(height, width) / min(height, width) > max_ratio:
        if height > width:
            new_width = max(factor, round_by_factor(width, factor))
            new_height = floor_by_factor(new_width * max_ratio, factor)
        else:
            new_height = max(factor, round_by_factor(height, factor))
            new_width = floor_by_factor(new_height * max_ratio, factor)
        data_processor_logger.info(
            f"absolute aspect ratio must be smaller than {max_ratio}, got {max(height, width) / min(height, width)},\
resize to {max(new_height, new_width) / min(new_height, new_width)}"
        )
        height = new_height
        width = new_width
    # Snap each side to the nearest multiple of `factor`, never below `factor`.
    h_bar = max(factor, round_by_factor(height, factor))
    w_bar = max(factor, round_by_factor(width, factor))
    if h_bar * w_bar > max_pixels:
        # Too many pixels: scale down by beta, rounding down to stay <= max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = floor_by_factor(height / beta, factor)
        w_bar = floor_by_factor(width / beta, factor)
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: scale up by beta, rounding up to stay >= min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = ceil_by_factor(height * beta, factor)
        w_bar = ceil_by_factor(width * beta, factor)
    if min_pixels > h_bar * w_bar or h_bar * w_bar > max_pixels:
        raise ValueError(f"encounter invalid h_bar: {h_bar}, w_bar: {w_bar}")
    return h_bar, w_bar
def is_scaled_image(image: np.ndarray) -> bool:
    """
    Check if image pixel values are already normalized to [0, 1] range.

    Args:
        image: Input image array

    Returns:
        bool: True if image is already scaled
    """
    if image.dtype != np.uint8:
        # Floating images may still carry 0-255 values; inspect the range.
        return np.min(image) >= 0 and np.max(image) <= 1
    # uint8 is raw 0-255 pixel data by definition.
    return False
class ImageProcessor(BaseImageProcessor):
"""
Adaptive image processor for dynamic image resizing and preprocessing.
@@ -26,14 +26,14 @@ from PIL import Image
from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
from .image_processor import ImageProcessor
from .process_video import sample_frames
FRAME_FACTOR = 2
FPS = 2.0
@@ -1,100 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional, Union
import numpy as np
from fastdeploy.utils import data_processor_logger
from .image_processor import ceil_by_factor, floor_by_factor
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
):
    """
    Sample frames from video according to specified criteria.

    Args:
        frame_factor: Ensure sampled frames are multiples of this factor
        min_frames: Minimum number of frames to sample
        max_frames: Maximum number of frames to sample
        metadata: Video metadata containing fps information; must provide
            "num_of_frame" (and "fps" when sampling by `fps`)
        fps: Target frames per second for sampling (-1 means "not given")
        num_frames: Exact number of frames to sample (-1 means "not given")

    Returns:
        np.ndarray: int32 indices of the frames to keep

    Raises:
        ValueError: If both fps and num_frames are specified,
            or if required metadata is missing,
            or if requested frames exceed available frames
    """
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    # NOTE(review): metadata is dereferenced here before the None check in the
    # fps branch below, so metadata=None raises TypeError — confirm intended.
    total_num_frames = metadata["num_of_frame"]
    # If num_frames is not given but fps is, calculate num_frames from fps
    if num_frames > 0:
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        if metadata is None:
            raise ValueError(
                "Asked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. "
                "Please pass in `VideoMetadata` object or use a fixed `num_frames` per input video"
            )
        # max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
        num_frames = total_num_frames / metadata["fps"] * fps
        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
        # Clamp to [min_frames, max_frames] and the real frame count, then
        # round down to a multiple of frame_factor.
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)
    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds `total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )
    # Hack code ensures that num_frames can always be divided by 4,
    # because sched/resource_manager_v1.py expands grid_thw as
    # grid_thw.extend([[2, h, w]] * (t // 2))
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4  # round down to a multiple of 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
    # Calculate frame indices based on sampling strategy
    if num_frames > 0:
        # Evenly spaced sampling for target frame count
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # Keep all frames if no sampling requested
        indices = np.arange(0, total_num_frames).astype(np.int32)
    return indices
+272
View File
@@ -0,0 +1,272 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared video utilities: VideoReaderWrapper, read_video_decord, and sample_frames."""
import io
import math
import os
from tempfile import NamedTemporaryFile as ntf
from typing import Optional, Union
import numpy as np
from fastdeploy.input.image_processors.common import ceil_by_factor, floor_by_factor
from fastdeploy.utils import data_processor_logger
# Public API of this module — star-imports and API docs pick these up.
__all__ = [
    "VideoReaderWrapper",
    "read_video_decord",
    "sample_frames",
    "sample_frames_qwen",
    "sample_frames_paddleocr",
]
# ---------------------------------------------------------------------------
# VideoReaderWrapper
# ---------------------------------------------------------------------------
def _is_gif(data: bytes) -> bool:
"""Check if bytes represent a GIF based on magic header."""
return data[:6] in (b"GIF87a", b"GIF89a")
class VideoReaderWrapper:
    """decord.VideoReader wrapper that fixes a memory leak and adds GIF support.

    Reference: https://github.com/dmlc/decord/issues/208
    """

    def __init__(self, video_path, *args, **kwargs):
        # video_path may be a filesystem path (str), raw bytes, or io.BytesIO.
        # decord/moviepy are imported lazily so importing this module does not
        # require these heavy optional dependencies.
        import decord

        try:
            # moviepy 1.0
            import moviepy.editor as mp
        except Exception:
            # moviepy 2.0
            import moviepy as mp

        # decord cannot read GIFs directly, so GIF input (detected by file
        # suffix or magic header) is first converted to a temporary mp4.
        with ntf(delete=True, suffix=".gif") as gif_file:
            gif_input = None
            self.original_file = None  # only set when we create a temp file
            if isinstance(video_path, str):
                if video_path.lower().endswith(".gif"):
                    gif_input = video_path
            elif isinstance(video_path, bytes):
                if _is_gif(video_path):
                    gif_file.write(video_path)
                    gif_file.flush()
                    gif_input = gif_file.name
            elif isinstance(video_path, io.BytesIO):
                # Peek at the buffer without disturbing the caller's position.
                video_path.seek(0)
                tmp_bytes = video_path.read()
                video_path.seek(0)
                if _is_gif(tmp_bytes):
                    gif_file.write(tmp_bytes)
                    gif_file.flush()
                    gif_input = gif_file.name
            if gif_input is not None:
                clip = mp.VideoFileClip(gif_input)
                mp4_file = ntf(delete=False, suffix=".mp4")
                mp4_path = mp4_file.name
                mp4_file.close()  # close before moviepy writes
                clip.write_videofile(mp4_path, verbose=False, logger=None)
                clip.close()
                video_path = mp4_path
                self.original_file = video_path  # temp mp4, cleaned up in __del__
            self._reader = decord.VideoReader(video_path, *args, **kwargs)
            self._reader.seek(0)

    def __len__(self):
        # Number of frames in the video.
        return len(self._reader)

    def __getitem__(self, key):
        # Reset the internal cursor after every read; leaving it mid-stream is
        # what triggers the decord memory leak (see class docstring).
        frames = self._reader[key]
        self._reader.seek(0)
        return frames

    def get_avg_fps(self):
        """Average frames-per-second reported by the underlying reader."""
        return self._reader.get_avg_fps()

    def seek(self, pos):
        """Move the underlying reader's cursor to frame ``pos``."""
        return self._reader.seek(pos)

    def __del__(self):
        # Best-effort removal of the temporary mp4 produced from a GIF input.
        # getattr guards against __init__ failing before the attribute is set.
        original_file = getattr(self, "original_file", None)
        if original_file:
            try:
                os.remove(original_file)
            except OSError:
                pass
# ---------------------------------------------------------------------------
# read_video_decord
# ---------------------------------------------------------------------------
def read_video_decord(video_path, save_to_disk: bool = False):
    """Open a video and return ``(video_reader, video_meta, video_path)``.

    ``video_meta`` carries "fps", "duration" (seconds) and "num_of_frame".
    ``save_to_disk`` is accepted for interface compatibility.
    """
    if isinstance(video_path, VideoReaderWrapper):
        # Already wrapped: reuse it as-is.
        video_reader = video_path
    else:
        if isinstance(video_path, bytes):
            # decord reads file-like objects, not raw bytes.
            video_path = io.BytesIO(video_path)
        video_reader = VideoReaderWrapper(video_path, num_threads=1)
    frame_count = len(video_reader)
    avg_fps = video_reader.get_avg_fps()
    video_meta = {
        "fps": avg_fps,
        "duration": frame_count / float(avg_fps),
        "num_of_frame": frame_count,
    }
    return video_reader, video_meta, video_path
# ---------------------------------------------------------------------------
# sample_frames — qwen_vl variant
# ---------------------------------------------------------------------------
def sample_frames_qwen(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
) -> np.ndarray:
    """Pick frame indices for the qwen_vl pipeline.

    Sentinel defaults are -1 (meaning "unset"). With ``num_frames`` the count
    is rounded to a multiple of ``frame_factor``; with ``fps`` it is derived
    from the source frame rate and clamped into ``[min_frames, max_frames]``
    (both snapped to ``frame_factor``). Counts above 2 are additionally
    snapped down to a multiple of 4. Returns an int32 index array; all frames
    when neither argument is set.
    """
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    if metadata is None:
        raise ValueError("metadata is required for sample_frames_qwen")

    total_num_frames = metadata["num_of_frame"]
    if num_frames > 0:
        # Snap the explicit request to the nearest multiple of frame_factor.
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
        num_frames = total_num_frames / metadata["fps"] * fps
        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
        num_frames = min(max(num_frames, min_frames), max_frames, total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    # num_frames must be divisible by 4
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(max(num_frames, min_frames), max_frames, total_num_frames)

    if num_frames <= 0:
        # Neither fps nor num_frames requested: keep every frame.
        return np.arange(0, total_num_frames).astype(np.int32)
    # Evenly spaced sampling across the (possibly truncated) frame range.
    return np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
# ---------------------------------------------------------------------------
# sample_frames — paddleocr_vl / ernie4_5_vl variant
# ---------------------------------------------------------------------------
def sample_frames_paddleocr(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
) -> np.ndarray:
    """Pick frame indices for the paddleocr_vl / ernie4_5_vl pipeline.

    ``fps`` and ``num_frames`` default to None (unset) and are mutually
    exclusive. With ``num_frames`` the count is rounded to a multiple of
    ``frame_factor``; with ``fps`` it is derived from the source frame rate
    and clamped into ``[min_frames, max_frames]``. Unlike the qwen variant
    there is no divisible-by-4 correction. Returns an int32 index array;
    all frames when neither argument is set.
    """
    fps = fps or 0
    num_frames = num_frames or 0
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    if metadata is None:
        raise ValueError("metadata is required for sample_frames_paddleocr")

    total_num_frames = metadata["num_of_frame"]
    if num_frames > 0:
        # Snap the explicit request to the nearest multiple of frame_factor.
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        # Clamp the fps-derived count and snap it down to frame_factor.
        max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        num_frames = total_num_frames / metadata["fps"] * fps
        num_frames = min(max(num_frames, min_frames), max_frames, total_num_frames)
        num_frames = math.floor(num_frames / frame_factor) * frame_factor

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    if num_frames <= 0:
        # Neither fps nor num_frames requested: keep every frame.
        return np.arange(0, total_num_frames).astype(np.int32)
    step = total_num_frames / num_frames
    return np.arange(0, total_num_frames, step).astype(np.int32)
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
    variant: str = "paddleocr",
) -> np.ndarray:
    """Route to the variant-specific sampler ('paddleocr' by default)."""
    if variant == "paddleocr":
        return sample_frames_paddleocr(frame_factor, min_frames, max_frames, metadata, fps, num_frames)
    if variant == "qwen":
        # The qwen variant uses -1 sentinels rather than None.
        qwen_fps = -1 if fps is None else fps
        qwen_num_frames = -1 if num_frames is None else num_frames
        return sample_frames_qwen(frame_factor, min_frames, max_frames, metadata, qwen_fps, qwen_num_frames)
    raise ValueError(f"Unknown variant {variant!r}. Expected 'paddleocr' or 'qwen'.")
@@ -22,14 +22,16 @@ from PIL import Image
from fastdeploy.input.ernie4_5_vl_processor.image_preprocessor.image_preprocessor_adaptive import (
AdaptiveImageProcessor,
make_batched_images,
make_batched_videos,
)
from fastdeploy.input.image_processors.common import (
ceil_by_factor,
floor_by_factor,
is_scaled_image,
make_batched_images,
make_batched_videos,
round_by_factor,
smart_resize,
)
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
class TestImagePreprocessorAdaptive(unittest.TestCase):
+3 -4
View File
@@ -30,7 +30,7 @@ from fastdeploy.input.paddleocr_vl_processor.paddleocr_vl_processor import (
PaddleOCRVLProcessor,
)
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
from fastdeploy.input.paddleocr_vl_processor.process_video import sample_frames
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
MODULE_PATH = "fastdeploy.input.paddleocr_vl_processor.process"
@@ -86,7 +86,7 @@ class TestProcessVideo(unittest.TestCase):
def test_error_fps_without_metadata(self):
"""新增:测试 fps > 0 但 metadata 为 None"""
with self.assertRaises(TypeError) as context:
with self.assertRaises(ValueError) as context:
sample_frames(
frame_factor=self.frame_factor,
min_frames=self.min_frames,
@@ -95,8 +95,7 @@ class TestProcessVideo(unittest.TestCase):
fps=10,
metadata=None, # 缺失
)
# 验证是预期的 TypeError
self.assertIn("'NoneType' object is not subscriptable", str(context.exception))
self.assertIn("metadata is required", str(context.exception))
def test_num_frames_rounding(self):
"""新增:测试 num_frames 向 frame_factor 舍入"""
+1 -1
View File
@@ -21,7 +21,7 @@ import numpy as np
from PIL import Image
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
from fastdeploy.input.qwen_vl_processor.process_video import sample_frames
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
def mock_pil_image(height, width):
+365
View File
@@ -0,0 +1,365 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import unittest
from unittest.mock import MagicMock, patch
import numpy as np
from fastdeploy.input.video_utils import (
_is_gif,
read_video_decord,
sample_frames,
sample_frames_paddleocr,
sample_frames_qwen,
)
# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
# Byte fixtures: the two 6-byte GIF magic headers (plus padding) and one
# same-length blob that must not be detected as a GIF.
GIF87_HEADER = b"GIF87a" + b"\x00" * 10
GIF89_HEADER = b"GIF89a" + b"\x00" * 10
NOT_GIF = b"NOTGIF" + b"\x00" * 10
def _make_mock_reader(num_frames=100, fps=25.0):
"""Return a mock that mimics decord.VideoReader."""
reader = MagicMock()
reader.__len__ = MagicMock(return_value=num_frames)
reader.get_avg_fps = MagicMock(return_value=fps)
reader.seek = MagicMock(return_value=None)
frame = MagicMock()
frame.asnumpy = MagicMock(return_value=np.zeros((480, 640, 3), dtype=np.uint8))
reader.__getitem__ = MagicMock(return_value=frame)
return reader
# ---------------------------------------------------------------------------
# _is_gif
# ---------------------------------------------------------------------------
class TestIsGif(unittest.TestCase):
    """GIF magic-header detection on raw byte blobs."""

    def test_gif87a(self):
        self.assertIs(_is_gif(GIF87_HEADER), True)

    def test_gif89a(self):
        self.assertIs(_is_gif(GIF89_HEADER), True)

    def test_not_gif(self):
        self.assertIs(_is_gif(NOT_GIF), False)

    def test_short_bytes(self):
        # Fewer than 6 bytes can never match a 6-byte magic header.
        self.assertIs(_is_gif(b"GIF"), False)
# ---------------------------------------------------------------------------
# VideoReaderWrapper (mock decord + moviepy)
# ---------------------------------------------------------------------------
class TestVideoReaderWrapper(unittest.TestCase):
    """VideoReaderWrapper behaviour with decord and moviepy mocked out."""

    def _make_wrapper(self, video_path, mock_reader=None):
        """Construct a VideoReaderWrapper with decord mocked out."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        if mock_reader is None:
            mock_reader = _make_mock_reader()
        mock_decord = MagicMock()
        mock_decord.VideoReader.return_value = mock_reader
        # Patch sys.modules so the lazy `import decord` / `import moviepy`
        # inside __init__ resolve to mocks instead of the real libraries.
        with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
            wrapper = VideoReaderWrapper(video_path)
        wrapper._reader = mock_reader
        return wrapper

    def test_len(self):
        reader = _make_mock_reader(num_frames=42)
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        self.assertEqual(len(wrapper), 42)

    def test_getitem_resets_seek(self):
        # Indexing must rewind the underlying reader (decord leak workaround).
        reader = _make_mock_reader()
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        _ = wrapper[0]
        reader.seek.assert_called_with(0)

    def test_get_avg_fps(self):
        reader = _make_mock_reader(fps=30.0)
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        self.assertEqual(wrapper.get_avg_fps(), 30.0)

    def test_seek(self):
        reader = _make_mock_reader()
        wrapper = self._make_wrapper("/fake/video.mp4", reader)
        wrapper.seek(5)
        reader.seek.assert_called_with(5)

    def test_del_no_original_file(self):
        """__del__ should be a no-op when original_file is None."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        # object.__new__ bypasses __init__ (and its decord/moviepy imports).
        wrapper = object.__new__(VideoReaderWrapper)
        wrapper.original_file = None
        wrapper._reader = _make_mock_reader()
        # Should not raise
        wrapper.__del__()

    def test_del_removes_temp_file(self):
        """__del__ removes the file only when original_file is set."""
        import os
        import tempfile

        from fastdeploy.input.video_utils import VideoReaderWrapper

        with tempfile.NamedTemporaryFile(delete=False) as f:
            tmp_path = f.name
        wrapper = object.__new__(VideoReaderWrapper)
        wrapper.original_file = tmp_path
        wrapper._reader = _make_mock_reader()
        wrapper.__del__()
        self.assertFalse(os.path.exists(tmp_path))

    def test_non_gif_string_path_does_not_set_original_file(self):
        """Passing a non-GIF string path must NOT set original_file (bug fix)."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        mock_reader = _make_mock_reader()
        mock_decord = MagicMock()
        mock_decord.VideoReader.return_value = mock_reader
        with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
            wrapper = VideoReaderWrapper("/fake/video.mp4")
        self.assertIsNone(wrapper.original_file)

    def test_bytesio_non_gif_path_does_not_set_original_file(self):
        """Passing a BytesIO that is NOT a GIF must not set original_file."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        mock_reader = _make_mock_reader()
        mock_decord = MagicMock()
        mock_decord.VideoReader.return_value = mock_reader
        bio = io.BytesIO(NOT_GIF)
        with patch.dict("sys.modules", {"decord": mock_decord, "moviepy": MagicMock(), "moviepy.editor": MagicMock()}):
            wrapper = VideoReaderWrapper(bio)
        self.assertIsNone(wrapper.original_file)
# ---------------------------------------------------------------------------
# read_video_decord
# ---------------------------------------------------------------------------
class TestReadVideoDecord(unittest.TestCase):
    """read_video_decord: input-type handling and metadata computation."""

    def _patch_wrapper(self, num_frames=100, fps=25.0):
        """Return a context manager that replaces VideoReaderWrapper with a mock."""
        from fastdeploy.input import video_utils

        mock_wrapper = MagicMock()
        mock_wrapper.__len__ = MagicMock(return_value=num_frames)
        mock_wrapper.get_avg_fps = MagicMock(return_value=fps)
        return patch.object(video_utils, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper

    def test_existing_wrapper_passthrough(self):
        """Already-wrapped reader is returned as-is."""
        from fastdeploy.input.video_utils import VideoReaderWrapper

        # spec= makes the isinstance check in read_video_decord succeed.
        mock_wrapper = MagicMock(spec=VideoReaderWrapper)
        mock_wrapper.__len__ = MagicMock(return_value=50)
        mock_wrapper.get_avg_fps = MagicMock(return_value=10.0)
        reader, meta, path = read_video_decord(mock_wrapper)
        self.assertIs(reader, mock_wrapper)
        self.assertEqual(meta["num_of_frame"], 50)
        self.assertAlmostEqual(meta["fps"], 10.0)
        self.assertAlmostEqual(meta["duration"], 5.0)  # 50 frames / 10 fps

    def test_bytes_input_converted_to_bytesio(self):
        """bytes input is converted to BytesIO before creating VideoReaderWrapper."""
        from fastdeploy.input import video_utils

        captured = []  # records the path argument the fake wrapper receives

        class FakeWrapper:
            def __init__(self, path, *args, **kwargs):
                captured.append(path)

            def __len__(self):
                return 30

            def get_avg_fps(self):
                return 10.0

        with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
            reader, meta, path = read_video_decord(b"fake_video_bytes")
        self.assertIsInstance(captured[0], io.BytesIO)

    def test_string_path_input(self):
        """String path is passed through to VideoReaderWrapper."""
        from fastdeploy.input import video_utils

        class FakeWrapper:
            def __init__(self, path, *args, **kwargs):
                pass

            def __len__(self):
                return 60

            def get_avg_fps(self):
                return 30.0

        with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
            reader, meta, path = read_video_decord("/fake/path.mp4")
        self.assertEqual(meta["num_of_frame"], 60)
        self.assertAlmostEqual(meta["duration"], 2.0)  # 60 frames / 30 fps
        self.assertEqual(path, "/fake/path.mp4")
# ---------------------------------------------------------------------------
# sample_frames_qwen
# ---------------------------------------------------------------------------
class TestSampleFramesQwen(unittest.TestCase):
    """qwen_vl frame sampling: -1 sentinels and divisible-by-4 snapping."""

    META = {"num_of_frame": 100, "fps": 25.0}

    def test_num_frames_basic(self):
        out = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
        self.assertEqual(out.size, 8)

    def test_fps_basic(self):
        out = sample_frames_qwen(2, 4, 100, self.META, fps=2.0)
        self.assertTrue(out.size > 0)
        self.assertEqual(out.size % 2, 0)

    def test_fps_and_num_frames_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_qwen(2, 4, 100, self.META, fps=2.0, num_frames=10)

    def test_num_frames_exceeds_total_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_qwen(2, 4, 100, self.META, num_frames=200)

    def test_fps_warning_when_nframes_exceeds_total(self):
        """fps so high that computed num_frames > total → warning logged."""
        short_meta = {"num_of_frame": 10, "fps": 1.0}
        with self.assertLogs(level="WARNING"):
            sample_frames_qwen(2, 4, 100, short_meta, fps=100.0)

    def test_divisible_by_4_correction(self):
        """Result must be divisible by 4 when num_frames > 2."""
        out = sample_frames_qwen(2, 4, 100, self.META, fps=1.5)
        if out.size > 2:
            self.assertEqual(out.size % 4, 0)

    def test_no_sampling_returns_all_frames(self):
        """Both fps and num_frames at sentinel → return all frames."""
        out = sample_frames_qwen(2, 4, 100, self.META)
        self.assertEqual(out.size, 100)

    def test_indices_dtype(self):
        out = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
        self.assertEqual(out.dtype, np.int32)
# ---------------------------------------------------------------------------
# sample_frames_paddleocr
# ---------------------------------------------------------------------------
class TestSampleFramesPaddleocr(unittest.TestCase):
    """paddleocr_vl frame sampling: None sentinels, no %4 snapping."""

    META = {"num_of_frame": 100, "fps": 25.0}

    def test_num_frames_basic(self):
        out = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=10)
        self.assertEqual(out.size, 10)

    def test_fps_basic(self):
        out = sample_frames_paddleocr(1, 4, 100, self.META, fps=2.0)
        self.assertTrue(out.size > 0)

    def test_fps_and_num_frames_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_paddleocr(1, 4, 100, self.META, fps=2.0, num_frames=10)

    def test_num_frames_exceeds_total_raises(self):
        with self.assertRaises(ValueError):
            sample_frames_paddleocr(1, 4, 100, self.META, num_frames=200)

    def test_none_sentinels_no_sampling(self):
        """fps=None, num_frames=None → return all frames."""
        out = sample_frames_paddleocr(1, 4, 100, self.META)
        self.assertEqual(out.size, 100)

    def test_no_4_correction(self):
        """paddleocr variant does NOT apply %4 correction."""
        # 6 frames is not divisible by 4; paddleocr should keep it
        meta = {"num_of_frame": 100, "fps": 25.0}
        out = sample_frames_paddleocr(1, 1, 100, meta, num_frames=6)
        self.assertEqual(out.size, 6)

    def test_indices_dtype(self):
        out = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=8)
        self.assertEqual(out.dtype, np.int32)
# ---------------------------------------------------------------------------
# sample_frames dispatcher
# ---------------------------------------------------------------------------
class TestSampleFramesDispatcher(unittest.TestCase):
    """sample_frames(): variant routing and sentinel conversion."""

    META = {"num_of_frame": 100, "fps": 25.0}

    def test_default_variant_is_paddleocr(self):
        # wraps= keeps the real behaviour while recording the call.
        with patch("fastdeploy.input.video_utils.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
            sample_frames(1, 4, 100, self.META, num_frames=8)
            mock_fn.assert_called_once()

    def test_qwen_variant_dispatched(self):
        with patch("fastdeploy.input.video_utils.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
            sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
            mock_fn.assert_called_once()

    def test_qwen_none_fps_converted_to_sentinel(self):
        """None fps/num_frames → converted to -1 before calling sample_frames_qwen."""
        with patch("fastdeploy.input.video_utils.sample_frames_qwen", return_value=np.array([])) as mock_fn:
            sample_frames(2, 4, 100, self.META, fps=None, num_frames=None, variant="qwen")
            # NOTE(review): assumes the dispatcher passes fps/num_frames
            # positionally — confirm against the dispatcher's call site.
            args = mock_fn.call_args[0]
            self.assertEqual(args[4], -1)  # fps sentinel
            self.assertEqual(args[5], -1)  # num_frames sentinel

    def test_paddleocr_variant_result_consistent(self):
        # Dispatcher output must match a direct paddleocr call.
        direct = sample_frames_paddleocr(1, 4, 100, self.META, num_frames=8)
        via_dispatcher = sample_frames(1, 4, 100, self.META, num_frames=8, variant="paddleocr")
        np.testing.assert_array_equal(direct, via_dispatcher)

    def test_qwen_variant_result_consistent(self):
        # Dispatcher output must match a direct qwen call.
        direct = sample_frames_qwen(2, 4, 100, self.META, num_frames=8)
        via_dispatcher = sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
        np.testing.assert_array_equal(direct, via_dispatcher)
# Allow running this test module directly: `python test_video_utils.py`.
if __name__ == "__main__":
    unittest.main()