mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[DataProcessor] Refactor multimodal processor: extract encoding strategies and unify MM processing pipeline (#7298)
* merge mm processor
This commit is contained in:
@@ -435,17 +435,7 @@ class BaseTextProcessor(ABC):
|
|||||||
request["top_k"] = 1
|
request["top_k"] = 1
|
||||||
|
|
||||||
if self.reasoning_parser:
|
if self.reasoning_parser:
|
||||||
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
|
self._apply_reasoning_parser(request)
|
||||||
parts = request["request_id"].split("_")
|
|
||||||
if len(parts) > 1:
|
|
||||||
real_req_id = parts[0]
|
|
||||||
index = int(parts[1])
|
|
||||||
n = request.get("n", 1)
|
|
||||||
for idx in range(index * n, (index + 1) * n):
|
|
||||||
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
|
|
||||||
else:
|
|
||||||
self.model_status_dict[request["request_id"]] = model_status
|
|
||||||
request["enable_thinking"] = model_status == "think_start"
|
|
||||||
|
|
||||||
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
||||||
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
||||||
@@ -453,6 +443,20 @@ class BaseTextProcessor(ABC):
|
|||||||
data_processor_logger.info(f"Processed request dict: {request}")
|
data_processor_logger.info(f"Processed request dict: {request}")
|
||||||
return request
|
return request
|
||||||
|
|
||||||
|
def _apply_reasoning_parser(self, request):
    """Resolve the model's thinking status for *request* and record it.

    Looks up the status via the reasoning parser, fans it out to every
    sub-request id when the request id carries an ``_<index>`` suffix
    (one bookkeeping entry per sampled candidate), and sets the request's
    ``enable_thinking`` flag accordingly.
    """
    status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
    req_id = request["request_id"]
    # NOTE(review): assumes the first "_"-separated field is the base id
    # and the second is an integer batch index — confirm id format.
    pieces = req_id.split("_")
    if len(pieces) <= 1:
        # Plain request id: a single status entry is enough.
        self.model_status_dict[req_id] = status
    else:
        base_id, index = pieces[0], int(pieces[1])
        num_candidates = request.get("n", 1)
        first = index * num_candidates
        for offset in range(num_candidates):
            self.model_status_dict[f"{base_id}_{first + offset}"] = status
    request["enable_thinking"] = status == "think_start"
|
||||||
|
|
||||||
def clear_request_status(self, task_id):
|
def clear_request_status(self, task_id):
|
||||||
"""Clear all per-request decode state and return the accumulated text."""
|
"""Clear all per-request decode state and return the accumulated text."""
|
||||||
results_all = ""
|
results_all = ""
|
||||||
|
|||||||
@@ -0,0 +1,23 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Multimodal encoding strategies for VL model families."""
|
||||||
|
|
||||||
|
from fastdeploy.input.encodings.base_encoding import BaseEncoding
|
||||||
|
from fastdeploy.input.encodings.ernie_encoding import ErnieEncoding
|
||||||
|
from fastdeploy.input.encodings.paddleocr_encoding import PaddleOCREncoding
|
||||||
|
from fastdeploy.input.encodings.qwen_encoding import QwenEncoding
|
||||||
|
from fastdeploy.input.encodings.registry import EncodingRegistry
|
||||||
|
|
||||||
|
__all__ = ["BaseEncoding", "EncodingRegistry", "ErnieEncoding", "PaddleOCREncoding", "QwenEncoding"]
|
||||||
@@ -0,0 +1,189 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Abstract base class for multimodal encoding strategies.
|
||||||
|
|
||||||
|
Each encoding strategy handles model-family-specific logic such as
|
||||||
|
position ID computation, image/video preprocessing, and token counting.
|
||||||
|
New model families should subclass ``BaseEncoding`` and implement all
|
||||||
|
abstract methods.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Any, Dict, Optional, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
class BaseEncoding(ABC):
    """Contract that every multimodal encoding strategy must fulfil.

    The abstract methods cover the core encoding pipeline (image/video
    ingestion, text positions, token counting, packing). ``init_extra``
    and ``get_mm_max_tokens_per_item`` are optional hooks with no-op
    defaults so subclasses only override what they actually need.
    """

    def __init__(self, processor, processor_kwargs=None):
        """Wire shared objects and common parameters from *processor*.

        Parameters
        ----------
        processor : object
            Owning processor; must expose ``cfg``, ``image_processor``
            and ``tokenizer``.
        processor_kwargs : dict | None
            Optional per-deployment overrides (conv sizes, video sampling).
        """
        kwargs = {} if processor_kwargs is None else processor_kwargs
        cfg = processor.cfg

        # Shared objects created by the processor, reused by the encoding.
        self.cfg = cfg
        self.image_processor = processor.image_processor
        self.tokenizer = processor.tokenizer

        # Convolution params: either from kwargs or from the image processor.
        if cfg.conv_params_from_kwargs:
            self.spatial_conv_size = kwargs.get("spatial_conv_size", 2)
            self.temporal_conv_size = kwargs.get("temporal_conv_size", 2)
        else:
            self.spatial_conv_size = self.image_processor.merge_size
            self.temporal_conv_size = self.image_processor.temporal_patch_size

        # Placeholder token ids for image/video content.
        self.image_token_id = self.tokenizer.convert_tokens_to_ids(cfg.image_token_str)
        self.video_token_id = self.tokenizer.convert_tokens_to_ids(cfg.video_token_str)

        if cfg.has_tokens_per_second:
            vision_config = getattr(getattr(processor, "config", None), "vision_config", None)
            self.tokens_per_second = getattr(vision_config, "tokens_per_second", 2)
        else:
            self.tokens_per_second = 2

        # Video frame-sampling defaults (overridable per item).
        self.fps = kwargs.get("video_fps", cfg.default_fps)
        self.min_frames = kwargs.get("video_min_frames", cfg.default_min_frames)
        self.max_frames = kwargs.get("video_max_frames", cfg.default_max_frames)
        self.target_frames = kwargs.get("video_target_frames", cfg.default_target_frames)

        # Model-family-specific extra initialisation.
        self.init_extra(kwargs)

    # ------------------------------------------------------------------
    # Image
    # ------------------------------------------------------------------
    @abstractmethod
    def add_image(self, img, outputs: dict, uuid, token_len=None):
        """Preprocess a raw image and append its tokens/features to *outputs*."""

    @abstractmethod
    def add_processed_image(self, img_cache, outputs: dict, uuid, token_len=None):
        """Append an already-preprocessed (cached) image to *outputs*."""

    # ------------------------------------------------------------------
    # Video
    # ------------------------------------------------------------------
    @abstractmethod
    def add_video(self, frames, outputs: dict, uuid, token_len=None, meta: Optional[dict] = None):
        """Preprocess decoded video *frames* and append them to *outputs*.

        ``token_len`` is the expected token count, used to validate
        pre-tokenised prompts. ``meta`` carries video metadata (fps,
        duration, ...); strategies that don't need it ignore the argument.
        """

    @abstractmethod
    def add_processed_video(self, frames_cache, outputs: dict, uuid, token_len=None):
        """Append an already-preprocessed (cached) video to *outputs*."""

    @abstractmethod
    def load_video(self, url, item: dict) -> Tuple[Any, dict]:
        """Decode the video at *url* and return a ``(frames, meta)`` pair.

        Every implementation returns a 2-tuple so callers can unpack
        uniformly.
        """

    # ------------------------------------------------------------------
    # Text / position helpers
    # ------------------------------------------------------------------
    @abstractmethod
    def add_text_positions(self, outputs: dict, num_tokens: int):
        """Append position ids for ``num_tokens`` text tokens to *outputs*."""

    @abstractmethod
    def append_completion_tokens(self, multimodal_inputs: dict, completion_token_ids):
        """Append completion token ids (and positions) to *multimodal_inputs*."""

    # ------------------------------------------------------------------
    # Prompt-token-ids path (optional — only strategies advertising
    # supports_prompt_token_ids=True implement this)
    # ------------------------------------------------------------------
    def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None) -> dict:
        """Build an outputs dict from pre-tokenised ``prompt_token_ids``.

        ``mm_items`` is the list of extracted multimodal items (each with
        'type', 'data', 'uuid'); ``None`` means text-only.
        """
        raise NotImplementedError(f"{type(self).__name__} does not support prompt_token_ids path")

    # ------------------------------------------------------------------
    # Token counting & packing
    # ------------------------------------------------------------------
    @staticmethod
    @abstractmethod
    def mm_num_tokens(grid_thw):
        """Number of multimodal tokens represented by *grid_thw*."""

    @abstractmethod
    def pack_position_ids(self, outputs: dict):
        """Convert intermediate position-id lists into the final packed form."""

    # ------------------------------------------------------------------
    # Outputs initialisation
    # ------------------------------------------------------------------
    def _make_outputs(self) -> dict:
        """Fresh accumulator dict for one encoding run.

        Subclasses override to add model-specific fields (e.g. fps).
        Field order is preserved for any caller that iterates the dict.
        """
        fields = (
            ("input_ids", list),
            ("token_type_ids", list),
            ("position_ids", list),
            ("images", list),
            ("grid_thw", list),
            ("image_type_ids", list),
            ("labels", list),
            ("cur_position", int),
            ("video_cnt", int),
            ("num_input_image_tokens", int),
            ("num_input_video_tokens", int),
            ("mm_positions", list),
            ("mm_hashes", list),
        )
        return {name: factory() for name, factory in fields}

    # ------------------------------------------------------------------
    # Optional hooks — subclasses override only when needed
    # ------------------------------------------------------------------
    def init_extra(self, processor_kwargs: dict):
        """Model-specific extra initialisation; the default is a no-op."""

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Dict[str, int]]:
        """Per-modality max token counts for the scheduler; ``None`` when N/A."""
        return None
|
||||||
@@ -0,0 +1,424 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Ernie4.5-VL encoding strategy for MultiModalProcessor."""
|
||||||
|
|
||||||
|
import copy
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
from paddleformers.transformers.image_utils import ChannelDimension
|
||||||
|
|
||||||
|
from fastdeploy.engine.request import ImagePosition
|
||||||
|
from fastdeploy.input.encodings.base_encoding import BaseEncoding
|
||||||
|
from fastdeploy.input.encodings.registry import EncodingRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import ERNIE4_5_VL
|
||||||
|
from fastdeploy.input.utils import IDS_TYPE_FLAG, MAX_IMAGE_DIMENSION
|
||||||
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
|
|
||||||
|
|
||||||
|
@EncodingRegistry.register(ERNIE4_5_VL)
|
||||||
|
class ErnieEncoding(BaseEncoding):
|
||||||
|
"""Encoding strategy for Ernie4.5-VL models."""
|
||||||
|
|
||||||
|
# Boundary token constants
|
||||||
|
IMG_START = "<|IMAGE_START|>"
|
||||||
|
IMG_END = "<|IMAGE_END|>"
|
||||||
|
VID_START = "<|VIDEO_START|>"
|
||||||
|
VID_END = "<|VIDEO_END|>"
|
||||||
|
|
||||||
|
def init_extra(self, processor_kwargs):
    """Ernie-specific setup: pixel budgets, frame sampling, token-type map."""
    pixel_defaults = (
        ("image_min_pixels", 4 * 28 * 28),
        ("image_max_pixels", 6177 * 28 * 28),
        ("video_min_pixels", 299 * 28 * 28),
        ("video_max_pixels", 1196 * 28 * 28),
    )
    for key, fallback in pixel_defaults:
        setattr(self, key, processor_kwargs.get(key, fallback))
    self.frames_sample = processor_kwargs.get("video_frames_sample", self.cfg.default_frames_sample)

    # Lookup table mapping ernie boundary tokens to their type flags.
    self.token_type_mapping = self._build_token_type_mapping()
|
||||||
|
|
||||||
|
def _build_token_type_mapping(self):
    """Map boundary tokens and the image placeholder id to the image flag.

    Unknown tokens default to the text flag.
    NOTE(review): keys mix token *strings* (boundary markers) and the
    integer placeholder id — confirm callers look up both forms.
    """
    image_flag = IDS_TYPE_FLAG["image"]
    mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"])
    boundary_tokens = (self.IMG_START, self.IMG_END, self.VID_START, self.VID_END)
    mapping.update({tok: image_flag for tok in boundary_tokens})
    mapping[self.image_token_id] = image_flag
    return mapping
|
||||||
|
|
||||||
|
def add_image(self, img, outputs, uuid, token_len=None):
    """Smart-resize, preprocess and append one raw image to *outputs*.

    Raises ValueError when ``token_len`` (from a pre-tokenised prompt)
    disagrees with the computed token count.
    """
    grid_h, grid_w = self.image_processor.get_smarted_resize(
        img.height,
        img.width,
        min_pixels=self.image_min_pixels,
        max_pixels=self.image_max_pixels,
    )[1]
    num_tokens = (grid_h * grid_w) // (self.spatial_conv_size**2)
    if token_len and token_len != num_tokens:
        raise ValueError("image tokens num not match the size")

    # Bookkeeping: placeholder tokens, type flags, and positions.
    outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
    outputs["input_ids"].extend([self.image_token_id] * num_tokens)
    outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
    outputs["num_input_image_tokens"] += num_tokens

    pos_ids = self._compute_3d_positions(1, grid_h, grid_w, outputs["cur_position"])
    outputs["position_ids"].extend(pos_ids)
    outputs["cur_position"] = np.max(pos_ids) + 1

    processed = self.image_processor.preprocess(
        images=[img.convert("RGB")],
        do_normalize=False,
        do_rescale=False,
        predetermined_grid_thw=np.array([[grid_h, grid_w]]),
        do_convert_rgb=True,
        input_data_format=ChannelDimension.LAST,
    )
    outputs["images"].append(processed["pixel_values"])
    # Hash the features only when no caller-supplied uuid exists.
    outputs["mm_hashes"].append(uuid if uuid else MultimodalHasher.hash_features(processed["pixel_values"]))
    outputs["grid_thw"].append(processed["image_grid_thw"])
    outputs["image_type_ids"].append(0)
|
||||||
|
|
||||||
|
def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
    """Append a cached (already-preprocessed) image to *outputs*.

    ``img_cache`` is a ``(patches, meta)`` pair where ``meta["thw"]``
    holds the (t, h, w) grid.
    """
    patches, meta = img_cache
    num_tokens = patches.shape[0] // (self.spatial_conv_size**2)
    if token_len and num_tokens != token_len:
        raise ValueError("image tokens num not match the size")

    outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
    outputs["input_ids"].extend([self.image_token_id] * num_tokens)
    outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
    outputs["num_input_image_tokens"] += num_tokens

    _, grid_h, grid_w = meta["thw"]
    pos_ids = self._compute_3d_positions(1, grid_h, grid_w, outputs["cur_position"])
    outputs["position_ids"].extend(pos_ids)
    outputs["cur_position"] = np.max(pos_ids) + 1

    outputs["images"].append(patches)
    outputs["mm_hashes"].append(uuid)
    outputs["grid_thw"].append(np.array([[1, grid_h, grid_w]]))
    outputs["image_type_ids"].append(0)
|
||||||
|
|
||||||
|
def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
    """Preprocess decoded *frames* and append video tokens/features to *outputs*.

    ``meta`` is accepted for interface parity but unused by ernie.
    Raises ValueError when ``token_len`` disagrees with the computed count.
    """
    first = frames[0]
    grid_h, grid_w = self.image_processor.get_smarted_resize(
        first.height,
        first.width,
        min_pixels=self.video_min_pixels,
        max_pixels=self.video_max_pixels,
    )[1]
    num_frames = len(frames)
    conv_divisor = self.spatial_conv_size**2 * self.temporal_conv_size
    num_tokens = (num_frames * grid_h * grid_w) // conv_divisor
    if token_len and num_tokens != token_len:
        raise ValueError("video tokens num not match the size")

    pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
    processed = self.image_processor.preprocess(
        images=None,
        videos=pixel_stack,
        do_normalize=False,
        do_rescale=False,
        predetermined_grid_thw=np.array([[grid_h, grid_w]] * num_frames),
        do_convert_rgb=True,
        input_data_format=ChannelDimension.LAST,
    )
    outputs["images"].append(processed["pixel_values_videos"])
    # Hash the features only when no caller-supplied uuid exists.
    outputs["mm_hashes"].append(
        uuid if uuid else MultimodalHasher.hash_features(processed["pixel_values_videos"])
    )
    outputs["grid_thw"].append(processed["video_grid_thw"])
    outputs["image_type_ids"].extend([1] * num_frames)

    outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
    outputs["input_ids"].extend([self.image_token_id] * num_tokens)
    outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
    outputs["num_input_video_tokens"] += num_tokens

    pos_ids = self._compute_3d_positions(num_frames, grid_h, grid_w, outputs["cur_position"])
    outputs["position_ids"].extend(pos_ids)
    outputs["cur_position"] = np.max(pos_ids) + 1
|
||||||
|
|
||||||
|
def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
    """Append a cached (already-preprocessed) video to *outputs*.

    ``frames_cache`` is a ``(patches, meta)`` pair where ``meta["thw"]``
    holds the (t, h, w) grid.
    """
    patches, meta = frames_cache
    conv_divisor = self.spatial_conv_size**2 * self.temporal_conv_size
    num_tokens = patches.shape[0] // conv_divisor
    if token_len and num_tokens != token_len:
        raise ValueError("video tokens num not match the size")

    grid_t, grid_h, grid_w = meta["thw"]
    outputs["images"].append(patches)
    outputs["mm_hashes"].append(uuid)
    outputs["grid_thw"].append(np.array([[grid_t, grid_h, grid_w]]))

    outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
    outputs["input_ids"].extend([self.image_token_id] * num_tokens)
    outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
    outputs["num_input_video_tokens"] += num_tokens
    outputs["image_type_ids"].extend([1] * grid_t)

    pos_ids = self._compute_3d_positions(grid_t, grid_h, grid_w, outputs["cur_position"])
    outputs["position_ids"].extend(pos_ids)
    outputs["cur_position"] = np.max(pos_ids) + 1
|
||||||
|
|
||||||
|
def load_video(self, url, item):
    """Decode a video, stamp timestamps onto frames, and return (frames, meta).

    Per-item overrides in *item* take precedence over the encoding's
    defaults; the frame list is padded to an even length so the temporal
    convolution always sees full frame pairs. The returned meta dict is
    empty (ernie does not need video metadata downstream).
    """
    from fastdeploy.input.utils.render_timestamp import render_frame_timestamp
    from fastdeploy.input.utils.video import read_frames_decord, read_video_decord

    reader, meta, path = read_video_decord(url, save_to_disk=False)

    defaults = {
        "fps": self.fps,
        "min_frames": self.min_frames,
        "max_frames": self.max_frames,
        "target_frames": self.target_frames,
        "frames_sample": self.frames_sample,
    }
    frame_args = {key: item.get(key, fallback) for key, fallback in defaults.items()}
    frame_args = self.set_video_frame_args(frame_args, meta)

    frames_data, _, timestamps = read_frames_decord(
        path,
        reader,
        meta,
        target_frames=frame_args["target_frames"],
        target_fps=frame_args["fps"],
        frames_sample=frame_args["frames_sample"],
        save_to_disk=False,
    )

    frames = [render_frame_timestamp(arr, ts) for arr, ts in zip(frames_data, timestamps)]
    if len(frames) % 2:
        # Duplicate the last frame so the temporal conv gets an even count.
        frames.append(copy.deepcopy(frames[-1]))
    return frames, {}
|
||||||
|
|
||||||
|
def set_video_frame_args(self, video_frame_args, video_meta):
    """Finalise frame-sampling arguments, validating their mutual constraints.

    Either a positive ``target_frames`` (with negative ``fps``) or a
    non-negative ``fps`` must be supplied. When sampling by fps, the
    estimated frame count is clamped into ``[min_frames, max_frames]``
    by switching to a fixed ``target_frames`` and disabling fps.
    """
    args = video_frame_args
    target, fps = args["target_frames"], args["fps"]
    lo, hi = args["min_frames"], args["max_frames"]

    if target > 0:
        # Fixed frame count: fps must be disabled and target within bounds.
        if fps >= 0:
            raise ValueError("fps must be negative if target_frames is given")
        if lo > 0 and target < lo:
            raise ValueError("target_frames must be larger than min_frames")
        if hi > 0 and target > hi:
            raise ValueError("target_frames must be smaller than max_frames")
        return args

    # Sampling by fps: estimate the count and clamp into [lo, hi].
    if fps < 0:
        raise ValueError("Must provide either positive target_fps or positive target_frames.")
    estimated = int(video_meta["duration"] * fps)
    if lo > 0 and hi > 0 and lo > hi:
        raise ValueError("min_frames must be smaller than max_frames")
    if lo > 0 and estimated < lo:
        args["target_frames"], args["fps"] = lo, -1
    if hi > 0 and estimated > hi:
        args["target_frames"], args["fps"] = hi, -1
    return args
|
||||||
|
|
||||||
|
def add_text_positions(self, outputs, num_tokens):
    """Append ``num_tokens`` text positions as ernie-style [p, p, p] triples."""
    base = outputs["cur_position"]
    outputs["position_ids"].extend([base + offset] * 3 for offset in range(num_tokens))
    outputs["cur_position"] = base + num_tokens
|
||||||
|
|
||||||
|
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
    """Append completion ids with text type flags and [p, p, p] positions."""
    count = len(completion_token_ids)
    multimodal_inputs["input_ids"].extend(completion_token_ids)
    multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * count)

    base = multimodal_inputs["cur_position"]
    multimodal_inputs["position_ids"].extend([base + k] * 3 for k in range(count))
    multimodal_inputs["cur_position"] = base + count
|
||||||
|
|
||||||
|
def _compute_3d_positions(self, t, h, w, start_idx):
    """Build ernie-format [t, h, w] position triples, offset by ``start_idx``.

    ``t == 1`` (still image) keeps a single temporal step; otherwise the
    temporal axis is reduced by the temporal conv size. Spatial axes are
    reduced by the spatial conv size.
    """
    grid_t = 1 if t == 1 else t // self.temporal_conv_size
    grid_h = h // self.spatial_conv_size
    grid_w = w // self.spatial_conv_size

    axis_t = np.repeat(np.arange(grid_t), grid_h * grid_w)
    axis_h = np.tile(np.repeat(np.arange(grid_h), grid_w), grid_t)
    axis_w = np.tile(np.arange(grid_w), grid_t * grid_h)

    return [[start_idx + a, start_idx + b, start_idx + c] for a, b, c in zip(axis_t, axis_h, axis_w)]
|
||||||
|
|
||||||
|
def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None):
    """Build the multimodal outputs dict from a pre-tokenised prompt.

    Parameters
    ----------
    prompt_token_ids : list[int]
        Pre-tokenised prompt, possibly containing IMAGE/VIDEO_START ...
        IMAGE/VIDEO_END placeholder spans.
    mm_items : list[dict] | None
        Extracted multimodal items ({'type', 'data', 'uuid'}); ``None``
        means a text-only prompt.

    Raises
    ------
    ValueError
        If a placeholder span is unterminated or the number of spans does
        not match the number of supplied images/videos.
    """
    outputs = self._make_outputs()
    total = len(prompt_token_ids)

    if mm_items is None:
        # Text-only fast path: every token is text with a [p, p, p] position.
        outputs["input_ids"].extend(prompt_token_ids)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * total)
        outputs["position_ids"].extend([i] * 3 for i in range(total))
        outputs["cur_position"] += total
        return outputs

    images, image_uuid = [], []
    videos, video_uuid = [], []
    for item in mm_items:
        kind = item.get("type")
        if kind == "image":
            images.append(item["data"])
            image_uuid.append(item.get("uuid"))
        elif kind == "video":
            videos.append(item["data"])
            video_uuid.append(item.get("uuid"))

    image_start_id = self.tokenizer.convert_tokens_to_ids(self.IMG_START)
    image_end_id = self.tokenizer.convert_tokens_to_ids(self.IMG_END)
    video_start_id = self.tokenizer.convert_tokens_to_ids(self.VID_START)
    video_end_id = self.tokenizer.convert_tokens_to_ids(self.VID_END)

    def append_single(token_id, type_flag):
        # Copy one literal token through with its flag and a scalar position.
        outputs["input_ids"].append(token_id)
        outputs["token_type_ids"].append(type_flag)
        outputs["position_ids"].append([outputs["cur_position"]] * 3)
        outputs["cur_position"] += 1

    def scan_placeholder(start, end_id, label):
        # Index of the matching end token; raises if the span never closes.
        idx = start
        while idx < total and prompt_token_ids[idx] != end_id:
            idx += 1
        if idx >= total:
            raise ValueError(f"{label} token ids not complete")
        return idx

    st, image_idx, video_idx = 0, 0, 0
    while st < total:
        cur_token_id = prompt_token_ids[st]
        if cur_token_id == image_start_id:
            if image_idx >= len(images):
                raise ValueError("prompt token ids has more image placeholder than in messages")
            append_single(cur_token_id, IDS_TYPE_FLAG["image"])
            st += 1
            end = scan_placeholder(st, image_end_id, "image")
            image = images[image_idx]
            uuid = image_uuid[image_idx] if image_uuid else None
            token_len = end - st
            if isinstance(image, tuple):
                # Cached pre-processed features.
                self.add_processed_image(image, outputs, uuid, token_len)
            else:
                self.add_image(image, outputs, uuid, token_len)
            image_idx += 1
            st = end
        elif cur_token_id == video_start_id:
            if video_idx >= len(videos):
                raise ValueError("prompt token ids has more video placeholder than in messages")
            append_single(cur_token_id, IDS_TYPE_FLAG["image"])
            st += 1
            end = scan_placeholder(st, video_end_id, "video")
            video = videos[video_idx]
            uuid = video_uuid[video_idx] if video_uuid else None
            token_len = end - st
            if isinstance(video, tuple):
                # Cached pre-processed features.
                self.add_processed_video(video, outputs, uuid, token_len)
            else:
                if isinstance(video, dict):
                    frames, _ = self.load_video(video["video"], video)
                else:
                    frames, _ = self.load_video(video, {})
                self.add_video(frames, outputs, uuid, token_len=token_len)
            video_idx += 1
            st = end
        else:
            # Plain token; end markers carry the image flag, everything
            # else is text.
            flag = IDS_TYPE_FLAG["image"] if cur_token_id in (image_end_id, video_end_id) else IDS_TYPE_FLAG["text"]
            append_single(cur_token_id, flag)
            st += 1

    if image_idx != len(images):
        raise ValueError("number of images does not match")
    if video_idx != len(videos):
        raise ValueError("number of videos does not match")

    return outputs
|
||||||
|
|
||||||
|
@staticmethod
def mm_num_tokens(grid_thw):
    """Token count(s) for *grid_thw*.

    Images (t == 1) yield ``t*h*w // 4``; videos additionally divide by 2
    for the temporal conv. Accepts a single (t, h, w) or a sequence of
    them, returning an int or a list respectively.
    """
    if isinstance(grid_thw, paddle.Tensor):
        grid_thw = grid_thw.numpy()
    if len(grid_thw) == 0:
        return 0

    def tokens_for(thw):
        t, h, w = map(int, thw)
        per_grid = t * h * w // 4
        return per_grid if t == 1 else per_grid // 2

    if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
        return [tokens_for(entry) for entry in grid_thw]
    return tokens_for(grid_thw)
|
||||||
|
|
||||||
|
def pack_position_ids(self, outputs):
    """Ernie packing: convert the accumulated list-of-lists position ids
    into an int64 ndarray and record the image patch token id."""
    packed = np.array(outputs["position_ids"], dtype=np.int64)
    outputs["position_ids"] = packed
    outputs["image_patch_id"] = self.image_token_id
|
||||||
|
|
||||||
|
def get_mm_max_tokens_per_item(self, seq_len):
    """Per-modality upper bound on placeholder tokens for a single item.

    Both bounds derive from the largest feasible resize target and are
    additionally capped by seq_len.
    """
    target_height, target_width = self._get_image_size_with_most_features()

    def _max_tokens(min_pixels, max_pixels, divisor):
        # get_smarted_resize returns (size, (patches_h, patches_w)); we need the patch grid
        patches_h, patches_w = self.image_processor.get_smarted_resize(
            height=target_height,
            width=target_width,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )[1]
        return min((patches_h * patches_w) // divisor, seq_len)

    return {
        "image": _max_tokens(
            self.image_min_pixels,
            self.image_max_pixels,
            self.spatial_conv_size**2,
        ),
        "video": _max_tokens(
            self.video_min_pixels,
            self.video_max_pixels,
            self.spatial_conv_size**2 * self.temporal_conv_size,
        ),
    }
|
||||||
|
|
||||||
|
def _get_image_size_with_most_features(self):
    """Resize target (height, width) yielding the most visual features
    under the configured image pixel limits."""
    # index [0] of get_smarted_resize is the resized (height, width) pair
    resized_h, resized_w = self.image_processor.get_smarted_resize(
        height=MAX_IMAGE_DIMENSION,
        width=MAX_IMAGE_DIMENSION,
        min_pixels=self.image_min_pixels,
        max_pixels=self.image_max_pixels,
    )[0]
    return (resized_h, resized_w)
|
||||||
@@ -0,0 +1,190 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""PaddleOCR-VL encoding strategy."""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from fastdeploy.engine.request import ImagePosition
|
||||||
|
from fastdeploy.input.encodings.qwen_encoding import QwenEncoding
|
||||||
|
from fastdeploy.input.encodings.registry import EncodingRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import PADDLEOCR_VL
|
||||||
|
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||||
|
from fastdeploy.input.utils.video import read_video_decord
|
||||||
|
from fastdeploy.input.utils.video import sample_frames_paddleocr as _sample_paddleocr
|
||||||
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
|
|
||||||
|
|
||||||
|
@EncodingRegistry.register(PADDLEOCR_VL)
class PaddleOCREncoding(QwenEncoding):
    """Encoding strategy for paddleocr_vl.

    Inherits from QwenEncoding and overrides the pieces that differ:

    - _make_outputs: adds ``vit_seqlen`` / ``vit_position_ids`` accumulators
    - add_image / add_processed_image / add_video / add_processed_video:
      additionally append the per-item ViT fields
    - add_video / add_processed_video: emit ``video_token_id`` placeholders
      instead of qwen's reuse of ``image_token_id``
    - load_video: samples frames with ``sample_frames_paddleocr`` instead of
      ``sample_frames_qwen``
    """

    def _make_outputs(self) -> dict:
        """Extend the qwen outputs dict with paddleocr-specific ViT fields."""
        outputs = super()._make_outputs()
        outputs["vit_seqlen"] = []
        outputs["vit_position_ids"] = []
        return outputs

    @staticmethod
    def _append_vit_fields(outputs, h, w):
        """Record flattened patch count and per-patch position ids for one item.

        Shared by all four add_* paths (previously duplicated inline).
        """
        numel = h * w
        outputs["vit_seqlen"].append(numel)
        # np.arange(numel) % numel is identically np.arange(numel); keep the
        # simple form (the modulo also warned on numel == 0).
        outputs["vit_position_ids"].append(np.arange(numel))

    def add_image(self, img, outputs, uuid, token_len=None):
        """Preprocess one PIL image; append placeholder tokens, features,
        positions and the paddleocr ViT fields.

        Raises ValueError when token_len (placeholder count found in the
        prompt) disagrees with the processor's computed token count.
        """
        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += int(num_tokens)

        outputs["images"].append(ret["pixel_values"])
        if not uuid:
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].append(0)

        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        # images carry no temporal information
        outputs["fps"].append(0)

        self._append_vit_fields(outputs, h, w)

    def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
        """Append a cached (already preprocessed) image via the qwen path,
        then the paddleocr ViT fields."""
        super().add_processed_image(img_cache, outputs, uuid, token_len)
        _, h, w = img_cache[1]["thw"]
        self._append_vit_fields(outputs, h, w)

    def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
        """Preprocess sampled video frames; append placeholder tokens,
        features, positions and the paddleocr ViT fields."""
        preprocess_kwargs = {}
        if self.cfg.video_min_pixels is not None:
            preprocess_kwargs["min_pixels"] = self.cfg.video_min_pixels
            preprocess_kwargs["max_pixels"] = self.cfg.video_max_pixels

        ret = self.image_processor.preprocess(images=frames, **preprocess_kwargs)

        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        # paddleocr uses a dedicated video placeholder token (qwen reuses the image token)
        outputs["input_ids"].extend([self.video_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += int(num_tokens)

        outputs["images"].append(ret["pixel_values"])
        if not uuid:
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].extend([1] * grid_thw[0])

        fps = meta["fps"] if meta else 0
        second_per_grid_t = self.temporal_conv_size / fps if fps else 0
        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        outputs["fps"].append(fps)

        self._append_vit_fields(outputs, h, w)

    def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
        """Append a cached (already preprocessed) video's tokens, positions
        and the paddleocr ViT fields."""
        frames, meta = frames_cache
        num_tokens = frames.shape[0] // self.image_processor.merge_size**2
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")

        t, h, w = meta["thw"]
        outputs["images"].append(frames)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[t, h, w]]))

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.video_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += num_tokens
        outputs["image_type_ids"].extend([1] * t)

        fps = meta["fps"]
        second_per_grid_t = self.temporal_conv_size / fps
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        outputs["fps"].append(fps)

        self._append_vit_fields(outputs, h, w)

    def load_video(self, url, item):
        """Decode a video and sample frames with the paddleocr sampler.

        Returns (frames, meta): frames is a stacked uint8 RGB ndarray of
        shape (num_frames, H, W, 3); meta carries fps / duration /
        num_of_frame for downstream position computation.
        """
        reader, meta, _ = read_video_decord(url, save_to_disk=False)

        fps = item.get("fps", self.fps)
        num_frames = item.get("target_frames", self.target_frames)

        frame_indices = list(range(meta["num_of_frame"]))
        if fps > 0 or num_frames > 0:
            min_frames = item.get("min_frames", self.min_frames)
            max_frames = item.get("max_frames", self.max_frames)

            frame_indices = _sample_paddleocr(
                frame_factor=self.temporal_conv_size,
                min_frames=min_frames,
                max_frames=max_frames,
                metadata=meta,
                fps=fps,
                num_frames=num_frames,
            )

        meta["num_of_frame"] = len(frame_indices)
        # truthiness guard: the previous `is not None` check divided by zero
        # when fps == 0; a zero fps now falls back to duration-derived fps.
        if fps:
            meta["fps"] = fps
            meta["duration"] = len(frame_indices) / fps
        else:
            meta["fps"] = len(frame_indices) / meta["duration"]

        # decord frames are already HxWx3 uint8 RGB; stack directly instead of
        # round-tripping through PIL.
        frames = np.stack([reader[idx].asnumpy() for idx in frame_indices], axis=0)

        return frames, meta
|
||||||
@@ -0,0 +1,314 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Qwen-family (qwen_vl / qwen3_vl) encoding strategy."""
|
||||||
|
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from fastdeploy.engine.request import ImagePosition
|
||||||
|
from fastdeploy.input.encodings.base_encoding import BaseEncoding
|
||||||
|
from fastdeploy.input.encodings.registry import EncodingRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import QWEN3_VL, QWEN_VL
|
||||||
|
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||||
|
from fastdeploy.input.utils.video import read_video_decord
|
||||||
|
from fastdeploy.input.utils.video import sample_frames_qwen as _sample_qwen
|
||||||
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
|
|
||||||
|
|
||||||
|
@EncodingRegistry.register(QWEN_VL, QWEN3_VL)
class QwenEncoding(BaseEncoding):
    """Encoding strategy shared by qwen_vl and qwen3_vl."""

    # Sampled frame counts are rounded to a multiple of this factor.
    FRAME_FACTOR = 2

    def _make_outputs(self) -> dict:
        """Qwen outputs additionally accumulate a per-item fps list."""
        outputs = super()._make_outputs()
        outputs["fps"] = []
        return outputs

    def add_image(self, img, outputs, uuid, token_len=None):
        """Preprocess one PIL image; append placeholder tokens, features and positions.

        Raises ValueError if token_len (placeholder count from the prompt)
        disagrees with the processor's computed token count.
        """
        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += int(num_tokens)

        outputs["images"].append(ret["pixel_values"])
        outputs["mm_hashes"].append(uuid if uuid else MultimodalHasher.hash_features(ret["pixel_values"]))
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].append(0)

        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        # images carry no temporal information
        outputs["fps"].append(0)

    def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
        """Append a cached (already preprocessed) image's tokens and positions."""
        img, meta = img_cache
        num_tokens = img.shape[0] // self.image_processor.merge_size**2
        if token_len is not None and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += num_tokens

        _, h, w = meta["thw"]
        pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        outputs["images"].append(img)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[1, h, w]]))
        outputs["image_type_ids"].append(0)

        outputs["fps"].append(0)

    def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
        """Preprocess sampled frames of one video; append tokens, features and positions."""
        preprocess_kwargs = {}
        # qwen3_vl configures distinct pixel bounds for video
        if self.cfg.video_min_pixels is not None:
            preprocess_kwargs["min_pixels"] = self.cfg.video_min_pixels
            preprocess_kwargs["max_pixels"] = self.cfg.video_max_pixels

        ret = self.image_processor.preprocess(images=frames, **preprocess_kwargs)

        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        # qwen reuses the image placeholder token for video frames; only the
        # token_type_ids distinguish the modality here
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += int(num_tokens)

        outputs["images"].append(ret["pixel_values"])
        outputs["mm_hashes"].append(uuid if uuid else MultimodalHasher.hash_features(ret["pixel_values"]))
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].extend([1] * grid_thw[0])

        fps = meta["fps"] if meta else 0
        second_per_grid_t = self.temporal_conv_size / fps if fps else 0
        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        outputs["fps"].append(fps)

    def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
        """Append a cached (already preprocessed) video's tokens and positions."""
        frames, meta = frames_cache
        num_tokens = frames.shape[0] // self.image_processor.merge_size**2
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")

        t, h, w = meta["thw"]
        outputs["images"].append(frames)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[t, h, w]]))

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += num_tokens
        outputs["image_type_ids"].extend([1] * t)

        fps = meta["fps"]
        second_per_grid_t = self.temporal_conv_size / fps
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        outputs["fps"].append(fps)

    def load_video(self, url, item):
        """Decode a video with decord and sample frames with the qwen sampler.

        Returns (frames, meta): frames is a stacked RGB ndarray; meta carries
        fps / duration / num_of_frame.
        """
        reader, meta, _ = read_video_decord(url, save_to_disk=False)

        fps = item.get("fps", self.fps)
        num_frames = item.get("target_frames", self.target_frames)

        frame_indices = list(range(meta["num_of_frame"]))
        if fps > 0 or num_frames > 0:
            frame_indices = _sample_qwen(
                frame_factor=self.FRAME_FACTOR,
                min_frames=item.get("min_frames", self.min_frames),
                max_frames=item.get("max_frames", self.max_frames),
                metadata=meta,
                # an explicit target frame count overrides fps-based sampling
                fps=-1 if num_frames > 0 else fps,
                num_frames=num_frames,
            )

        meta["num_of_frame"] = len(frame_indices)
        if fps is not None:
            # NOTE(review): fps == 0 would divide by zero here — presumably
            # callers always pass a positive fps or None; confirm.
            meta["fps"] = fps
            meta["duration"] = len(frame_indices) / fps
        else:
            meta["fps"] = len(frame_indices) / meta["duration"]

        pil_frames = [Image.fromarray(reader[idx].asnumpy(), "RGB") for idx in frame_indices]
        frames = np.stack([np.array(f.convert("RGB")) for f in pil_frames], axis=0)

        return frames, meta

    def add_text_positions(self, outputs, num_tokens):
        """Append text position IDs in the qwen 3xN ndarray format."""
        pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
        """Append completion (text) token ids with matching type flags and positions."""
        num_tokens = len(completion_token_ids)
        multimodal_inputs["input_ids"].extend(completion_token_ids)
        multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)

        pos_ids = self._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
        multimodal_inputs["position_ids"].append(pos_ids)
        multimodal_inputs["cur_position"] += num_tokens

    def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None):
        """Rebuild outputs from existing prompt token ids. Only qwen3_vl supports this."""
        outputs = self._make_outputs()
        total = len(prompt_token_ids)

        if mm_items is None:
            self._add_text_tokens(prompt_token_ids, outputs)
            return outputs

        st, mm_idx = 0, 0
        while st < total:
            if prompt_token_ids[st] != self.image_token_id:
                # consume a maximal run of text tokens
                end = st
                while end < total and prompt_token_ids[end] != self.image_token_id:
                    end += 1
                self._add_text_tokens(prompt_token_ids[st:end], outputs)
                st = end
                continue

            if mm_idx >= len(mm_items):
                raise ValueError("prompt token ids has more multimodal placeholder than in messages")

            # consume a maximal run of multimodal placeholder tokens
            end = st
            while end < total and prompt_token_ids[end] == self.image_token_id:
                end += 1

            item = mm_items[mm_idx]
            uuid = item.get("uuid")
            token_len = end - st
            if item.get("type") == "image":
                image = item.get("data")
                if isinstance(image, tuple):
                    self.add_processed_image(image, outputs, uuid, token_len)
                else:
                    self.add_image(image, outputs, uuid, token_len)
            elif item.get("type") == "video":
                video = item.get("data")
                if isinstance(video, tuple):
                    self.add_processed_video(video, outputs, uuid, token_len)
                else:
                    if isinstance(video, dict):
                        frames, meta = self.load_video(video["video"], video)
                    else:
                        frames, meta = self.load_video(video, {})
                    self.add_video(frames, outputs, uuid, token_len=token_len, meta=meta)
            else:
                raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
            mm_idx += 1
            st = end

        if mm_idx != len(mm_items):
            raise ValueError("number of multimodal items does not match prompt token ids")

        return outputs

    def _add_text_tokens(self, tokens, outputs):
        """Append text tokens plus their type flags and position ids; no-op for empty."""
        if not tokens:
            return
        num_tokens = len(tokens)
        outputs["input_ids"].extend(tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
        self.add_text_positions(outputs, num_tokens)

    def _compute_text_positions(self, start_pos, num_tokens):
        """3xN ndarray for text: identical t/h/w indices per token."""
        idx = np.broadcast_to(np.arange(num_tokens).reshape(1, -1), (3, num_tokens))
        return idx + start_pos

    def _compute_vision_positions(self, start_pos, t, h, w, second_per_grid_t):
        """3xN ndarray of (t, h, w) position ids for one visual item."""
        h //= self.spatial_conv_size
        w //= self.spatial_conv_size

        # NOTE(review): int(second_per_grid_t) truncates fractional grid
        # intervals to whole seconds — confirm this is intentional.
        tn = np.broadcast_to(np.arange(t).reshape(-1, 1), (t, h * w))
        t_index = (tn * int(second_per_grid_t) * self.tokens_per_second).flatten()

        h_index = np.broadcast_to(np.arange(h).reshape(1, -1, 1), (t, h, w)).flatten()
        w_index = np.broadcast_to(np.arange(w).reshape(1, 1, -1), (t, h, w)).flatten()

        return np.stack([t_index, h_index, w_index]) + start_pos

    @staticmethod
    def mm_num_tokens(grid_thw):
        """Qwen token count per grid: t * h * w // 4."""
        if isinstance(grid_thw, paddle.Tensor):
            grid_thw = grid_thw.numpy()
        if len(grid_thw) == 0:
            return 0

        def _count(thw):
            t, h, w = (int(v) for v in thw)
            return t * h * w // 4

        if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
            return [_count(item) for item in grid_thw]
        return _count(grid_thw)

    def pack_position_ids(self, outputs):
        """Qwen packing: concatenate the 3xN arrays, then transpose to Nx3."""
        packed = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
        outputs["position_ids"] = packed
        outputs["image_patch_id"] = self.image_token_id
        outputs["video_patch_id"] = self.video_token_id
        outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Registry for multimodal encoding strategy classes."""
|
||||||
|
|
||||||
|
from typing import Dict, Type
|
||||||
|
|
||||||
|
|
||||||
|
class EncodingRegistry:
    """Maps model_type strings to encoding strategy classes.

    Encoding classes register themselves via the ``register`` decorator
    at import time. ``MultiModalProcessor`` queries this registry by
    *model_type* instead of using string-based dynamic imports.
    """

    _registry: Dict[str, Type] = {}

    @classmethod
    def register(cls, *model_types: str):
        """Decorator that registers an encoding class for one or more model types."""

        def decorator(enc_cls):
            for mt in model_types:
                existing = cls._registry.get(mt)
                if existing is not None:
                    raise ValueError(
                        f"Encoding for '{mt}' already registered "
                        f"as {existing.__name__}, "
                        f"cannot re-register as {enc_cls.__name__}"
                    )
                cls._registry[mt] = enc_cls
            return enc_cls

        return decorator

    @classmethod
    def get(cls, model_type: str) -> Type:
        """Look up the encoding class for a given *model_type*."""
        try:
            return cls._registry[model_type]
        except KeyError:
            raise ValueError(
                f"No encoding registered for '{model_type}'. " f"Available: {sorted(cls._registry.keys())}"
            ) from None
|
||||||
@@ -539,6 +539,7 @@ class DataProcessor(MMBaseDataProcessor):
|
|||||||
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
|
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
|
||||||
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
|
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
|
||||||
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
|
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
|
||||||
|
outputs["num_input_image_tokens"] += num_tokens
|
||||||
|
|
||||||
_, h, w = meta["thw"]
|
_, h, w = meta["thw"]
|
||||||
pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
|
pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
|
||||||
@@ -605,6 +606,7 @@ class DataProcessor(MMBaseDataProcessor):
|
|||||||
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
|
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
|
||||||
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
|
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
|
||||||
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
|
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
|
||||||
|
outputs["num_input_video_tokens"] += num_tokens
|
||||||
outputs["image_type_ids"].extend([1] * t)
|
outputs["image_type_ids"].extend([1] * t)
|
||||||
|
|
||||||
pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
|
pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
|
||||||
|
|||||||
@@ -25,3 +25,6 @@ from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
|
|||||||
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
|
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
|
||||||
ImageProcessor as QwenImageProcessor,
|
ImageProcessor as QwenImageProcessor,
|
||||||
)
|
)
|
||||||
|
from fastdeploy.input.image_processors.registry import ( # noqa: F401
|
||||||
|
ImageProcessorRegistry,
|
||||||
|
)
|
||||||
|
|||||||
@@ -46,6 +46,8 @@ from PIL import Image
|
|||||||
|
|
||||||
from fastdeploy.input.image_processors.common import is_scaled_image
|
from fastdeploy.input.image_processors.common import is_scaled_image
|
||||||
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
||||||
|
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import ERNIE4_5_VL
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||||
@@ -116,6 +118,7 @@ def make_batched_videos(videos) -> List[VideoInput]:
|
|||||||
raise ValueError(f"Could not make batched video from {videos}")
|
raise ValueError(f"Could not make batched video from {videos}")
|
||||||
|
|
||||||
|
|
||||||
|
@ImageProcessorRegistry.register(ERNIE4_5_VL)
|
||||||
class AdaptiveImageProcessor(BaseImageProcessor):
|
class AdaptiveImageProcessor(BaseImageProcessor):
|
||||||
r"""
|
r"""
|
||||||
Constructs a adaptive image processor that dynamically resizes images based on the original images.
|
Constructs a adaptive image processor that dynamically resizes images based on the original images.
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ from paddleformers.transformers.image_utils import (
|
|||||||
from fastdeploy.input.image_processors.common import (
|
from fastdeploy.input.image_processors.common import (
|
||||||
smart_resize_paddleocr as smart_resize,
|
smart_resize_paddleocr as smart_resize,
|
||||||
)
|
)
|
||||||
|
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import PADDLEOCR_VL
|
||||||
|
|
||||||
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||||
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||||
@@ -66,6 +68,7 @@ def adjust_size(size, patch_size):
|
|||||||
return num_patches * patch_size
|
return num_patches * patch_size
|
||||||
|
|
||||||
|
|
||||||
|
@ImageProcessorRegistry.register(PADDLEOCR_VL)
|
||||||
class ImageProcessor(BaseImageProcessor):
|
class ImageProcessor(BaseImageProcessor):
|
||||||
model_input_names = [
|
model_input_names = [
|
||||||
"pixel_values",
|
"pixel_values",
|
||||||
|
|||||||
@@ -41,6 +41,8 @@ from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||||
|
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import QWEN3_VL
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
IMAGE_MEAN = [0.5, 0.5, 0.5]
|
IMAGE_MEAN = [0.5, 0.5, 0.5]
|
||||||
@@ -62,6 +64,7 @@ VideoInput = Union[
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ImageProcessorRegistry.register(QWEN3_VL)
|
||||||
class ImageProcessor(BaseImageProcessor):
|
class ImageProcessor(BaseImageProcessor):
|
||||||
"""
|
"""
|
||||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||||
|
|||||||
@@ -41,6 +41,8 @@ from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||||
|
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import QWEN_VL
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||||
@@ -62,6 +64,7 @@ VideoInput = Union[
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ImageProcessorRegistry.register(QWEN_VL)
|
||||||
class ImageProcessor(BaseImageProcessor):
|
class ImageProcessor(BaseImageProcessor):
|
||||||
"""
|
"""
|
||||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||||
|
|||||||
@@ -0,0 +1,54 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Registry for multimodal image processor classes."""
|
||||||
|
|
||||||
|
from typing import Dict, Type
|
||||||
|
|
||||||
|
|
||||||
|
class ImageProcessorRegistry:
|
||||||
|
"""Maps model_type strings to image processor classes.
|
||||||
|
|
||||||
|
Image processors register themselves via the ``register`` decorator
|
||||||
|
at import time. ``MultiModalProcessor`` queries this registry by
|
||||||
|
*model_type* instead of using string-based dynamic imports.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_registry: Dict[str, Type] = {}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def register(cls, *model_types: str):
|
||||||
|
"""Decorator that registers an image processor class for one or more model types."""
|
||||||
|
|
||||||
|
def decorator(proc_cls):
|
||||||
|
for mt in model_types:
|
||||||
|
if mt in cls._registry:
|
||||||
|
raise ValueError(
|
||||||
|
f"Image processor for '{mt}' already registered "
|
||||||
|
f"as {cls._registry[mt].__name__}, "
|
||||||
|
f"cannot re-register as {proc_cls.__name__}"
|
||||||
|
)
|
||||||
|
cls._registry[mt] = proc_cls
|
||||||
|
return proc_cls
|
||||||
|
|
||||||
|
return decorator
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get(cls, model_type: str) -> Type:
|
||||||
|
"""Look up the image processor class for a given *model_type*."""
|
||||||
|
if model_type not in cls._registry:
|
||||||
|
raise ValueError(
|
||||||
|
f"No image processor registered for '{model_type}'. " f"Available: {sorted(cls._registry.keys())}"
|
||||||
|
)
|
||||||
|
return cls._registry[model_type]
|
||||||
@@ -0,0 +1,143 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Per-model-type configuration for the unified MultiModalProcessor."""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
QWEN_VL = "qwen_vl"
|
||||||
|
QWEN3_VL = "qwen3_vl"
|
||||||
|
PADDLEOCR_VL = "paddleocr_vl"
|
||||||
|
ERNIE4_5_VL = "ernie4_5_vl"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class MMModelConfig:
|
||||||
|
image_placeholder: str
|
||||||
|
video_placeholder: str
|
||||||
|
|
||||||
|
tokenizer_type: str = "auto" # "auto" | "ernie4_5"
|
||||||
|
|
||||||
|
default_min_frames: int = 4
|
||||||
|
default_max_frames: int = 768
|
||||||
|
default_target_frames: int = -1
|
||||||
|
default_fps: float = 2.0
|
||||||
|
default_frames_sample: str = "leading"
|
||||||
|
|
||||||
|
has_bad_words: bool = True
|
||||||
|
has_tool_role: bool = False # ernie: role_prefixes includes "tool"
|
||||||
|
default_thinking: bool = False # ernie: default enable_thinking=True
|
||||||
|
force_disable_thinking: bool = False # qwen_vl, qwen3_vl: force enable_thinking=False
|
||||||
|
set_default_reasoning_max_tokens: bool = False # ernie: auto-set reasoning_max_tokens
|
||||||
|
cap_response_max_tokens: bool = False # ernie: cap max_tokens by response_max_tokens
|
||||||
|
has_logits_processor_think: bool = False # ernie: _prepare_think_stop_sentence
|
||||||
|
|
||||||
|
chat_template_pass_request: bool = False # ernie: pass full request obj
|
||||||
|
|
||||||
|
supports_prompt_token_ids: bool = False # qwen3, ernie
|
||||||
|
|
||||||
|
preserve_prompt_token_ids: bool = False # qwen3, ernie: don't overwrite existing
|
||||||
|
|
||||||
|
stop_tokens_variant: str = "default" # "default" | "qwen3"
|
||||||
|
|
||||||
|
image_token_str: str = ""
|
||||||
|
video_token_str: str = ""
|
||||||
|
|
||||||
|
expected_kwargs: Dict[str, type] = field(default_factory=dict)
|
||||||
|
|
||||||
|
video_min_pixels: Optional[int] = None
|
||||||
|
video_max_pixels: Optional[int] = None
|
||||||
|
|
||||||
|
# ---- Conv params source ----
|
||||||
|
conv_params_from_kwargs: bool = False # ernie: from processor_kwargs; else: from image_processor
|
||||||
|
|
||||||
|
# ---- tokens_per_second ----
|
||||||
|
has_tokens_per_second: bool = True # qwen-family: read from config; ernie: False
|
||||||
|
|
||||||
|
|
||||||
|
_QWEN_KWARGS = {
|
||||||
|
"video_max_frames": int,
|
||||||
|
"video_min_frames": int,
|
||||||
|
}
|
||||||
|
|
||||||
|
_ERNIE_KWARGS = {
|
||||||
|
"spatial_conv_size": int,
|
||||||
|
"temporal_conv_size": int,
|
||||||
|
"image_min_pixels": int,
|
||||||
|
"image_max_pixels": int,
|
||||||
|
"video_min_pixels": int,
|
||||||
|
"video_max_pixels": int,
|
||||||
|
"video_target_frames": int,
|
||||||
|
"video_frames_sample": str,
|
||||||
|
"video_max_frames": int,
|
||||||
|
"video_min_frames": int,
|
||||||
|
"video_fps": int,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
MODEL_CONFIGS: Dict[str, MMModelConfig] = {
|
||||||
|
QWEN_VL: MMModelConfig(
|
||||||
|
image_placeholder="<|image_pad|>",
|
||||||
|
video_placeholder="<|video_pad|>",
|
||||||
|
image_token_str="<|image_pad|>",
|
||||||
|
video_token_str="<|video_pad|>",
|
||||||
|
force_disable_thinking=True,
|
||||||
|
expected_kwargs=_QWEN_KWARGS,
|
||||||
|
),
|
||||||
|
QWEN3_VL: MMModelConfig(
|
||||||
|
image_placeholder="<|image_pad|>",
|
||||||
|
video_placeholder="<|video_pad|>",
|
||||||
|
image_token_str="<|image_pad|>",
|
||||||
|
video_token_str="<|video_pad|>",
|
||||||
|
force_disable_thinking=True,
|
||||||
|
supports_prompt_token_ids=True,
|
||||||
|
preserve_prompt_token_ids=True,
|
||||||
|
stop_tokens_variant="qwen3",
|
||||||
|
video_min_pixels=128 * 28 * 28,
|
||||||
|
video_max_pixels=768 * 28 * 28,
|
||||||
|
expected_kwargs=_QWEN_KWARGS,
|
||||||
|
),
|
||||||
|
PADDLEOCR_VL: MMModelConfig(
|
||||||
|
image_placeholder="<|IMAGE_PLACEHOLDER|>",
|
||||||
|
video_placeholder="<|video_pad|>",
|
||||||
|
image_token_str="<|IMAGE_PLACEHOLDER|>",
|
||||||
|
video_token_str="<|video_pad|>",
|
||||||
|
has_bad_words=False,
|
||||||
|
default_fps=-1.0,
|
||||||
|
expected_kwargs=_QWEN_KWARGS,
|
||||||
|
),
|
||||||
|
ERNIE4_5_VL: MMModelConfig(
|
||||||
|
image_placeholder="<|image@placeholder|>",
|
||||||
|
video_placeholder="<|video@placeholder|>",
|
||||||
|
tokenizer_type="ernie4_5",
|
||||||
|
default_min_frames=16,
|
||||||
|
default_max_frames=180,
|
||||||
|
default_fps=2.0,
|
||||||
|
default_frames_sample="leading",
|
||||||
|
has_tool_role=True,
|
||||||
|
default_thinking=True,
|
||||||
|
set_default_reasoning_max_tokens=True,
|
||||||
|
cap_response_max_tokens=True,
|
||||||
|
has_logits_processor_think=True,
|
||||||
|
chat_template_pass_request=True,
|
||||||
|
supports_prompt_token_ids=True,
|
||||||
|
preserve_prompt_token_ids=True,
|
||||||
|
image_token_str="<|IMAGE_PLACEHOLDER|>",
|
||||||
|
video_token_str="<|IMAGE_PLACEHOLDER|>",
|
||||||
|
conv_params_from_kwargs=True,
|
||||||
|
has_tokens_per_second=False,
|
||||||
|
expected_kwargs=_ERNIE_KWARGS,
|
||||||
|
),
|
||||||
|
}
|
||||||
@@ -16,46 +16,25 @@
|
|||||||
|
|
||||||
"""Unified multimodal processor for all VL model types.
|
"""Unified multimodal processor for all VL model types.
|
||||||
|
|
||||||
Consolidates the four separate VL processor wrappers (QwenVLProcessor,
|
Consolidates the four separate VL processor wrappers and four separate
|
||||||
Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a
|
DataProcessor classes into a single class with pluggable Encoding strategies.
|
||||||
single class that dispatches per ``model_type``.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import pickle
|
||||||
from collections.abc import Mapping
|
from collections.abc import Mapping
|
||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import zmq
|
||||||
|
|
||||||
|
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||||
from fastdeploy.input.base_processor import BaseTextProcessor
|
from fastdeploy.input.base_processor import BaseTextProcessor
|
||||||
|
from fastdeploy.input.encodings import EncodingRegistry
|
||||||
|
from fastdeploy.input.image_processors import ImageProcessorRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import MODEL_CONFIGS
|
||||||
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
|
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
QWEN_VL = "qwen_vl"
|
|
||||||
QWEN3_VL = "qwen3_vl"
|
|
||||||
PADDLEOCR_VL = "paddleocr_vl"
|
|
||||||
ERNIE4_5_VL = "ernie4_5_vl"
|
|
||||||
|
|
||||||
_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL}
|
|
||||||
|
|
||||||
_QWEN_EXPECTED_KWARGS = {
|
|
||||||
"video_max_frames": int,
|
|
||||||
"video_min_frames": int,
|
|
||||||
}
|
|
||||||
|
|
||||||
_ERNIE_EXPECTED_KWARGS = {
|
|
||||||
"spatial_conv_size": int,
|
|
||||||
"temporal_conv_size": int,
|
|
||||||
"image_min_pixels": int,
|
|
||||||
"image_max_pixels": int,
|
|
||||||
"video_min_pixels": int,
|
|
||||||
"video_max_pixels": int,
|
|
||||||
"video_target_frames": int,
|
|
||||||
"video_frames_sample": str,
|
|
||||||
"video_max_frames": int,
|
|
||||||
"video_min_frames": int,
|
|
||||||
"video_fps": int,
|
|
||||||
}
|
|
||||||
|
|
||||||
_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}
|
_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}
|
||||||
|
|
||||||
_SAMPLING_EPS = 1e-5
|
_SAMPLING_EPS = 1e-5
|
||||||
@@ -64,8 +43,9 @@ _SAMPLING_EPS = 1e-5
|
|||||||
class MultiModalProcessor(BaseTextProcessor):
|
class MultiModalProcessor(BaseTextProcessor):
|
||||||
"""Unified multimodal processor for all supported VL model types.
|
"""Unified multimodal processor for all supported VL model types.
|
||||||
|
|
||||||
Dispatches image-processor creation, config initialisation, and
|
Uses a composition pattern: model-type-specific encoding logic is
|
||||||
encoding logic based on ``model_type``.
|
delegated to ``self.enc`` (an Encoding instance), while common logic
|
||||||
|
(tokenization loop, request processing, caching) lives here.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -79,19 +59,16 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
tool_parser_obj=None,
|
tool_parser_obj=None,
|
||||||
enable_processor_cache: bool = False,
|
enable_processor_cache: bool = False,
|
||||||
):
|
):
|
||||||
if model_type not in _SUPPORTED_MODEL_TYPES:
|
if model_type not in MODEL_CONFIGS:
|
||||||
raise ValueError(
|
raise ValueError(f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(MODEL_CONFIGS)}.")
|
||||||
f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}."
|
|
||||||
)
|
|
||||||
self.model_type = model_type
|
self.model_type = model_type
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.cfg = MODEL_CONFIGS[model_type]
|
||||||
self.enable_processor_cache = enable_processor_cache
|
self.enable_processor_cache = enable_processor_cache
|
||||||
|
|
||||||
tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto"
|
|
||||||
|
|
||||||
super().__init__(
|
super().__init__(
|
||||||
model_name_or_path,
|
model_name_or_path,
|
||||||
tokenizer_type=tokenizer_type,
|
tokenizer_type=self.cfg.tokenizer_type,
|
||||||
reasoning_parser_obj=reasoning_parser_obj,
|
reasoning_parser_obj=reasoning_parser_obj,
|
||||||
tool_parser_obj=tool_parser_obj,
|
tool_parser_obj=tool_parser_obj,
|
||||||
)
|
)
|
||||||
@@ -99,8 +76,13 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
|
data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
|
||||||
|
|
||||||
processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
|
processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
|
||||||
self._init_mm_processor(processor_kwargs)
|
self._init_image_processor()
|
||||||
self._init_mm_config()
|
self._init_role_prefixes()
|
||||||
|
|
||||||
|
# Composition: create encoding strategy via registry
|
||||||
|
enc_cls = EncodingRegistry.get(self.model_type)
|
||||||
|
self.enc = enc_cls(self, processor_kwargs)
|
||||||
|
|
||||||
self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
|
self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
|
||||||
|
|
||||||
def _load_tokenizer(self):
|
def _load_tokenizer(self):
|
||||||
@@ -122,76 +104,30 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
|
tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
|
||||||
return tokenizer
|
return tokenizer
|
||||||
|
|
||||||
def _init_mm_processor(self, processor_kwargs: dict):
|
def _init_image_processor(self):
|
||||||
"""Create the model-type-specific internal DataProcessor."""
|
"""Create the appropriate image processor."""
|
||||||
if self.model_type == QWEN_VL:
|
cls = ImageProcessorRegistry.get(self.model_type)
|
||||||
from fastdeploy.input.qwen_vl_processor.process import DataProcessor
|
self.image_processor = cls.from_pretrained(self.model_name_or_path)
|
||||||
|
|
||||||
tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
|
def _init_role_prefixes(self):
|
||||||
self.processor = DataProcessor(
|
"""Set up role prefixes for message parsing."""
|
||||||
model_path=self.model_name_or_path,
|
self.role_prefixes = {
|
||||||
enable_processor_cache=self.enable_processor_cache,
|
"system": "",
|
||||||
tokens_per_second=tokens_per_second,
|
"user": "User: ",
|
||||||
tokenizer=self.tokenizer,
|
"bot": "Assistant: ",
|
||||||
**processor_kwargs,
|
"assistant": "Assistant: ",
|
||||||
)
|
}
|
||||||
elif self.model_type == QWEN3_VL:
|
if self.cfg.has_tool_role:
|
||||||
from fastdeploy.input.qwen3_vl_processor.process import DataProcessor
|
self.role_prefixes["tool"] = "Tool: "
|
||||||
|
|
||||||
self.processor = DataProcessor(
|
|
||||||
model_path=self.model_name_or_path,
|
|
||||||
enable_processor_cache=self.enable_processor_cache,
|
|
||||||
tokenizer=self.tokenizer,
|
|
||||||
**processor_kwargs,
|
|
||||||
)
|
|
||||||
elif self.model_type == PADDLEOCR_VL:
|
|
||||||
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
|
|
||||||
|
|
||||||
tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
|
|
||||||
self.processor = DataProcessor(
|
|
||||||
model_path=self.model_name_or_path,
|
|
||||||
enable_processor_cache=self.enable_processor_cache,
|
|
||||||
tokens_per_second=tokens_per_second,
|
|
||||||
tokenizer=self.tokenizer,
|
|
||||||
**processor_kwargs,
|
|
||||||
)
|
|
||||||
elif self.model_type == ERNIE4_5_VL:
|
|
||||||
from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor
|
|
||||||
|
|
||||||
self.processor = DataProcessor(
|
|
||||||
tokenizer_name=self.model_name_or_path,
|
|
||||||
image_preprocessor_name=self.model_name_or_path,
|
|
||||||
enable_processor_cache=self.enable_processor_cache,
|
|
||||||
**processor_kwargs,
|
|
||||||
)
|
|
||||||
self.processor.eval()
|
|
||||||
|
|
||||||
def _init_mm_config(self):
|
|
||||||
"""Set model-type-specific multimodal configuration attributes."""
|
|
||||||
if self.model_type in (QWEN_VL, QWEN3_VL):
|
|
||||||
self.image_patch_id = self.processor.image_token_id
|
|
||||||
elif self.model_type == PADDLEOCR_VL:
|
|
||||||
self.image_patch_id = self.processor.image_patch_id
|
|
||||||
elif self.model_type == ERNIE4_5_VL:
|
|
||||||
self.image_patch_id = self.processor.image_patch_id
|
|
||||||
self.spatial_conv_size = self.processor.spatial_conv_size
|
|
||||||
|
|
||||||
def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
|
def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
|
||||||
"""Parse and validate multimodal processor kwargs."""
|
|
||||||
if not kwargs:
|
if not kwargs:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not isinstance(kwargs, dict):
|
if not isinstance(kwargs, dict):
|
||||||
raise ValueError("mm-processor-kwargs must be a dictionary")
|
raise ValueError("mm-processor-kwargs must be a dictionary")
|
||||||
|
|
||||||
data_processor_logger.info(f"Processing kwargs: {kwargs}")
|
data_processor_logger.info(f"Processing kwargs: {kwargs}")
|
||||||
|
expected_types = self.cfg.expected_kwargs
|
||||||
if self.model_type == ERNIE4_5_VL:
|
|
||||||
expected_types = _ERNIE_EXPECTED_KWARGS
|
|
||||||
else:
|
|
||||||
expected_types = _QWEN_EXPECTED_KWARGS
|
|
||||||
|
|
||||||
for key, value in kwargs.items():
|
for key, value in kwargs.items():
|
||||||
if key in expected_types and not isinstance(value, expected_types[key]):
|
if key in expected_types and not isinstance(value, expected_types[key]):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -199,16 +135,13 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
f"{expected_types[key].__name__}, got {type(value).__name__}"
|
f"{expected_types[key].__name__}, got {type(value).__name__}"
|
||||||
)
|
)
|
||||||
return kwargs
|
return kwargs
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
|
data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def _parse_limits(self, limits: Optional[dict]) -> dict:
|
def _parse_limits(self, limits: Optional[dict]) -> dict:
|
||||||
"""Parse multimodal input limits, merging with defaults."""
|
|
||||||
if not limits:
|
if not limits:
|
||||||
return dict(_DEFAULT_MM_LIMITS)
|
return dict(_DEFAULT_MM_LIMITS)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not isinstance(limits, dict):
|
if not isinstance(limits, dict):
|
||||||
raise ValueError("limit-mm-per-prompt must be a dictionary")
|
raise ValueError("limit-mm-per-prompt must be a dictionary")
|
||||||
@@ -219,7 +152,6 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
return dict(_DEFAULT_MM_LIMITS)
|
return dict(_DEFAULT_MM_LIMITS)
|
||||||
|
|
||||||
def _check_mm_limits(self, item):
|
def _check_mm_limits(self, item):
|
||||||
"""Validate multimodal inputs against configured limits."""
|
|
||||||
if isinstance(item, dict):
|
if isinstance(item, dict):
|
||||||
mm_data = item
|
mm_data = item
|
||||||
else:
|
else:
|
||||||
@@ -232,7 +164,6 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
mm_data["image"].append(part)
|
mm_data["image"].append(part)
|
||||||
elif part_type in ("video_url", "video"):
|
elif part_type in ("video_url", "video"):
|
||||||
mm_data["video"].append(part)
|
mm_data["video"].append(part)
|
||||||
|
|
||||||
for modality, data in mm_data.items():
|
for modality, data in mm_data.items():
|
||||||
if modality in self.limit_mm_per_prompt:
|
if modality in self.limit_mm_per_prompt:
|
||||||
limit = self.limit_mm_per_prompt[modality]
|
limit = self.limit_mm_per_prompt[modality]
|
||||||
@@ -240,86 +171,201 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
|
raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
|
||||||
|
|
||||||
def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
|
def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
|
||||||
"""Return per-modality max token counts, if available."""
|
return self.enc.get_mm_max_tokens_per_item(seq_len)
|
||||||
if self.model_type == ERNIE4_5_VL:
|
|
||||||
return self.processor.get_mm_max_tokens_per_item(seq_len)
|
def _extract_mm_items(self, request):
|
||||||
return None
|
"""Extract images/videos from request messages, handling processor cache."""
|
||||||
|
messages = parse_chat_messages(request.get("messages"))
|
||||||
|
mm_items = []
|
||||||
|
for msg in messages:
|
||||||
|
role = msg.get("role")
|
||||||
|
if role not in self.role_prefixes:
|
||||||
|
raise ValueError(f"Unsupported role: {role}")
|
||||||
|
content = msg.get("content")
|
||||||
|
if not isinstance(content, list):
|
||||||
|
content = [content]
|
||||||
|
for item in content:
|
||||||
|
if item.get("type") in ["image", "video"]:
|
||||||
|
mm_items.append(item)
|
||||||
|
|
||||||
|
missing_hashes, missing_idx = [], []
|
||||||
|
for idx, item in enumerate(mm_items):
|
||||||
|
if not item.get("data"):
|
||||||
|
missing_hashes.append(item.get("uuid"))
|
||||||
|
missing_idx.append(idx)
|
||||||
|
|
||||||
|
if len(missing_hashes) > 0 and not self.enable_processor_cache:
|
||||||
|
raise ValueError("Missing items cannot be retrieved without processor cache.")
|
||||||
|
|
||||||
|
dealer = None
|
||||||
|
if self.enable_processor_cache:
|
||||||
|
context = zmq.Context()
|
||||||
|
dealer = context.socket(zmq.DEALER)
|
||||||
|
dealer.connect("ipc:///dev/shm/processor_cache.ipc")
|
||||||
|
|
||||||
|
missing_items = self.get_processor_cache(dealer, missing_hashes)
|
||||||
|
for idx in range(len(missing_items)):
|
||||||
|
if not missing_items[idx]:
|
||||||
|
raise ValueError(f"Missing item {idx} not found in processor cache")
|
||||||
|
mm_items[missing_idx[idx]]["data"] = missing_items[idx]
|
||||||
|
|
||||||
|
images, videos = [], []
|
||||||
|
image_uuid, video_uuid = [], []
|
||||||
|
for item in mm_items:
|
||||||
|
if item.get("type") == "image":
|
||||||
|
images.append(item["data"])
|
||||||
|
image_uuid.append(item["uuid"])
|
||||||
|
elif item.get("type") == "video":
|
||||||
|
videos.append(item["data"])
|
||||||
|
video_uuid.append(item["uuid"])
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
|
||||||
|
|
||||||
|
return images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items
|
||||||
|
|
||||||
|
def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
|
||||||
|
"""Convert text with image/video placeholders into model inputs."""
|
||||||
|
outputs = self.enc._make_outputs()
|
||||||
|
|
||||||
|
IMAGE_PLACEHOLDER = self.cfg.image_placeholder
|
||||||
|
VIDEO_PLACEHOLDER = self.cfg.video_placeholder
|
||||||
|
IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
|
||||||
|
VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
|
||||||
|
|
||||||
|
st, image_idx, video_idx = 0, 0, 0
|
||||||
|
while st < len(text):
|
||||||
|
image_pos = text.find(IMAGE_PLACEHOLDER, st)
|
||||||
|
image_pos = len(text) if image_pos == -1 else image_pos
|
||||||
|
video_pos = text.find(VIDEO_PLACEHOLDER, st)
|
||||||
|
video_pos = len(text) if video_pos == -1 else video_pos
|
||||||
|
ed = min(image_pos, video_pos)
|
||||||
|
|
||||||
|
self._add_text(text[st:ed], outputs)
|
||||||
|
if ed == len(text):
|
||||||
|
break
|
||||||
|
|
||||||
|
if ed == image_pos:
|
||||||
|
image = images[image_idx]
|
||||||
|
uuid = image_uuid[image_idx] if image_uuid else None
|
||||||
|
if not isinstance(image, tuple):
|
||||||
|
self.enc.add_image(image, outputs, uuid)
|
||||||
|
else:
|
||||||
|
self.enc.add_processed_image(image, outputs, uuid)
|
||||||
|
image_idx += 1
|
||||||
|
st = ed + IMAGE_PLACEHOLDER_LEN
|
||||||
|
else:
|
||||||
|
item = videos[video_idx]
|
||||||
|
uuid = video_uuid[video_idx] if video_uuid else None
|
||||||
|
if not isinstance(item, tuple):
|
||||||
|
if isinstance(item, dict):
|
||||||
|
frames, meta = self.enc.load_video(item["video"], item)
|
||||||
|
else:
|
||||||
|
frames, meta = self.enc.load_video(item, {})
|
||||||
|
self.enc.add_video(frames, outputs, uuid, meta=meta)
|
||||||
|
else:
|
||||||
|
self.enc.add_processed_video(item, outputs, uuid)
|
||||||
|
video_idx += 1
|
||||||
|
st = ed + VIDEO_PLACEHOLDER_LEN
|
||||||
|
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
def request2ids(self, request):
|
||||||
|
"""Convert chat request with multimodal messages into model inputs."""
|
||||||
|
images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self._extract_mm_items(request)
|
||||||
|
|
||||||
|
if self.tokenizer.chat_template is None:
|
||||||
|
raise ValueError("This model does not support chat template.")
|
||||||
|
|
||||||
|
chat_template_kwargs = request.get("chat_template_kwargs", {})
|
||||||
|
if self.cfg.chat_template_pass_request:
|
||||||
|
# ernie: pass full request to apply_chat_template
|
||||||
|
prompt = self.tokenizer.apply_chat_template(
|
||||||
|
request,
|
||||||
|
tokenize=False,
|
||||||
|
add_generation_prompt=request.get("add_generation_prompt", True),
|
||||||
|
**chat_template_kwargs,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
messages = parse_chat_messages(request.get("messages"))
|
||||||
|
prompt = self.tokenizer.apply_chat_template(
|
||||||
|
messages,
|
||||||
|
tokenize=False,
|
||||||
|
add_generation_prompt=request.get("add_generation_prompt", True),
|
||||||
|
**chat_template_kwargs,
|
||||||
|
)
|
||||||
|
request["prompt_tokens"] = prompt
|
||||||
|
|
||||||
|
outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
|
||||||
|
|
||||||
|
if self.enable_processor_cache:
|
||||||
|
self._update_mm_cache(dealer, missing_idx, mm_items, outputs)
|
||||||
|
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
def _process_prompt_token_ids(self, request):
|
||||||
|
"""Handle the prompt_token_ids tokenisation path.
|
||||||
|
|
||||||
|
Mirrors ``request2ids`` in structure: Processor owns extract/cache,
|
||||||
|
Encoding only does pure encoding.
|
||||||
|
"""
|
||||||
|
prompt_token_ids = request.get("prompt_token_ids", [])
|
||||||
|
|
||||||
|
if not request.get("messages"):
|
||||||
|
return self.enc.prompt_token_ids2outputs(prompt_token_ids)
|
||||||
|
|
||||||
|
images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self._extract_mm_items(request)
|
||||||
|
outputs = self.enc.prompt_token_ids2outputs(prompt_token_ids, mm_items)
|
||||||
|
|
||||||
|
if self.enable_processor_cache:
|
||||||
|
self._update_mm_cache(dealer, missing_idx, mm_items, outputs)
|
||||||
|
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
def _update_mm_cache(self, dealer, missing_idx, mm_items, outputs):
|
||||||
|
"""Write newly-processed multimodal items to the processor cache."""
|
||||||
|
missing_idx_set = set(missing_idx)
|
||||||
|
hashes_to_cache, items_to_cache = [], []
|
||||||
|
for idx in range(len(mm_items)):
|
||||||
|
if idx in missing_idx_set:
|
||||||
|
continue
|
||||||
|
meta = {}
|
||||||
|
grid_thw = np.asarray(outputs["grid_thw"][idx])
|
||||||
|
if grid_thw.ndim > 1:
|
||||||
|
t, h, w = grid_thw[0]
|
||||||
|
else:
|
||||||
|
t, h, w = grid_thw
|
||||||
|
meta["thw"] = (int(t), int(h), int(w))
|
||||||
|
if "fps" in outputs:
|
||||||
|
meta["fps"] = outputs["fps"][idx]
|
||||||
|
hashes_to_cache.append(outputs["mm_hashes"][idx])
|
||||||
|
items_to_cache.append((outputs["images"][idx], meta))
|
||||||
|
if hashes_to_cache:
|
||||||
|
self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
|
||||||
|
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
def _add_text(self, tokens, outputs):
|
||||||
|
"""Add text tokens to outputs, delegating position logic to enc."""
|
||||||
|
if not tokens:
|
||||||
|
return
|
||||||
|
if isinstance(tokens, str):
|
||||||
|
tokens_str = self.tokenizer.tokenize(tokens)
|
||||||
|
tokens = self.tokenizer.convert_tokens_to_ids(tokens_str)
|
||||||
|
num_tokens = len(tokens)
|
||||||
|
outputs["input_ids"].extend(tokens)
|
||||||
|
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
|
||||||
|
self.enc.add_text_positions(outputs, num_tokens)
|
||||||
|
|
||||||
def process_request_dict(self, request, max_model_len=None):
|
def process_request_dict(self, request, max_model_len=None):
|
||||||
"""Process a request dictionary into model inputs.
|
"""Process a request dictionary into model inputs."""
|
||||||
|
cfg = self.cfg
|
||||||
Unified template-method flow for all VL model types. Per-model
|
|
||||||
differences are handled by small conditional branches rather than
|
|
||||||
duplicating the entire pipeline.
|
|
||||||
"""
|
|
||||||
request = self._apply_default_parameters(request)
|
request = self._apply_default_parameters(request)
|
||||||
|
|
||||||
if not request.get("eos_token_ids"):
|
if not request.get("eos_token_ids"):
|
||||||
request["eos_token_ids"] = self.eos_token_ids
|
request["eos_token_ids"] = self.eos_token_ids
|
||||||
|
|
||||||
self._process_stop_tokens(request)
|
# Stop tokens
|
||||||
|
if cfg.stop_tokens_variant == "qwen3":
|
||||||
if self.model_type != PADDLEOCR_VL:
|
|
||||||
self._process_bad_words(request)
|
|
||||||
|
|
||||||
if self.model_type == ERNIE4_5_VL:
|
|
||||||
logits_processors_args = self._prepare_think_stop_sentence(
|
|
||||||
request.get("logits_processors_args") or {}, max_model_len
|
|
||||||
)
|
|
||||||
request["logits_processors_args"] = logits_processors_args
|
|
||||||
|
|
||||||
outputs = self._tokenize_request(request)
|
|
||||||
|
|
||||||
self._process_post_tokens(request, outputs)
|
|
||||||
|
|
||||||
if self.model_type in (QWEN_VL, QWEN3_VL):
|
|
||||||
request["enable_thinking"] = False
|
|
||||||
|
|
||||||
outputs = self.pack_outputs(outputs)
|
|
||||||
|
|
||||||
if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"):
|
|
||||||
pass # preserve existing prompt_token_ids
|
|
||||||
else:
|
|
||||||
request["prompt_token_ids"] = outputs["input_ids"].tolist()
|
|
||||||
request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
|
|
||||||
request["multimodal_inputs"] = outputs
|
|
||||||
|
|
||||||
if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
|
|
||||||
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
|
|
||||||
|
|
||||||
if self.model_type == ERNIE4_5_VL:
|
|
||||||
logits_processors_args = self._update_thinking_prompt_state(
|
|
||||||
request["prompt_token_ids"], request.get("logits_processors_args") or {}
|
|
||||||
)
|
|
||||||
request["logits_processors_args"] = logits_processors_args
|
|
||||||
|
|
||||||
max_tokens = max_model_len - len(request["prompt_token_ids"])
|
|
||||||
if request.get("max_tokens") is None:
|
|
||||||
request["max_tokens"] = max(1, max_tokens)
|
|
||||||
else:
|
|
||||||
request["max_tokens"] = min(max_tokens, request["max_tokens"])
|
|
||||||
|
|
||||||
if self.model_type == ERNIE4_5_VL and request.get("reasoning_max_tokens") is None:
|
|
||||||
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
|
|
||||||
|
|
||||||
if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL):
|
|
||||||
if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
|
|
||||||
request["top_p"] = _SAMPLING_EPS
|
|
||||||
request["top_k"] = 1
|
|
||||||
|
|
||||||
if self.model_type != QWEN3_VL and self.reasoning_parser:
|
|
||||||
self._apply_reasoning_parser(request)
|
|
||||||
|
|
||||||
if self.model_type == ERNIE4_5_VL:
|
|
||||||
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
|
||||||
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
|
||||||
|
|
||||||
data_processor_logger.info(f"Processed request {request}")
|
|
||||||
return request
|
|
||||||
|
|
||||||
def _process_stop_tokens(self, request):
|
|
||||||
"""Handle stop token processing based on model type."""
|
|
||||||
if self.model_type == QWEN3_VL:
|
|
||||||
stop_sequences = request.get("stop", [])
|
stop_sequences = request.get("stop", [])
|
||||||
if stop_sequences:
|
if stop_sequences:
|
||||||
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
|
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
|
||||||
@@ -328,34 +374,102 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
else:
|
else:
|
||||||
process_stop_token_ids(request, self.update_stop_seq)
|
process_stop_token_ids(request, self.update_stop_seq)
|
||||||
|
|
||||||
def _process_bad_words(self, request):
|
# Bad words
|
||||||
"""Process bad_words into token ids."""
|
if cfg.has_bad_words:
|
||||||
bad_words = request.get("bad_words")
|
bad_words = request.get("bad_words")
|
||||||
bad_words_token_ids = request.get("bad_words_token_ids")
|
bad_words_token_ids = request.get("bad_words_token_ids")
|
||||||
if bad_words:
|
if bad_words:
|
||||||
bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
|
bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
|
||||||
request["bad_words_token_ids"] = bad_words_token_ids
|
request["bad_words_token_ids"] = bad_words_token_ids
|
||||||
|
|
||||||
|
# Logits processor (ernie think)
|
||||||
|
if cfg.has_logits_processor_think:
|
||||||
|
logits_processors_args = self._prepare_think_stop_sentence(
|
||||||
|
request.get("logits_processors_args") or {}, max_model_len
|
||||||
|
)
|
||||||
|
request["logits_processors_args"] = logits_processors_args
|
||||||
|
|
||||||
|
# Tokenize
|
||||||
|
outputs = self._tokenize_request(request)
|
||||||
|
|
||||||
|
# Post-token handling
|
||||||
|
self._process_post_tokens(request, outputs)
|
||||||
|
|
||||||
|
# Force disable thinking for qwen_vl / qwen3_vl
|
||||||
|
if cfg.force_disable_thinking:
|
||||||
|
request["enable_thinking"] = False
|
||||||
|
|
||||||
|
# Pack outputs
|
||||||
|
outputs = self.pack_outputs(outputs)
|
||||||
|
|
||||||
|
# Assign prompt_token_ids
|
||||||
|
if cfg.preserve_prompt_token_ids and request.get("prompt_token_ids"):
|
||||||
|
pass # preserve existing
|
||||||
|
else:
|
||||||
|
request["prompt_token_ids"] = outputs["input_ids"].tolist()
|
||||||
|
request["multimodal_inputs"] = outputs
|
||||||
|
|
||||||
|
# Truncation
|
||||||
|
if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
|
||||||
|
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
|
||||||
|
|
||||||
|
request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
|
||||||
|
|
||||||
|
# Ernie: update thinking prompt state
|
||||||
|
if cfg.has_logits_processor_think:
|
||||||
|
logits_processors_args = self._update_thinking_prompt_state(
|
||||||
|
request["prompt_token_ids"],
|
||||||
|
request.get("logits_processors_args") or {},
|
||||||
|
)
|
||||||
|
request["logits_processors_args"] = logits_processors_args
|
||||||
|
|
||||||
|
# max_tokens
|
||||||
|
max_tokens = max_model_len - len(request["prompt_token_ids"])
|
||||||
|
if request.get("max_tokens") is None:
|
||||||
|
request["max_tokens"] = max(1, max_tokens)
|
||||||
|
else:
|
||||||
|
request["max_tokens"] = min(max_tokens, request["max_tokens"])
|
||||||
|
|
||||||
|
# Ernie: default reasoning_max_tokens
|
||||||
|
if cfg.set_default_reasoning_max_tokens and request.get("reasoning_max_tokens") is None:
|
||||||
|
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
|
||||||
|
|
||||||
|
# Clamp top_p
|
||||||
|
if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
|
||||||
|
request["top_p"] = _SAMPLING_EPS
|
||||||
|
request["top_k"] = 1
|
||||||
|
|
||||||
|
# Reasoning parser
|
||||||
|
if self.reasoning_parser:
|
||||||
|
self._apply_reasoning_parser(request)
|
||||||
|
|
||||||
|
# Ernie: cap response_max_tokens
|
||||||
|
if cfg.cap_response_max_tokens:
|
||||||
|
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
||||||
|
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
||||||
|
|
||||||
|
data_processor_logger.info(f"Processed request {request}")
|
||||||
|
return request
|
||||||
|
|
||||||
def _tokenize_request(self, request):
|
def _tokenize_request(self, request):
|
||||||
"""Core tokenization dispatch: prompt_token_ids > prompt > messages."""
|
cfg = self.cfg
|
||||||
default_thinking = True if self.model_type == ERNIE4_5_VL else False
|
default_thinking = cfg.default_thinking
|
||||||
|
|
||||||
if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL):
|
if request.get("prompt_token_ids") and cfg.supports_prompt_token_ids:
|
||||||
messages = request.get("messages")
|
messages = request.get("messages")
|
||||||
if messages:
|
if messages:
|
||||||
self._check_mm_limits(messages)
|
self._check_mm_limits(messages)
|
||||||
request.setdefault("enable_thinking", default_thinking)
|
request.setdefault("enable_thinking", default_thinking)
|
||||||
return self.processor.prompt_token_ids2outputs(request)
|
return self._process_prompt_token_ids(request)
|
||||||
|
|
||||||
elif request.get("prompt"):
|
elif request.get("prompt"):
|
||||||
multimodal_data = request.get("multimodal_data") or {}
|
multimodal_data = request.get("multimodal_data") or {}
|
||||||
self._check_mm_limits(multimodal_data)
|
self._check_mm_limits(multimodal_data)
|
||||||
images = multimodal_data.get("image", None)
|
images = multimodal_data.get("image", None)
|
||||||
videos = multimodal_data.get("video", None)
|
videos = multimodal_data.get("video", None)
|
||||||
if self.model_type == ERNIE4_5_VL:
|
request["prompt_tokens"] = request.get("prompt")
|
||||||
request["prompt_tokens"] = request.get("prompt")
|
|
||||||
request.setdefault("enable_thinking", default_thinking)
|
request.setdefault("enable_thinking", default_thinking)
|
||||||
return self.processor.text2ids(request["prompt"], images, videos)
|
return self.text2ids(request["prompt"], images, videos)
|
||||||
|
|
||||||
elif request.get("messages"):
|
elif request.get("messages"):
|
||||||
messages = request["messages"]
|
messages = request["messages"]
|
||||||
@@ -369,65 +483,22 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
else:
|
else:
|
||||||
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
|
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
|
||||||
request.setdefault("enable_thinking", default_thinking)
|
request.setdefault("enable_thinking", default_thinking)
|
||||||
return self.processor.request2ids(request)
|
return self.request2ids(request)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
|
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
|
||||||
|
|
||||||
def _process_post_tokens(self, request, outputs):
|
def _process_post_tokens(self, request, outputs):
|
||||||
"""Handle post-tokenization token appending."""
|
completion_token_ids = request.get("completion_token_ids") or request.get("generated_token_ids")
|
||||||
if self.model_type == PADDLEOCR_VL:
|
if completion_token_ids:
|
||||||
metadata = request.get("metadata")
|
self.enc.append_completion_tokens(outputs, completion_token_ids)
|
||||||
if metadata and metadata.get("generated_token_ids"):
|
|
||||||
self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"])
|
|
||||||
else:
|
|
||||||
if request.get("completion_token_ids"):
|
|
||||||
self.append_completion_tokens(outputs, request["completion_token_ids"])
|
|
||||||
|
|
||||||
def _apply_reasoning_parser(self, request):
|
|
||||||
"""Apply reasoning parser and update model status dict."""
|
|
||||||
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
|
|
||||||
parts = request["request_id"].split("_")
|
|
||||||
if len(parts) > 1:
|
|
||||||
real_req_id = parts[0]
|
|
||||||
index = int(parts[1])
|
|
||||||
n = request.get("n", 1)
|
|
||||||
for idx in range(index * n, (index + 1) * n):
|
|
||||||
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
|
|
||||||
else:
|
|
||||||
self.model_status_dict[request["request_id"]] = model_status
|
|
||||||
request["enable_thinking"] = model_status == "think_start"
|
|
||||||
|
|
||||||
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
|
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
|
||||||
"""Append completion tokens to existing multimodal outputs."""
|
"""Append completion tokens — delegates to enc."""
|
||||||
if self.model_type == ERNIE4_5_VL:
|
self.enc.append_completion_tokens(multimodal_inputs, completion_token_ids)
|
||||||
self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids)
|
|
||||||
else:
|
|
||||||
self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids)
|
|
||||||
|
|
||||||
def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids):
|
|
||||||
"""Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl."""
|
|
||||||
num_tokens = len(completion_token_ids)
|
|
||||||
multimodal_inputs["input_ids"].extend(completion_token_ids)
|
|
||||||
multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
|
|
||||||
|
|
||||||
pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
|
|
||||||
multimodal_inputs["position_ids"].append(pos_ids)
|
|
||||||
multimodal_inputs["cur_position"] += num_tokens
|
|
||||||
|
|
||||||
def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids):
|
|
||||||
"""Append completion tokens for ernie4_5_vl."""
|
|
||||||
num_tokens = len(completion_token_ids)
|
|
||||||
multimodal_inputs["input_ids"].extend(completion_token_ids)
|
|
||||||
multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
|
|
||||||
|
|
||||||
start = multimodal_inputs["cur_position"]
|
|
||||||
for i in range(num_tokens):
|
|
||||||
multimodal_inputs["position_ids"].append([start + i] * 3)
|
|
||||||
multimodal_inputs["cur_position"] += num_tokens
|
|
||||||
|
|
||||||
def pack_outputs(self, outputs):
|
def pack_outputs(self, outputs):
|
||||||
"""Convert intermediate processing outputs to final format."""
|
"""Convert intermediate outputs to final packed format."""
|
||||||
if not outputs["images"]:
|
if not outputs["images"]:
|
||||||
outputs["images"] = None
|
outputs["images"] = None
|
||||||
outputs["grid_thw"] = None
|
outputs["grid_thw"] = None
|
||||||
@@ -439,15 +510,22 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
|
|
||||||
outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
|
outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
|
||||||
outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
|
outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
|
||||||
outputs["mm_num_token_func"] = self.processor.mm_num_tokens
|
outputs["mm_num_token_func"] = self.enc.mm_num_tokens
|
||||||
|
|
||||||
if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
|
# Position IDs: delegate to encoding strategy
|
||||||
outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
|
self.enc.pack_position_ids(outputs)
|
||||||
outputs["image_patch_id"] = self.processor.image_token_id
|
|
||||||
outputs["video_patch_id"] = self.processor.video_token_id
|
|
||||||
outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
|
|
||||||
else:
|
|
||||||
outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
|
|
||||||
outputs["image_patch_id"] = self.image_patch_id
|
|
||||||
|
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
|
def get_processor_cache(self, socket, mm_hashes):
|
||||||
|
req = pickle.dumps(mm_hashes)
|
||||||
|
socket.send_multipart([b"", req])
|
||||||
|
_, resp = socket.recv_multipart()
|
||||||
|
mm_items = pickle.loads(resp)
|
||||||
|
data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
|
||||||
|
return mm_items
|
||||||
|
|
||||||
|
def update_processor_cache(self, socket, mm_hashes, mm_items):
|
||||||
|
req = pickle.dumps((mm_hashes, mm_items))
|
||||||
|
socket.send_multipart([b"", req])
|
||||||
|
data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
|
||||||
|
|||||||
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
|
|||||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||||
from fastdeploy.input.video_utils import read_video_decord
|
from fastdeploy.input.utils.video import read_video_decord
|
||||||
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
|
from fastdeploy.input.utils.video import sample_frames_paddleocr as sample_frames
|
||||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
|||||||
@@ -94,13 +94,13 @@ class InputPreprocessor:
|
|||||||
tool_parser_obj=tool_parser_obj,
|
tool_parser_obj=tool_parser_obj,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
from fastdeploy.input.multimodal_processor import (
|
from fastdeploy.input.mm_model_config import (
|
||||||
ERNIE4_5_VL,
|
ERNIE4_5_VL,
|
||||||
PADDLEOCR_VL,
|
PADDLEOCR_VL,
|
||||||
QWEN3_VL,
|
QWEN3_VL,
|
||||||
QWEN_VL,
|
QWEN_VL,
|
||||||
MultiModalProcessor,
|
|
||||||
)
|
)
|
||||||
|
from fastdeploy.input.multimodal_processor import MultiModalProcessor
|
||||||
|
|
||||||
if ErnieArchitectures.contains_ernie_arch(architecture):
|
if ErnieArchitectures.contains_ernie_arch(architecture):
|
||||||
model_type = ERNIE4_5_VL
|
model_type = ERNIE4_5_VL
|
||||||
|
|||||||
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
|
|||||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||||
from fastdeploy.input.video_utils import read_video_decord
|
from fastdeploy.input.utils.video import read_video_decord
|
||||||
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
|
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
|
||||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
|||||||
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
|
|||||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||||
from fastdeploy.input.video_utils import read_video_decord
|
from fastdeploy.input.utils.video import read_video_decord
|
||||||
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
|
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
|
||||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
@@ -0,0 +1,41 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Utility package for fastdeploy.input — re-exports from sub-modules."""
|
||||||
|
|
||||||
|
from fastdeploy.input.utils.common import (
|
||||||
|
IDS_TYPE_FLAG,
|
||||||
|
MAX_IMAGE_DIMENSION,
|
||||||
|
process_stop_token_ids,
|
||||||
|
validate_model_path,
|
||||||
|
)
|
||||||
|
from fastdeploy.input.utils.video import (
|
||||||
|
VideoReaderWrapper,
|
||||||
|
read_video_decord,
|
||||||
|
sample_frames,
|
||||||
|
sample_frames_paddleocr,
|
||||||
|
sample_frames_qwen,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"IDS_TYPE_FLAG",
|
||||||
|
"MAX_IMAGE_DIMENSION",
|
||||||
|
"process_stop_token_ids",
|
||||||
|
"validate_model_path",
|
||||||
|
"VideoReaderWrapper",
|
||||||
|
"read_video_decord",
|
||||||
|
"sample_frames",
|
||||||
|
"sample_frames_paddleocr",
|
||||||
|
"sample_frames_qwen",
|
||||||
|
]
|
||||||
@@ -0,0 +1,94 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Render timestamps onto video frames."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
|
||||||
|
|
||||||
|
|
||||||
|
def render_single_image_with_timestamp(image: Image, number: str, rate: float, font_path: str = FONT_PATH):
|
||||||
|
"""Render a timestamp string onto a PIL Image.
|
||||||
|
|
||||||
|
The font size is ``min(width, height) * rate``.
|
||||||
|
Text is drawn in black with a white outline (10% of font size).
|
||||||
|
"""
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
width, height = image.size
|
||||||
|
font_size = int(min(width, height) * rate)
|
||||||
|
outline_size = int(font_size * 0.1)
|
||||||
|
font = ImageFont.truetype(font_path, font_size)
|
||||||
|
x = 0
|
||||||
|
y = 0
|
||||||
|
|
||||||
|
draw.text(
|
||||||
|
(x, y),
|
||||||
|
number,
|
||||||
|
font=font,
|
||||||
|
fill=(0, 0, 0),
|
||||||
|
stroke_width=outline_size,
|
||||||
|
stroke_fill=(255, 255, 255),
|
||||||
|
)
|
||||||
|
|
||||||
|
return image
|
||||||
|
|
||||||
|
|
||||||
|
def timestamp_converting(time_stamp_in_seconds):
|
||||||
|
"""Convert timestamp from seconds to ``HH:MM:SS.ss`` format."""
|
||||||
|
hours = 0
|
||||||
|
while time_stamp_in_seconds >= 3600:
|
||||||
|
hours += 1
|
||||||
|
time_stamp_in_seconds -= 3600
|
||||||
|
mins = 0
|
||||||
|
while time_stamp_in_seconds >= 60:
|
||||||
|
mins += 1
|
||||||
|
time_stamp_in_seconds -= 60
|
||||||
|
time_hours = f"{int(hours):02d}"
|
||||||
|
time_mins = f"{int(mins):02d}"
|
||||||
|
time_secs = f"{time_stamp_in_seconds:05.02f}"
|
||||||
|
fi_time_stamp = time_hours + ":" + time_mins + ":" + time_secs
|
||||||
|
|
||||||
|
return fi_time_stamp
|
||||||
|
|
||||||
|
|
||||||
|
def get_timestamp_for_uniform_frame_extraction(num_frames, frame_id, duration):
|
||||||
|
"""Get the timestamp of a frame during uniform extraction.
|
||||||
|
|
||||||
|
Returns the timestamp in seconds.
|
||||||
|
"""
|
||||||
|
time_stamp = duration * 1.0 * frame_id / num_frames
|
||||||
|
|
||||||
|
return time_stamp
|
||||||
|
|
||||||
|
|
||||||
|
def render_frame_timestamp(frame, timestamp, font_rate=0.1):
|
||||||
|
"""Render a timestamp onto a video frame.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
frame : PIL.Image
|
||||||
|
The video frame.
|
||||||
|
timestamp : float
|
||||||
|
Timestamp in seconds.
|
||||||
|
font_rate : float
|
||||||
|
Font size as a fraction of ``min(width, height)``.
|
||||||
|
"""
|
||||||
|
time_stamp = "time: " + timestamp_converting(timestamp)
|
||||||
|
new_frame = render_single_image_with_timestamp(frame, time_stamp, font_rate)
|
||||||
|
|
||||||
|
return new_frame
|
||||||
@@ -0,0 +1,470 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Shared video utilities: VideoReaderWrapper, read_video_decord, sample_frames, read_frames_decord."""
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
import hashlib
|
||||||
|
import io
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import threading
|
||||||
|
import uuid
|
||||||
|
from tempfile import NamedTemporaryFile as ntf
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from fastdeploy.input.image_processors.common import ceil_by_factor, floor_by_factor
|
||||||
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"VideoReaderWrapper",
|
||||||
|
"read_video_decord",
|
||||||
|
"sample_frames",
|
||||||
|
"sample_frames_qwen",
|
||||||
|
"sample_frames_paddleocr",
|
||||||
|
"get_frame_indices",
|
||||||
|
"read_frames_decord",
|
||||||
|
"EXTRACTED_FRAME_DIR",
|
||||||
|
"get_filename",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# VideoReaderWrapper
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _is_gif(data: bytes) -> bool:
|
||||||
|
"""Check if bytes represent a GIF based on magic header."""
|
||||||
|
return data[:6] in (b"GIF87a", b"GIF89a")
|
||||||
|
|
||||||
|
|
||||||
|
class VideoReaderWrapper:
|
||||||
|
"""decord.VideoReader wrapper that fixes a memory leak and adds GIF support.
|
||||||
|
|
||||||
|
Reference: https://github.com/dmlc/decord/issues/208
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, video_path, *args, **kwargs):
|
||||||
|
import decord
|
||||||
|
|
||||||
|
try:
|
||||||
|
# moviepy 1.0
|
||||||
|
import moviepy.editor as mp
|
||||||
|
except Exception:
|
||||||
|
# moviepy 2.0
|
||||||
|
import moviepy as mp
|
||||||
|
|
||||||
|
with ntf(delete=True, suffix=".gif") as gif_file:
|
||||||
|
gif_input = None
|
||||||
|
self.original_file = None # only set when we create a temp file
|
||||||
|
|
||||||
|
if isinstance(video_path, str):
|
||||||
|
if video_path.lower().endswith(".gif"):
|
||||||
|
gif_input = video_path
|
||||||
|
elif isinstance(video_path, bytes):
|
||||||
|
if _is_gif(video_path):
|
||||||
|
gif_file.write(video_path)
|
||||||
|
gif_file.flush()
|
||||||
|
gif_input = gif_file.name
|
||||||
|
elif isinstance(video_path, io.BytesIO):
|
||||||
|
video_path.seek(0)
|
||||||
|
tmp_bytes = video_path.read()
|
||||||
|
video_path.seek(0)
|
||||||
|
if _is_gif(tmp_bytes):
|
||||||
|
gif_file.write(tmp_bytes)
|
||||||
|
gif_file.flush()
|
||||||
|
gif_input = gif_file.name
|
||||||
|
|
||||||
|
if gif_input is not None:
|
||||||
|
clip = mp.VideoFileClip(gif_input)
|
||||||
|
mp4_file = ntf(delete=False, suffix=".mp4")
|
||||||
|
mp4_path = mp4_file.name
|
||||||
|
mp4_file.close() # close before moviepy writes
|
||||||
|
clip.write_videofile(mp4_path, verbose=False, logger=None)
|
||||||
|
clip.close()
|
||||||
|
video_path = mp4_path
|
||||||
|
self.original_file = video_path # temp mp4, cleaned up in __del__
|
||||||
|
|
||||||
|
self._reader = decord.VideoReader(video_path, *args, **kwargs)
|
||||||
|
self._reader.seek(0)
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self._reader)
|
||||||
|
|
||||||
|
def __getitem__(self, key):
|
||||||
|
frames = self._reader[key]
|
||||||
|
self._reader.seek(0)
|
||||||
|
return frames
|
||||||
|
|
||||||
|
def get_avg_fps(self):
|
||||||
|
return self._reader.get_avg_fps()
|
||||||
|
|
||||||
|
def seek(self, pos):
|
||||||
|
return self._reader.seek(pos)
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
original_file = getattr(self, "original_file", None)
|
||||||
|
if original_file:
|
||||||
|
try:
|
||||||
|
os.remove(original_file)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# read_video_decord
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def read_video_decord(video_path, save_to_disk: bool = False):
|
||||||
|
"""Load a video file and return (video_reader, video_meta, video_path).
|
||||||
|
|
||||||
|
video_meta contains keys: "fps", "duration", "num_of_frame".
|
||||||
|
"""
|
||||||
|
if isinstance(video_path, VideoReaderWrapper):
|
||||||
|
video_reader = video_path
|
||||||
|
else:
|
||||||
|
if isinstance(video_path, bytes):
|
||||||
|
video_path = io.BytesIO(video_path)
|
||||||
|
video_reader = VideoReaderWrapper(video_path, num_threads=1)
|
||||||
|
|
||||||
|
vlen = len(video_reader)
|
||||||
|
fps = video_reader.get_avg_fps()
|
||||||
|
duration = vlen / float(fps)
|
||||||
|
|
||||||
|
video_meta = {"fps": fps, "duration": duration, "num_of_frame": vlen}
|
||||||
|
return video_reader, video_meta, video_path
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# sample_frames — qwen_vl variant
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def sample_frames_qwen(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
) -> np.ndarray:
    """Sample frame indices — qwen_vl variant.

    Sentinel defaults are -1. Applies ceil_by_factor on min_frames and ensures
    num_frames is divisible by 4.

    Args:
        frame_factor: the sampled count is snapped to a multiple of this.
        min_frames: lower bound on the sampled count (fps mode).
        max_frames: upper bound on the sampled count (fps mode).
        metadata: required dict with ``"num_of_frame"`` and ``"fps"`` keys.
        fps: desired sampling rate; mutually exclusive with ``num_frames``.
        num_frames: desired frame count; mutually exclusive with ``fps``.

    Returns:
        int32 numpy array of frame indices into the video.

    Raises:
        ValueError: if both ``fps`` and ``num_frames`` are given, metadata is
            missing, or the inferred count exceeds the total frame count.
    """
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

    if metadata is None:
        raise ValueError("metadata is required for sample_frames_qwen")

    total_num_frames = metadata["num_of_frame"]

    if num_frames > 0:
        # Explicit count: snap to the nearest multiple of frame_factor.
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        # fps mode: derive the count from video duration, clamp to
        # [min_frames, max_frames] and align to frame_factor.
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)

        num_frames = total_num_frames / metadata["fps"] * fps

        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")

        num_frames = min(max(num_frames, min_frames), max_frames, total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    # num_frames must be divisible by 4
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(max(num_frames, min_frames), max_frames, total_num_frames)

    if num_frames > 0:
        return np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    return np.arange(0, total_num_frames).astype(np.int32)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# sample_frames — paddleocr_vl / ernie4_5_vl variant
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def sample_frames_paddleocr(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
) -> np.ndarray:
    """Sample frame indices — paddleocr_vl / ernie4_5_vl variant.

    Sentinel defaults are None. Uses plain math.floor/ceil; no %4 correction.

    Args:
        frame_factor: the sampled count is snapped to a multiple of this.
        min_frames: lower bound on the sampled count (fps mode).
        max_frames: upper bound on the sampled count (fps mode).
        metadata: required dict with ``"num_of_frame"`` and ``"fps"`` keys.
        fps: desired sampling rate; mutually exclusive with ``num_frames``.
        num_frames: desired frame count; mutually exclusive with ``fps``.

    Returns:
        int32 numpy array of frame indices into the video.

    Raises:
        ValueError: if both ``fps`` and ``num_frames`` are given, metadata is
            missing, or the inferred count exceeds the total frame count.
    """
    # Normalize None sentinels to 0 so the numeric comparisons below work.
    fps = fps or 0
    num_frames = num_frames or 0

    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

    if metadata is None:
        raise ValueError("metadata is required for sample_frames_paddleocr")

    total_num_frames = metadata["num_of_frame"]

    if num_frames > 0:
        # Explicit count: snap to the nearest multiple of frame_factor.
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        # fps mode: derive the count from video duration, clamp, then align.
        max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        wanted = total_num_frames / metadata["fps"] * fps
        wanted = min(max(wanted, min_frames), max_frames, total_num_frames)
        num_frames = math.floor(wanted / frame_factor) * frame_factor

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    if num_frames > 0:
        return np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    return np.arange(0, total_num_frames).astype(np.int32)
|
||||||
|
|
||||||
|
|
||||||
|
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
    variant: str = "paddleocr",
) -> np.ndarray:
    """Dispatch to sample_frames_qwen or sample_frames_paddleocr based on variant.

    The qwen variant uses -1 sentinels, so None fps/num_frames are converted
    before forwarding; the paddleocr variant accepts None directly.

    Raises:
        ValueError: if ``variant`` is neither ``"paddleocr"`` nor ``"qwen"``.
    """
    if variant == "paddleocr":
        return sample_frames_paddleocr(frame_factor, min_frames, max_frames, metadata, fps, num_frames)
    if variant == "qwen":
        sentinel_fps = -1 if fps is None else fps
        sentinel_num_frames = -1 if num_frames is None else num_frames
        return sample_frames_qwen(frame_factor, min_frames, max_frames, metadata, sentinel_fps, sentinel_num_frames)
    raise ValueError(f"Unknown variant {variant!r}. Expected 'paddleocr' or 'qwen'.")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# IO helpers (migrated from ernie4_5_vl_processor/utils/io_utils.py)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Default cache directory used by read_frames_decord when save_to_disk is set.
EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
|
||||||
|
|
||||||
|
|
||||||
|
def get_filename(url=None):
    """Generate a unique filename, optionally based on a URL hash.

    Without a URL, a random 32-char UUID hex string is returned. With a URL,
    the name embeds date, process id, thread id and the URL's MD5 digest so
    concurrent workers hashing the same URL still get distinct names.
    """
    if url is None:
        return uuid.uuid4().hex

    now = datetime.datetime.now()
    raw = url if isinstance(url, bytes) else url.encode("utf-8")
    # MD5 here is a cache key, not a security boundary.
    digest = hashlib.md5(raw).hexdigest()
    return f"{now.year}-{now.month:02d}-{now.day:02d}-{os.getpid()}-{threading.get_ident()}-{digest}"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# get_frame_indices / read_frames_decord
|
||||||
|
# (migrated from ernie4_5_vl_processor/process_video.py)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def get_frame_indices(
    vlen,
    target_frames=-1,
    target_fps=-1,
    frames_sample="middle",
    fix_start=None,
    input_fps=-1,
):
    """Get frame indices for sampling from a video.

    Exactly one of target_frames / target_fps must be positive.

    Args:
        vlen: total number of frames in the video.
        target_frames: exact number of frames to sample.
        target_fps: sampling rate in frames per second (needs input_fps > 0).
        frames_sample: one of "rand", "middle", "leading" — where inside each
            interval the frame is taken.
        fix_start: fixed offset from each interval start; overrides
            frames_sample unless frames_sample is "rand".
        input_fps: native fps of the video; required when target_fps is used.

    Returns:
        A list of frame indices into the video.

    Raises:
        ValueError: if neither target_frames nor target_fps is positive.
    """
    assert frames_sample in ["rand", "middle", "leading"]
    if target_frames > 0:
        assert target_fps <= 0, "target_fps must be negative if target_frames is given."
        if target_frames > vlen:
            # Cannot sample more frames than exist; fall back to all frames.
            acc_samples = vlen
            data_processor_logger.info(
                f"target_frames={target_frames} is larger than video length {vlen}, "
                f"will sample {acc_samples} frames."
            )
        else:
            acc_samples = target_frames
        data_processor_logger.debug(f"sampling at target_frames={target_frames}, frames_sample={frames_sample}")

        # Split [0, vlen) into acc_samples equal intervals; each interval
        # contributes one frame.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = []
        for idx, interv in enumerate(intervals[:-1]):
            ranges.append((interv, intervals[idx + 1] - 1))
        if frames_sample == "rand":
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except Exception:
                # A degenerate (empty) interval makes random.choice raise;
                # fall back to a sorted random permutation of all frames.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif frames_sample == "leading":
            frame_indices = [x[0] for x in ranges]
        elif frames_sample == "middle":
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

    elif target_fps > 0:
        assert target_frames <= 0, "target_frames must be negative if target_fps is given."
        assert input_fps > 0, "input_fps must be provided if target_fps is given."
        data_processor_logger.info(f"sampling at fps={target_fps}, frames_sample={frames_sample}")
        duration = float(vlen) / input_fps
        # One frame every `delta` seconds of video time.
        delta = 1 / target_fps
        if frames_sample == "middle":
            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        elif frames_sample == "leading":
            frame_seconds = np.arange(0, duration, delta)
        if frames_sample == "rand":
            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
            # Jitter each sample point by up to ±delta/2.
            rand_offset = np.random.rand(*(frame_seconds.shape)) - 0.5
            frame_seconds += rand_offset * delta
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        # Rounding can push an index past the last frame; drop those.
        frame_indices = [e for e in frame_indices if e < vlen]

    else:
        raise ValueError("Must provide either positive target_fps or positive target_frames.")

    return frame_indices
|
||||||
|
|
||||||
|
|
||||||
|
def read_frames_decord(
    video_path,
    video_reader,
    video_meta,
    target_frames=-1,
    target_fps=-1,
    frames_sample="middle",
    fix_start=None,
    save_to_disk=False,
    cache_dir=None,
    frame_indices=None,
    tol=10,
):
    """Read frames from a video using decord, with retry logic for corrupt frames.

    Args:
        video_path: path of the video (not read directly here; kept for API
            symmetry with read_video_decord).
        video_reader: an indexable reader whose frames expose ``.asnumpy()``.
        video_meta: dict with "num_of_frame", "fps" and "duration" keys.
        target_frames / target_fps / frames_sample / fix_start: forwarded to
            get_frame_indices when frame_indices is not supplied.
        save_to_disk: when True, each frame is written as a PNG under
            cache_dir and its path is returned instead of a PIL image.
        cache_dir: output directory for saved frames; defaults to
            EXTRACTED_FRAME_DIR.
        frame_indices: precomputed indices; skips get_frame_indices.
        tol: how many neighbors to try on each side when a frame fails to
            decode (doubled for the first/last frame).

    Returns:
        Tuple of (frames_or_paths, frame_indices, time_stamps); frame_indices
        is updated in place when a corrupt frame was replaced by a neighbor.
    """
    if cache_dir is None:
        cache_dir = EXTRACTED_FRAME_DIR

    if frame_indices is None:
        frame_indices = get_frame_indices(
            video_meta["num_of_frame"],
            target_frames=target_frames,
            target_fps=target_fps,
            frames_sample=frames_sample,
            fix_start=fix_start,
            input_fps=video_meta["fps"],
        )

    frames = []
    for frame_indice_index in range(0, len(frame_indices)):
        frame_indice = frame_indices[frame_indice_index]
        try:
            frames.append(video_reader[frame_indice].asnumpy())
        except Exception as e:
            # Decoding failed (corrupt frame). Probe neighbors, alternating
            # before/after the bad index, until one decodes or tolerance runs out.
            data_processor_logger.debug(f"encounter error when get frame: {frame_indice}, error: {e}")
            previous_counter = 1
            later_counter = 1
            # True -> try an earlier neighbor next; False -> a later one.
            previous_after_flag = True
            if frame_indice == 0 or frame_indice == len(video_reader) - 1:
                # Edge frames can only be replaced from one side; allow twice
                # the tolerance there.
                cur_tol = tol * 2
            else:
                cur_tol = tol
            while previous_counter < cur_tol or later_counter < cur_tol:
                if previous_after_flag:
                    if frame_indice - previous_counter < 0:
                        # Ran off the start; switch direction.
                        previous_counter += 1
                        previous_after_flag = not previous_after_flag
                        continue
                    try:
                        frames.append(video_reader[frame_indice - previous_counter].asnumpy())
                        data_processor_logger.info(
                            f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame"
                        )
                        # Record the substitute index so timestamps stay consistent.
                        frame_indices[frame_indice_index] = frame_indice - previous_counter
                        break
                    except Exception as e:
                        previous_counter += 1
                        data_processor_logger.info(f"error: {e}")
                else:
                    if frame_indice + later_counter >= len(video_reader):
                        # Ran off the end; switch direction.
                        later_counter += 1
                        previous_after_flag = not previous_after_flag
                        continue
                    try:
                        frames.append(video_reader[frame_indice + later_counter].asnumpy())
                        data_processor_logger.info(
                            f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame"
                        )
                        frame_indices[frame_indice_index] = frame_indice + later_counter
                        break
                    except Exception:
                        later_counter += 1
                # Alternate probing direction on every loop iteration.
                previous_after_flag = not previous_after_flag

    frames = np.stack(frames, axis=0)
    assert len(frames) == len(frame_indices), f"len(frames): {len(frames)} != len(frame_indices): {len(frame_indices)}"

    ret = []

    url_sha1 = get_filename()
    for idx, frame in enumerate(frames):
        tmp = Image.fromarray(frame, "RGB")
        if save_to_disk:
            save_path = os.path.join(cache_dir, f"{url_sha1}", f"{idx}.png")
            if not os.path.exists(os.path.dirname(save_path)):
                os.makedirs(os.path.dirname(save_path))
            tmp.save(save_path)
            # Return the on-disk path instead of the in-memory image.
            tmp = save_path
        ret.append(tmp)

    # Timestamp of each (possibly substituted) frame in seconds.
    time_stamps = [frame_idx * video_meta["duration"] / video_meta["num_of_frame"] for frame_idx in frame_indices]

    return ret, frame_indices, time_stamps
|
||||||
@@ -85,7 +85,7 @@ import zmq
|
|||||||
|
|
||||||
from fastdeploy import envs
|
from fastdeploy import envs
|
||||||
from fastdeploy.engine.tasks import PoolingTask
|
from fastdeploy.engine.tasks import PoolingTask
|
||||||
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
|
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
|
||||||
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
||||||
from fastdeploy.logger.deterministic_logger import DeterministicLogger
|
from fastdeploy.logger.deterministic_logger import DeterministicLogger
|
||||||
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
@@ -2867,12 +2867,7 @@ class GPUModelRunner(ModelRunnerBase):
|
|||||||
return
|
return
|
||||||
|
|
||||||
def _init_image_preprocess(self) -> None:
|
def _init_image_preprocess(self) -> None:
|
||||||
processor = DataProcessor(
|
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
|
||||||
tokenizer_name=self.model_config.model,
|
|
||||||
image_preprocessor_name=str(self.model_config.model),
|
|
||||||
)
|
|
||||||
processor.eval()
|
|
||||||
image_preprocess = processor.image_preprocessor
|
|
||||||
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
||||||
[1, 3, 1, 1]
|
[1, 3, 1, 1]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ from fastdeploy.config import FDConfig
|
|||||||
from fastdeploy.engine.pooling_params import PoolingParams
|
from fastdeploy.engine.pooling_params import PoolingParams
|
||||||
from fastdeploy.engine.request import ImagePosition, Request, RequestType
|
from fastdeploy.engine.request import ImagePosition, Request, RequestType
|
||||||
from fastdeploy.engine.tasks import PoolingTask
|
from fastdeploy.engine.tasks import PoolingTask
|
||||||
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
|
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
|
||||||
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
||||||
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
from fastdeploy.model_executor.graph_optimization.utils import (
|
from fastdeploy.model_executor.graph_optimization.utils import (
|
||||||
@@ -2566,12 +2566,7 @@ class MetaxModelRunner(ModelRunnerBase):
|
|||||||
return
|
return
|
||||||
|
|
||||||
def _init_image_preprocess(self) -> None:
|
def _init_image_preprocess(self) -> None:
|
||||||
processor = DataProcessor(
|
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
|
||||||
tokenizer_name=self.model_config.model,
|
|
||||||
image_preprocessor_name=str(self.model_config.model),
|
|
||||||
)
|
|
||||||
processor.eval()
|
|
||||||
image_preprocess = processor.image_preprocessor
|
|
||||||
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
||||||
[1, 3, 1, 1]
|
[1, 3, 1, 1]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from paddle import nn
|
|||||||
from fastdeploy import envs
|
from fastdeploy import envs
|
||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
from fastdeploy.engine.request import ImagePosition, Request, RequestType
|
from fastdeploy.engine.request import ImagePosition, Request, RequestType
|
||||||
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
|
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
|
||||||
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
||||||
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
from fastdeploy.model_executor.graph_optimization.utils import (
|
from fastdeploy.model_executor.graph_optimization.utils import (
|
||||||
@@ -1842,12 +1842,7 @@ class XPUModelRunner(ModelRunnerBase):
|
|||||||
self.forward_meta.clear_caches()
|
self.forward_meta.clear_caches()
|
||||||
|
|
||||||
def _init_image_preprocess(self) -> None:
|
def _init_image_preprocess(self) -> None:
|
||||||
processor = DataProcessor(
|
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
|
||||||
tokenizer_name=self.model_config.model,
|
|
||||||
image_preprocessor_name=str(self.model_config.model),
|
|
||||||
)
|
|
||||||
processor.eval()
|
|
||||||
image_preprocess = processor.image_preprocessor
|
|
||||||
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
||||||
[1, 3, 1, 1]
|
[1, 3, 1, 1]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -303,6 +303,7 @@ setup(
|
|||||||
"model_executor/models/*",
|
"model_executor/models/*",
|
||||||
"model_executor/layers/*",
|
"model_executor/layers/*",
|
||||||
"input/ernie4_5_vl_processor/utils/*",
|
"input/ernie4_5_vl_processor/utils/*",
|
||||||
|
"input/utils/Roboto-Regular.ttf",
|
||||||
"model_executor/ops/gcu/*",
|
"model_executor/ops/gcu/*",
|
||||||
"model_executor/ops/gcu/fastdeploy_ops/*",
|
"model_executor/ops/gcu/fastdeploy_ops/*",
|
||||||
"cache_manager/transfer_factory/get_rdma_nics.sh",
|
"cache_manager/transfer_factory/get_rdma_nics.sh",
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -30,7 +30,7 @@ from fastdeploy.input.paddleocr_vl_processor.paddleocr_vl_processor import (
|
|||||||
PaddleOCRVLProcessor,
|
PaddleOCRVLProcessor,
|
||||||
)
|
)
|
||||||
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
|
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
|
||||||
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
|
from fastdeploy.input.utils.video import sample_frames_paddleocr as sample_frames
|
||||||
|
|
||||||
MODULE_PATH = "fastdeploy.input.paddleocr_vl_processor.process"
|
MODULE_PATH = "fastdeploy.input.paddleocr_vl_processor.process"
|
||||||
|
|
||||||
|
|||||||
@@ -24,8 +24,8 @@ from unittest.mock import patch
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
import fastdeploy.input.ernie4_5_vl_processor.process_video as process_video_module
|
import fastdeploy.input.utils.video as process_video_module
|
||||||
from fastdeploy.input.ernie4_5_vl_processor.process_video import (
|
from fastdeploy.input.utils.video import (
|
||||||
get_frame_indices,
|
get_frame_indices,
|
||||||
read_frames_decord,
|
read_frames_decord,
|
||||||
read_video_decord,
|
read_video_decord,
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ import numpy as np
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
|
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
|
||||||
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
|
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
|
||||||
|
|
||||||
|
|
||||||
def mock_pil_image(height, width):
|
def mock_pil_image(height, width):
|
||||||
|
|||||||
@@ -41,16 +41,16 @@ class TestValidateModelPath(unittest.TestCase):
|
|||||||
|
|
||||||
def _patch_console_logger(self):
|
def _patch_console_logger(self):
|
||||||
"""Patch console_logger.warning to capture warnings."""
|
"""Patch console_logger.warning to capture warnings."""
|
||||||
import fastdeploy.input.utils as utils_mod
|
import fastdeploy.input.utils.common as common_mod
|
||||||
|
|
||||||
self._orig_warning = utils_mod.console_logger.warning
|
self._orig_warning = common_mod.console_logger.warning
|
||||||
utils_mod.console_logger.warning = self._capture_warning
|
common_mod.console_logger.warning = self._capture_warning
|
||||||
|
|
||||||
def _unpatch_console_logger(self):
|
def _unpatch_console_logger(self):
|
||||||
import fastdeploy.input.utils as utils_mod
|
import fastdeploy.input.utils.common as common_mod
|
||||||
|
|
||||||
if self._orig_warning is not None:
|
if self._orig_warning is not None:
|
||||||
utils_mod.console_logger.warning = self._orig_warning
|
common_mod.console_logger.warning = self._orig_warning
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
self._unpatch_console_logger()
|
self._unpatch_console_logger()
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ from unittest.mock import MagicMock, patch
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from fastdeploy.input.video_utils import (
|
from fastdeploy.input.utils.video import (
|
||||||
_is_gif,
|
_is_gif,
|
||||||
read_video_decord,
|
read_video_decord,
|
||||||
sample_frames,
|
sample_frames,
|
||||||
@@ -74,7 +74,7 @@ class TestIsGif(unittest.TestCase):
|
|||||||
class TestVideoReaderWrapper(unittest.TestCase):
|
class TestVideoReaderWrapper(unittest.TestCase):
|
||||||
def _make_wrapper(self, video_path, mock_reader=None):
|
def _make_wrapper(self, video_path, mock_reader=None):
|
||||||
"""Construct a VideoReaderWrapper with decord mocked out."""
|
"""Construct a VideoReaderWrapper with decord mocked out."""
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
if mock_reader is None:
|
if mock_reader is None:
|
||||||
mock_reader = _make_mock_reader()
|
mock_reader = _make_mock_reader()
|
||||||
@@ -112,7 +112,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
|
|||||||
|
|
||||||
def test_del_no_original_file(self):
|
def test_del_no_original_file(self):
|
||||||
"""__del__ should be a no-op when original_file is None."""
|
"""__del__ should be a no-op when original_file is None."""
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
wrapper = object.__new__(VideoReaderWrapper)
|
wrapper = object.__new__(VideoReaderWrapper)
|
||||||
wrapper.original_file = None
|
wrapper.original_file = None
|
||||||
@@ -125,7 +125,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
|
|||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(delete=False) as f:
|
with tempfile.NamedTemporaryFile(delete=False) as f:
|
||||||
tmp_path = f.name
|
tmp_path = f.name
|
||||||
@@ -138,7 +138,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
|
|||||||
|
|
||||||
def test_non_gif_string_path_does_not_set_original_file(self):
|
def test_non_gif_string_path_does_not_set_original_file(self):
|
||||||
"""Passing a non-GIF string path must NOT set original_file (bug fix)."""
|
"""Passing a non-GIF string path must NOT set original_file (bug fix)."""
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
mock_reader = _make_mock_reader()
|
mock_reader = _make_mock_reader()
|
||||||
mock_decord = MagicMock()
|
mock_decord = MagicMock()
|
||||||
@@ -151,7 +151,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
|
|||||||
|
|
||||||
def test_bytesio_non_gif_path_does_not_set_original_file(self):
|
def test_bytesio_non_gif_path_does_not_set_original_file(self):
|
||||||
"""Passing a BytesIO that is NOT a GIF must not set original_file."""
|
"""Passing a BytesIO that is NOT a GIF must not set original_file."""
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
mock_reader = _make_mock_reader()
|
mock_reader = _make_mock_reader()
|
||||||
mock_decord = MagicMock()
|
mock_decord = MagicMock()
|
||||||
@@ -172,16 +172,16 @@ class TestVideoReaderWrapper(unittest.TestCase):
|
|||||||
class TestReadVideoDecord(unittest.TestCase):
|
class TestReadVideoDecord(unittest.TestCase):
|
||||||
def _patch_wrapper(self, num_frames=100, fps=25.0):
|
def _patch_wrapper(self, num_frames=100, fps=25.0):
|
||||||
"""Return a context manager that replaces VideoReaderWrapper with a mock."""
|
"""Return a context manager that replaces VideoReaderWrapper with a mock."""
|
||||||
from fastdeploy.input import video_utils
|
from fastdeploy.input.utils import video
|
||||||
|
|
||||||
mock_wrapper = MagicMock()
|
mock_wrapper = MagicMock()
|
||||||
mock_wrapper.__len__ = MagicMock(return_value=num_frames)
|
mock_wrapper.__len__ = MagicMock(return_value=num_frames)
|
||||||
mock_wrapper.get_avg_fps = MagicMock(return_value=fps)
|
mock_wrapper.get_avg_fps = MagicMock(return_value=fps)
|
||||||
return patch.object(video_utils, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper
|
return patch.object(video, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper
|
||||||
|
|
||||||
def test_existing_wrapper_passthrough(self):
|
def test_existing_wrapper_passthrough(self):
|
||||||
"""Already-wrapped reader is returned as-is."""
|
"""Already-wrapped reader is returned as-is."""
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
mock_wrapper = MagicMock(spec=VideoReaderWrapper)
|
mock_wrapper = MagicMock(spec=VideoReaderWrapper)
|
||||||
mock_wrapper.__len__ = MagicMock(return_value=50)
|
mock_wrapper.__len__ = MagicMock(return_value=50)
|
||||||
@@ -196,7 +196,7 @@ class TestReadVideoDecord(unittest.TestCase):
|
|||||||
|
|
||||||
def test_bytes_input_converted_to_bytesio(self):
|
def test_bytes_input_converted_to_bytesio(self):
|
||||||
"""bytes input is converted to BytesIO before creating VideoReaderWrapper."""
|
"""bytes input is converted to BytesIO before creating VideoReaderWrapper."""
|
||||||
from fastdeploy.input import video_utils
|
from fastdeploy.input.utils import video
|
||||||
|
|
||||||
captured = []
|
captured = []
|
||||||
|
|
||||||
@@ -210,14 +210,14 @@ class TestReadVideoDecord(unittest.TestCase):
|
|||||||
def get_avg_fps(self):
|
def get_avg_fps(self):
|
||||||
return 10.0
|
return 10.0
|
||||||
|
|
||||||
with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
|
with patch.object(video, "VideoReaderWrapper", FakeWrapper):
|
||||||
reader, meta, path = read_video_decord(b"fake_video_bytes")
|
reader, meta, path = read_video_decord(b"fake_video_bytes")
|
||||||
|
|
||||||
self.assertIsInstance(captured[0], io.BytesIO)
|
self.assertIsInstance(captured[0], io.BytesIO)
|
||||||
|
|
||||||
def test_string_path_input(self):
|
def test_string_path_input(self):
|
||||||
"""String path is passed through to VideoReaderWrapper."""
|
"""String path is passed through to VideoReaderWrapper."""
|
||||||
from fastdeploy.input import video_utils
|
from fastdeploy.input.utils import video
|
||||||
|
|
||||||
class FakeWrapper:
|
class FakeWrapper:
|
||||||
def __init__(self, path, *args, **kwargs):
|
def __init__(self, path, *args, **kwargs):
|
||||||
@@ -229,7 +229,7 @@ class TestReadVideoDecord(unittest.TestCase):
|
|||||||
def get_avg_fps(self):
|
def get_avg_fps(self):
|
||||||
return 30.0
|
return 30.0
|
||||||
|
|
||||||
with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
|
with patch.object(video, "VideoReaderWrapper", FakeWrapper):
|
||||||
reader, meta, path = read_video_decord("/fake/path.mp4")
|
reader, meta, path = read_video_decord("/fake/path.mp4")
|
||||||
|
|
||||||
self.assertEqual(meta["num_of_frame"], 60)
|
self.assertEqual(meta["num_of_frame"], 60)
|
||||||
@@ -333,18 +333,18 @@ class TestSampleFramesDispatcher(unittest.TestCase):
|
|||||||
META = {"num_of_frame": 100, "fps": 25.0}
|
META = {"num_of_frame": 100, "fps": 25.0}
|
||||||
|
|
||||||
def test_default_variant_is_paddleocr(self):
|
def test_default_variant_is_paddleocr(self):
|
||||||
with patch("fastdeploy.input.video_utils.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
|
with patch("fastdeploy.input.utils.video.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
|
||||||
sample_frames(1, 4, 100, self.META, num_frames=8)
|
sample_frames(1, 4, 100, self.META, num_frames=8)
|
||||||
mock_fn.assert_called_once()
|
mock_fn.assert_called_once()
|
||||||
|
|
||||||
def test_qwen_variant_dispatched(self):
|
def test_qwen_variant_dispatched(self):
|
||||||
with patch("fastdeploy.input.video_utils.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
|
with patch("fastdeploy.input.utils.video.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
|
||||||
sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
|
sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
|
||||||
mock_fn.assert_called_once()
|
mock_fn.assert_called_once()
|
||||||
|
|
||||||
def test_qwen_none_fps_converted_to_sentinel(self):
|
def test_qwen_none_fps_converted_to_sentinel(self):
|
||||||
"""None fps/num_frames → converted to -1 before calling sample_frames_qwen."""
|
"""None fps/num_frames → converted to -1 before calling sample_frames_qwen."""
|
||||||
with patch("fastdeploy.input.video_utils.sample_frames_qwen", return_value=np.array([])) as mock_fn:
|
with patch("fastdeploy.input.utils.video.sample_frames_qwen", return_value=np.array([])) as mock_fn:
|
||||||
sample_frames(2, 4, 100, self.META, fps=None, num_frames=None, variant="qwen")
|
sample_frames(2, 4, 100, self.META, fps=None, num_frames=None, variant="qwen")
|
||||||
args = mock_fn.call_args[0]
|
args = mock_fn.call_args[0]
|
||||||
self.assertEqual(args[4], -1) # fps sentinel
|
self.assertEqual(args[4], -1) # fps sentinel
|
||||||
|
|||||||
Reference in New Issue
Block a user