[DataProcessor] Refactor multimodal processor: extract encoding strategies and unify MM processing pipeline (#7298)

* merge mm processor
This commit is contained in:
luukunn
2026-04-15 19:01:06 +08:00
committed by GitHub
parent a218d29488
commit 3f84d8d893
36 changed files with 4016 additions and 681 deletions
+15 -11
View File
@@ -435,17 +435,7 @@ class BaseTextProcessor(ABC):
request["top_k"] = 1
if self.reasoning_parser:
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
parts = request["request_id"].split("_")
if len(parts) > 1:
real_req_id = parts[0]
index = int(parts[1])
n = request.get("n", 1)
for idx in range(index * n, (index + 1) * n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
else:
self.model_status_dict[request["request_id"]] = model_status
request["enable_thinking"] = model_status == "think_start"
self._apply_reasoning_parser(request)
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
@@ -453,6 +443,20 @@ class BaseTextProcessor(ABC):
data_processor_logger.info(f"Processed request dict: {request}")
return request
def _apply_reasoning_parser(self, request):
"""Apply reasoning parser to determine model thinking status."""
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
parts = request["request_id"].split("_")
if len(parts) > 1:
real_req_id = parts[0]
index = int(parts[1])
n = request.get("n", 1)
for idx in range(index * n, (index + 1) * n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
else:
self.model_status_dict[request["request_id"]] = model_status
request["enable_thinking"] = model_status == "think_start"
def clear_request_status(self, task_id):
"""Clear all per-request decode state and return the accumulated text."""
results_all = ""
+23
View File
@@ -0,0 +1,23 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multimodal encoding strategies for VL model families."""
from fastdeploy.input.encodings.base_encoding import BaseEncoding
from fastdeploy.input.encodings.ernie_encoding import ErnieEncoding
from fastdeploy.input.encodings.paddleocr_encoding import PaddleOCREncoding
from fastdeploy.input.encodings.qwen_encoding import QwenEncoding
from fastdeploy.input.encodings.registry import EncodingRegistry
__all__ = ["BaseEncoding", "EncodingRegistry", "ErnieEncoding", "PaddleOCREncoding", "QwenEncoding"]
+189
View File
@@ -0,0 +1,189 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Abstract base class for multimodal encoding strategies.
Each encoding strategy handles model-family-specific logic such as
position ID computation, image/video preprocessing, and token counting.
New model families should subclass ``BaseEncoding`` and implement all
abstract methods.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional, Tuple
class BaseEncoding(ABC):
    """Interface that every multimodal encoding strategy implements.

    The abstract methods cover the core encoding pipeline (image/video
    ingestion, position IDs, token counting).  Optional hooks
    (``init_extra``, ``get_mm_max_tokens_per_item``) ship with no-op
    defaults, so a subclass overrides them only when it needs to.
    """

    def __init__(self, processor, processor_kwargs=None):
        kwargs = {} if processor_kwargs is None else processor_kwargs
        cfg = processor.cfg

        # Objects owned by the processor and shared with this strategy.
        self.cfg = cfg
        self.image_processor = processor.image_processor
        self.tokenizer = processor.tokenizer

        # Convolution parameters come either from request kwargs or from the
        # image processor itself, depending on the model config.
        if cfg.conv_params_from_kwargs:
            self.spatial_conv_size = kwargs.get("spatial_conv_size", 2)
            self.temporal_conv_size = kwargs.get("temporal_conv_size", 2)
        else:
            self.spatial_conv_size = self.image_processor.merge_size
            self.temporal_conv_size = self.image_processor.temporal_patch_size

        # Placeholder token IDs for image / video content.
        self.image_token_id = self.tokenizer.convert_tokens_to_ids(cfg.image_token_str)
        self.video_token_id = self.tokenizer.convert_tokens_to_ids(cfg.video_token_str)

        if cfg.has_tokens_per_second:
            vision_config = getattr(getattr(processor, "config", None), "vision_config", None)
            self.tokens_per_second = getattr(vision_config, "tokens_per_second", 2)
        else:
            self.tokens_per_second = 2

        # Default video frame-sampling parameters (per-request values win).
        self.fps = kwargs.get("video_fps", cfg.default_fps)
        self.min_frames = kwargs.get("video_min_frames", cfg.default_min_frames)
        self.max_frames = kwargs.get("video_max_frames", cfg.default_max_frames)
        self.target_frames = kwargs.get("video_target_frames", cfg.default_target_frames)

        # Give the concrete strategy a chance to set up model-specific state.
        self.init_extra(kwargs)

    # ------------------------------------------------------------------
    # Image
    # ------------------------------------------------------------------
    @abstractmethod
    def add_image(self, img, outputs: dict, uuid, token_len=None):
        """Encode a raw image and accumulate the results into *outputs*."""

    @abstractmethod
    def add_processed_image(self, img_cache, outputs: dict, uuid, token_len=None):
        """Accumulate an already-preprocessed (cached) image into *outputs*."""

    # ------------------------------------------------------------------
    # Video
    # ------------------------------------------------------------------
    @abstractmethod
    def add_video(self, frames, outputs: dict, uuid, token_len=None, meta: Optional[dict] = None):
        """Encode decoded video frames and accumulate the results into *outputs*.

        Parameters
        ----------
        frames : array-like
            Decoded video frames.
        outputs : dict
            Mutable accumulator for input_ids, position_ids, etc.
        uuid : str | None
            Unique identifier for cache lookup.
        token_len : int | None
            Expected token count (for validation against pre-tokenised prompts).
        meta : dict | None
            Video metadata (fps, duration, ...).  Strategies that need it
            (e.g. Qwen) read from this dict; those that don't (e.g. Ernie)
            simply ignore it.
        """

    @abstractmethod
    def add_processed_video(self, frames_cache, outputs: dict, uuid, token_len=None):
        """Accumulate an already-preprocessed (cached) video into *outputs*."""

    @abstractmethod
    def load_video(self, url, item: dict) -> Tuple[Any, dict]:
        """Decode the video at *url* and return a ``(frames, meta)`` pair.

        Every implementation must return a 2-tuple so the caller
        (``MultiModalProcessor.text2ids``) can unpack uniformly.
        """

    # ------------------------------------------------------------------
    # Text / position helpers
    # ------------------------------------------------------------------
    @abstractmethod
    def add_text_positions(self, outputs: dict, num_tokens: int):
        """Accumulate position IDs for *num_tokens* text tokens into *outputs*."""

    @abstractmethod
    def append_completion_tokens(self, multimodal_inputs: dict, completion_token_ids):
        """Accumulate completion token IDs (and positions) into *multimodal_inputs*."""

    # ------------------------------------------------------------------
    # Prompt-token-ids path (optional — only models with
    # supports_prompt_token_ids=True need to implement this)
    # ------------------------------------------------------------------
    def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None) -> dict:
        """Build the outputs dict from pre-tokenised ``prompt_token_ids``.

        Parameters
        ----------
        prompt_token_ids : list[int]
            Pre-tokenised token IDs.
        mm_items : list[dict] | None
            Already-extracted multimodal items (each has 'type', 'data',
            'uuid').  ``None`` means text-only.
        """
        raise NotImplementedError(f"{type(self).__name__} does not support prompt_token_ids path")

    # ------------------------------------------------------------------
    # Token counting & packing
    # ------------------------------------------------------------------
    @staticmethod
    @abstractmethod
    def mm_num_tokens(grid_thw):
        """Return how many multimodal tokens a given grid_thw produces."""

    @abstractmethod
    def pack_position_ids(self, outputs: dict):
        """Turn the intermediate position ID lists into the final packed format."""

    # ------------------------------------------------------------------
    # Outputs initialisation
    # ------------------------------------------------------------------
    def _make_outputs(self) -> dict:
        """Build the mutable accumulator dict that encoding results go into.

        Subclasses override to add model-specific fields (e.g. fps, vit fields).
        """
        return {
            "input_ids": [],
            "token_type_ids": [],
            "position_ids": [],
            "images": [],
            "grid_thw": [],
            "image_type_ids": [],
            "labels": [],
            "cur_position": 0,
            "video_cnt": 0,
            "num_input_image_tokens": 0,
            "num_input_video_tokens": 0,
            "mm_positions": [],
            "mm_hashes": [],
        }

    # ------------------------------------------------------------------
    # Optional hooks — subclasses override only when needed
    # ------------------------------------------------------------------
    def init_extra(self, processor_kwargs: dict):
        """Model-specific extra initialisation, invoked once at the end of ``__init__``."""

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Dict[str, int]]:
        """Per-modality max token counts for the scheduler; ``None`` = not applicable."""
        return None
@@ -0,0 +1,424 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie4.5-VL encoding strategy for MultiModalProcessor."""
import copy
from collections import defaultdict
import numpy as np
import paddle
from paddleformers.transformers.image_utils import ChannelDimension
from fastdeploy.engine.request import ImagePosition
from fastdeploy.input.encodings.base_encoding import BaseEncoding
from fastdeploy.input.encodings.registry import EncodingRegistry
from fastdeploy.input.mm_model_config import ERNIE4_5_VL
from fastdeploy.input.utils import IDS_TYPE_FLAG, MAX_IMAGE_DIMENSION
from fastdeploy.multimodal.hasher import MultimodalHasher
@EncodingRegistry.register(ERNIE4_5_VL)
class ErnieEncoding(BaseEncoding):
    """Encoding strategy for Ernie4.5-VL models."""

    # Boundary token constants (string form; converted to IDs via the tokenizer
    # where needed, e.g. in prompt_token_ids2outputs).
    IMG_START = "<|IMAGE_START|>"
    IMG_END = "<|IMAGE_END|>"
    VID_START = "<|VIDEO_START|>"
    VID_END = "<|VIDEO_END|>"

    def init_extra(self, processor_kwargs):
        """Ernie-specific extra initialisation (pixel params, token type mapping, etc.)."""
        # Smart-resize pixel-count bounds for still images and video frames
        # (expressed in multiples of 28x28 patches).
        self.image_min_pixels = processor_kwargs.get("image_min_pixels", 4 * 28 * 28)
        self.image_max_pixels = processor_kwargs.get("image_max_pixels", 6177 * 28 * 28)
        self.video_min_pixels = processor_kwargs.get("video_min_pixels", 299 * 28 * 28)
        self.video_max_pixels = processor_kwargs.get("video_max_pixels", 1196 * 28 * 28)
        self.frames_sample = processor_kwargs.get("video_frames_sample", self.cfg.default_frames_sample)
        # Build token-type mapping for ernie boundary tokens
        self.token_type_mapping = self._build_token_type_mapping()

    def _build_token_type_mapping(self):
        """Map boundary tokens and the image placeholder to the image type flag.

        Unknown tokens default to the text flag via the defaultdict factory.
        """
        mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"])
        for token in (self.IMG_START, self.IMG_END, self.VID_START, self.VID_END):
            mapping[token] = IDS_TYPE_FLAG["image"]
        mapping[self.image_token_id] = IDS_TYPE_FLAG["image"]
        return mapping

    def add_image(self, img, outputs, uuid, token_len=None):
        """Preprocess a raw image and append tokens/positions/features to *outputs*.

        Raises ValueError when *token_len* (expected placeholder count from a
        pre-tokenised prompt) disagrees with the computed token count.
        """
        # Smart resize chooses the patch grid; index [1] selects the patch counts.
        patches_h, patches_w = self.image_processor.get_smarted_resize(
            img.height,
            img.width,
            min_pixels=self.image_min_pixels,
            max_pixels=self.image_max_pixels,
        )[1]
        # One token per spatial_conv_size x spatial_conv_size patch block.
        num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
        if token_len and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += num_tokens
        # 3-D (t, h, w) positions; t == 1 for a still image.
        pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
        outputs["position_ids"].extend(pos_ids)
        outputs["cur_position"] = np.max(pos_ids) + 1
        ret = self.image_processor.preprocess(
            images=[img.convert("RGB")],
            do_normalize=False,
            do_rescale=False,
            predetermined_grid_thw=np.array([[patches_h, patches_w]]),
            do_convert_rgb=True,
            input_data_format=ChannelDimension.LAST,
        )
        outputs["images"].append(ret["pixel_values"])
        # Hash the features only when the caller supplied no uuid for the item.
        if not uuid:
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(ret["image_grid_thw"])
        outputs["image_type_ids"].append(0)

    def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
        """Append a cached (already-preprocessed) image to *outputs*.

        *img_cache* is a ``(features, meta)`` pair; ``meta["thw"]`` carries the
        (t, h, w) patch grid — assumes t == 1 for images (TODO confirm).
        """
        img, meta = img_cache
        # Cached features carry one row per patch; conv collapses them into tokens.
        num_tokens = img.shape[0] // (self.spatial_conv_size**2)
        if token_len and num_tokens != token_len:
            raise ValueError("image tokens num not match the size")
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += num_tokens
        _, h, w = meta["thw"]
        pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
        outputs["position_ids"].extend(pos_ids)
        outputs["cur_position"] = np.max(pos_ids) + 1
        outputs["images"].append(img)
        # Cached items always carry their uuid as the hash.
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[1, h, w]]))
        outputs["image_type_ids"].append(0)

    def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
        """Preprocess decoded video frames and append tokens/positions to *outputs*.

        *meta* is accepted for interface parity with other strategies but is
        not used by Ernie.
        """
        # All frames share one smart-resized patch grid, taken from frame 0.
        patches_h, patches_w = self.image_processor.get_smarted_resize(
            frames[0].height,
            frames[0].width,
            min_pixels=self.video_min_pixels,
            max_pixels=self.video_max_pixels,
        )[1]
        num_frames = len(frames)
        # Spatial conv merges patch blocks; temporal conv additionally merges frames.
        num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
        if token_len and num_tokens != token_len:
            raise ValueError("video tokens num not match the size")
        pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
        ret = self.image_processor.preprocess(
            images=None,
            videos=pixel_stack,
            do_normalize=False,
            do_rescale=False,
            predetermined_grid_thw=np.array([[patches_h, patches_w]] * num_frames),
            do_convert_rgb=True,
            input_data_format=ChannelDimension.LAST,
        )
        outputs["images"].append(ret["pixel_values_videos"])
        if not uuid:
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(ret["video_grid_thw"])
        # One image_type_id (1 == video frame) per input frame.
        outputs["image_type_ids"].extend([1] * num_frames)
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += num_tokens
        pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
        outputs["position_ids"].extend(pos_ids)
        outputs["cur_position"] = np.max(pos_ids) + 1

    def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
        """Append a cached (already-preprocessed) video to *outputs*.

        *frames_cache* is a ``(features, meta)`` pair; ``meta["thw"]`` carries
        the (t, h, w) patch grid of the cached tensor.
        """
        frames, meta = frames_cache
        num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
        if token_len and num_tokens != token_len:
            raise ValueError("video tokens num not match the size")
        t, h, w = meta["thw"]
        outputs["images"].append(frames)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[t, h, w]]))
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += num_tokens
        outputs["image_type_ids"].extend([1] * t)
        pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
        outputs["position_ids"].extend(pos_ids)
        outputs["cur_position"] = np.max(pos_ids) + 1

    def load_video(self, url, item):
        """Decode a video from *url*, sample + timestamp frames, return ``(frames, {})``.

        Ernie ignores video metadata downstream, hence the empty meta dict.
        """
        from fastdeploy.input.utils.render_timestamp import render_frame_timestamp
        from fastdeploy.input.utils.video import read_frames_decord, read_video_decord

        reader, meta, path = read_video_decord(url, save_to_disk=False)
        # Per-item values override the strategy defaults configured at init.
        video_frame_args = {
            "fps": item.get("fps", self.fps),
            "min_frames": item.get("min_frames", self.min_frames),
            "max_frames": item.get("max_frames", self.max_frames),
            "target_frames": item.get("target_frames", self.target_frames),
            "frames_sample": item.get("frames_sample", self.frames_sample),
        }
        video_frame_args = self.set_video_frame_args(video_frame_args, meta)
        frames_data, _, timestamps = read_frames_decord(
            path,
            reader,
            meta,
            target_frames=video_frame_args["target_frames"],
            target_fps=video_frame_args["fps"],
            frames_sample=video_frame_args["frames_sample"],
            save_to_disk=False,
        )
        # Burn each frame's timestamp into the frame image.
        frames = []
        for img_array, ts in zip(frames_data, timestamps):
            frames.append(render_frame_timestamp(img_array, ts))
        # Ensure even number of frames for temporal conv
        if len(frames) % 2 != 0:
            frames.append(copy.deepcopy(frames[-1]))
        return frames, {}

    def set_video_frame_args(self, video_frame_args, video_meta):
        """Set final frame sampling args based on priorities."""
        if video_frame_args["target_frames"] > 0:
            # Explicit target_frames: fps must be disabled and the target must
            # lie within the configured [min_frames, max_frames] bounds.
            if video_frame_args["fps"] >= 0:
                raise ValueError("fps must be negative if target_frames is given")
            if (
                video_frame_args["min_frames"] > 0
                and video_frame_args["target_frames"] < video_frame_args["min_frames"]
            ):
                raise ValueError("target_frames must be larger than min_frames")
            if (
                video_frame_args["max_frames"] > 0
                and video_frame_args["target_frames"] > video_frame_args["max_frames"]
            ):
                raise ValueError("target_frames must be smaller than max_frames")
        else:
            # fps-driven sampling: derive the frame count from duration, then
            # clamp into [min_frames, max_frames] by switching to target_frames.
            if video_frame_args["fps"] < 0:
                raise ValueError("Must provide either positive target_fps or positive target_frames.")
            frames_to_extract = int(video_meta["duration"] * video_frame_args["fps"])
            if (
                video_frame_args["min_frames"] > 0
                and video_frame_args["max_frames"] > 0
                and video_frame_args["min_frames"] > video_frame_args["max_frames"]
            ):
                raise ValueError("min_frames must be smaller than max_frames")
            if video_frame_args["min_frames"] > 0 and frames_to_extract < video_frame_args["min_frames"]:
                # Too few frames at this fps: sample exactly min_frames instead.
                video_frame_args["target_frames"] = video_frame_args["min_frames"]
                video_frame_args["fps"] = -1
            if video_frame_args["max_frames"] > 0 and frames_to_extract > video_frame_args["max_frames"]:
                # Too many frames at this fps: cap at max_frames.
                video_frame_args["target_frames"] = video_frame_args["max_frames"]
                video_frame_args["fps"] = -1
        return video_frame_args

    def add_text_positions(self, outputs, num_tokens):
        """Write text position IDs in ernie [pos, pos, pos] format."""
        start = outputs["cur_position"]
        for i in range(num_tokens):
            outputs["position_ids"].append([start + i] * 3)
        outputs["cur_position"] += num_tokens

    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
        """Append completion token IDs, text type flags, and 3-D positions."""
        num_tokens = len(completion_token_ids)
        multimodal_inputs["input_ids"].extend(completion_token_ids)
        multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
        start = multimodal_inputs["cur_position"]
        for i in range(num_tokens):
            multimodal_inputs["position_ids"].append([start + i] * 3)
        multimodal_inputs["cur_position"] += num_tokens

    def _compute_3d_positions(self, t, h, w, start_idx):
        """Compute 3D position IDs as list-of-lists for ernie format."""
        # Still images keep t_eff == 1; videos collapse frames temporally.
        t_eff = t // self.temporal_conv_size if t != 1 else 1
        gh, gw = h // self.spatial_conv_size, w // self.spatial_conv_size
        # Enumerate (time, row, col) coordinates of the post-conv token grid.
        time_idx = np.repeat(np.arange(t_eff), gh * gw)
        h_idx = np.tile(np.repeat(np.arange(gh), gw), t_eff)
        w_idx = np.tile(np.arange(gw), t_eff * gh)
        coords = list(zip(time_idx, h_idx, w_idx))
        # Offset every axis by the running position counter.
        return [[start_idx + ti, start_idx + hi, start_idx + wi] for ti, hi, wi in coords]

    def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None):
        """Build the outputs dict from pre-tokenised ids, expanding mm placeholders.

        Walks the token stream; runs delimited by IMG_START/IMG_END (or
        VID_START/VID_END) are replaced by the encoded image/video from
        *mm_items*, validated against the placeholder length.  Raises
        ValueError on placeholder/item count mismatches or unterminated spans.
        """
        outputs = self._make_outputs()
        prompt_token_ids_len = len(prompt_token_ids)
        # Text-only fast path: every token is text with replicated 3-D positions.
        if mm_items is None:
            outputs["input_ids"].extend(prompt_token_ids)
            outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * prompt_token_ids_len)
            for i in range(prompt_token_ids_len):
                outputs["position_ids"].append([i] * 3)
            outputs["cur_position"] += prompt_token_ids_len
            return outputs
        # Split items by modality, keeping uuids aligned by index.
        images, videos = [], []
        image_uuid, video_uuid = [], []
        for item in mm_items:
            if item.get("type") == "image":
                images.append(item["data"])
                image_uuid.append(item.get("uuid"))
            elif item.get("type") == "video":
                videos.append(item["data"])
                video_uuid.append(item.get("uuid"))
        image_start_id = self.tokenizer.convert_tokens_to_ids(self.IMG_START)
        image_end_id = self.tokenizer.convert_tokens_to_ids(self.IMG_END)
        video_start_id = self.tokenizer.convert_tokens_to_ids(self.VID_START)
        video_end_id = self.tokenizer.convert_tokens_to_ids(self.VID_END)
        st, image_idx, video_idx = 0, 0, 0
        while st < prompt_token_ids_len:
            cur_token_id = prompt_token_ids[st]
            if cur_token_id == image_start_id:
                if image_idx >= len(images):
                    raise ValueError("prompt token ids has more image placeholder than in messages")
                # append image_start_id
                outputs["input_ids"].extend([cur_token_id])
                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]])
                outputs["position_ids"].append([outputs["cur_position"]] * 3)
                outputs["cur_position"] += 1
                st += 1
                # process placeholder token ids
                cur_idx = st
                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != image_end_id:
                    cur_idx += 1
                if cur_idx >= prompt_token_ids_len:
                    raise ValueError("image token ids not complete")
                image = images[image_idx]
                uuid = image_uuid[image_idx] if image_uuid else None
                token_len = cur_idx - st
                # A tuple marks a cached (features, meta) entry; otherwise raw.
                if not isinstance(image, tuple):
                    self.add_image(image, outputs, uuid, token_len)
                else:
                    self.add_processed_image(image, outputs, uuid, token_len)
                image_idx += 1
                # Leave st on the end token so the else-branch records it.
                st = cur_idx
            elif cur_token_id == video_start_id:
                if video_idx >= len(videos):
                    raise ValueError("prompt token ids has more video placeholder than in messages")
                # append video_start_id
                outputs["input_ids"].extend([cur_token_id])
                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]])
                outputs["position_ids"].append([outputs["cur_position"]] * 3)
                outputs["cur_position"] += 1
                st += 1
                # process placeholder token ids
                cur_idx = st
                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != video_end_id:
                    cur_idx += 1
                if cur_idx >= prompt_token_ids_len:
                    raise ValueError("video token ids not complete")
                video = videos[video_idx]
                uuid = video_uuid[video_idx] if video_uuid else None
                token_len = cur_idx - st
                if not isinstance(video, tuple):
                    # Raw videos may arrive as a dict with sampling params or a bare url.
                    if isinstance(video, dict):
                        frames, _ = self.load_video(video["video"], video)
                    else:
                        frames, _ = self.load_video(video, {})
                    self.add_video(frames, outputs, uuid, token_len=token_len)
                else:
                    self.add_processed_video(video, outputs, uuid, token_len)
                video_idx += 1
                st = cur_idx
            else:
                # Plain token (including the end boundary tokens reached above).
                outputs["input_ids"].extend([cur_token_id])
                type_flag = (
                    IDS_TYPE_FLAG["image"] if cur_token_id in (image_end_id, video_end_id) else IDS_TYPE_FLAG["text"]
                )
                outputs["token_type_ids"].extend([type_flag])
                outputs["position_ids"].append([outputs["cur_position"]] * 3)
                outputs["cur_position"] += 1
                st += 1
        if image_idx != len(images):
            raise ValueError("number of images does not match")
        if video_idx != len(videos):
            raise ValueError("number of videos does not match")
        return outputs

    @staticmethod
    def mm_num_tokens(grid_thw):
        """Ernie mm_num_tokens: video (t>1) divides by an extra 2."""
        if isinstance(grid_thw, paddle.Tensor):
            grid_thw = grid_thw.numpy()
        if len(grid_thw) == 0:
            return 0

        def calc_one(thw):
            # // 4 corresponds to the 2x2 spatial merge; videos divide by an
            # extra 2 for the temporal merge.
            t, h, w = map(int, thw)
            if t == 1:
                return t * h * w // 4
            else:
                return t * h * w // 4 // 2

        # Accept either a single (t, h, w) triple or a batch of them.
        if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
            return [calc_one(x) for x in grid_thw]
        return calc_one(grid_thw)

    def pack_position_ids(self, outputs):
        """Ernie: position_ids is np.array (list-of-lists -> ndarray)."""
        outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
        outputs["image_patch_id"] = self.image_token_id

    def get_mm_max_tokens_per_item(self, seq_len):
        """Per-modality max token counts for ernie."""
        target_height, target_width = self._get_image_size_with_most_features()
        # image
        patches_h, patches_w = self.image_processor.get_smarted_resize(
            height=target_height,
            width=target_width,
            min_pixels=self.image_min_pixels,
            max_pixels=self.image_max_pixels,
        )[1]
        max_image_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
        # Never report more tokens than the sequence can hold.
        max_image_tokens = min(max_image_tokens, seq_len)
        # video
        patches_h, patches_w = self.image_processor.get_smarted_resize(
            height=target_height,
            width=target_width,
            min_pixels=self.video_min_pixels,
            max_pixels=self.video_max_pixels,
        )[1]
        max_video_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
        max_video_tokens = min(max_video_tokens, seq_len)
        return {"image": max_image_tokens, "video": max_video_tokens}

    def _get_image_size_with_most_features(self):
        """Return the resized (h, w) a maximally large input image would get."""
        # Index [0] of get_smarted_resize is the resized pixel size (not the
        # patch grid) — NOTE(review): inferred from usage; confirm.
        resized_height, resized_width = self.image_processor.get_smarted_resize(
            height=MAX_IMAGE_DIMENSION,
            width=MAX_IMAGE_DIMENSION,
            min_pixels=self.image_min_pixels,
            max_pixels=self.image_max_pixels,
        )[0]
        return (resized_height, resized_width)
@@ -0,0 +1,190 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PaddleOCR-VL encoding strategy."""
import numpy as np
from PIL import Image
from fastdeploy.engine.request import ImagePosition
from fastdeploy.input.encodings.qwen_encoding import QwenEncoding
from fastdeploy.input.encodings.registry import EncodingRegistry
from fastdeploy.input.mm_model_config import PADDLEOCR_VL
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.utils.video import read_video_decord
from fastdeploy.input.utils.video import sample_frames_paddleocr as _sample_paddleocr
from fastdeploy.multimodal.hasher import MultimodalHasher
@EncodingRegistry.register(PADDLEOCR_VL)
class PaddleOCREncoding(QwenEncoding):
"""Encoding strategy for paddleocr_vl.
Inherits from QwenEncoding and overrides methods that differ:
- _make_outputs: add vit_seqlen / vit_position_ids
- add_image / add_video: append vit_fields (vit_seqlen, vit_position_ids)
- add_video / add_processed_video: use video_token_id instead of image_token_id
- load_video: use sample_frames_paddleocr instead of sample_frames_qwen
"""
def _make_outputs(self) -> dict:
outputs = super()._make_outputs()
outputs["vit_seqlen"] = []
outputs["vit_position_ids"] = []
return outputs
def add_image(self, img, outputs, uuid, token_len=None):
ret = self.image_processor.preprocess(images=[img.convert("RGB")])
num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
grid_thw = ret["grid_thw"].tolist()
if token_len is not None and token_len != num_tokens:
raise ValueError("image tokens num not match the size")
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
outputs["input_ids"].extend([self.image_token_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
outputs["num_input_image_tokens"] += int(num_tokens)
outputs["images"].append(ret["pixel_values"])
if not uuid:
outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
else:
outputs["mm_hashes"].append(uuid)
outputs["grid_thw"].append(grid_thw)
outputs["image_type_ids"].append(0)
t, h, w = grid_thw
pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
outputs["position_ids"].append(pos_ids)
outputs["cur_position"] = pos_ids.max() + 1
outputs["fps"].append(0)
# paddleocr vit fields
numel = h * w
outputs["vit_seqlen"].append(numel)
outputs["vit_position_ids"].append(np.arange(numel) % numel)
def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
super().add_processed_image(img_cache, outputs, uuid, token_len)
_, h, w = img_cache[1]["thw"]
numel = h * w
outputs["vit_seqlen"].append(numel)
outputs["vit_position_ids"].append(np.arange(numel) % numel)
def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
preprocess_kwargs = {}
if self.cfg.video_min_pixels is not None:
preprocess_kwargs["min_pixels"] = self.cfg.video_min_pixels
preprocess_kwargs["max_pixels"] = self.cfg.video_max_pixels
ret = self.image_processor.preprocess(images=frames, **preprocess_kwargs)
num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
grid_thw = ret["grid_thw"].tolist()
if token_len is not None and token_len != num_tokens:
raise ValueError("video tokens num not match the size")
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
outputs["input_ids"].extend([self.video_token_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
outputs["num_input_video_tokens"] += int(num_tokens)
outputs["images"].append(ret["pixel_values"])
if not uuid:
outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
else:
outputs["mm_hashes"].append(uuid)
outputs["grid_thw"].append(grid_thw)
outputs["image_type_ids"].extend([1] * grid_thw[0])
fps = meta["fps"] if meta else 0
second_per_grid_t = self.temporal_conv_size / fps if fps else 0
t, h, w = grid_thw
pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
outputs["position_ids"].append(pos_ids)
outputs["cur_position"] = pos_ids.max() + 1
outputs["fps"].append(fps)
# paddleocr vit fields
numel = h * w
outputs["vit_seqlen"].append(numel)
outputs["vit_position_ids"].append(np.arange(numel) % numel)
def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
frames, meta = frames_cache
num_tokens = frames.shape[0] // self.image_processor.merge_size**2
if token_len is not None and token_len != num_tokens:
raise ValueError("video tokens num not match the size")
t, h, w = meta["thw"]
outputs["images"].append(frames)
outputs["mm_hashes"].append(uuid)
outputs["grid_thw"].append(np.array([[t, h, w]]))
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
outputs["input_ids"].extend([self.video_token_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
outputs["num_input_video_tokens"] += num_tokens
outputs["image_type_ids"].extend([1] * t)
fps = meta["fps"]
second_per_grid_t = self.temporal_conv_size / fps
pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
outputs["position_ids"].append(pos_ids)
outputs["cur_position"] = pos_ids.max() + 1
outputs["fps"].append(fps)
# paddleocr vit fields
numel = h * w
outputs["vit_seqlen"].append(numel)
outputs["vit_position_ids"].append(np.arange(numel) % numel)
def load_video(self, url, item):
    """Decode a video, sample frames, and return ``(frames, meta)``.

    Args:
        url: path/URL handed to ``read_video_decord``.
        item: per-request dict that may override the processor-level
            sampling defaults (``fps``, ``target_frames``, ``min_frames``,
            ``max_frames``).

    Returns:
        frames: uint8 ndarray of shape (num_frames, H, W, 3).
        meta: decord metadata updated with the sampled frame count and the
            effective fps/duration.
    """
    reader, meta, _ = read_video_decord(url, save_to_disk=False)
    fps = item.get("fps", self.fps)
    num_frames = item.get("target_frames", self.target_frames)
    frame_indices = list(range(meta["num_of_frame"]))
    if fps > 0 or num_frames > 0:
        min_frames = item.get("min_frames", self.min_frames)
        max_frames = item.get("max_frames", self.max_frames)
        frame_indices = _sample_paddleocr(
            frame_factor=self.temporal_conv_size,
            min_frames=min_frames,
            max_frames=max_frames,
            metadata=meta,
            fps=fps,
            num_frames=num_frames,
        )
    meta["num_of_frame"] = len(frame_indices)
    # Only a genuinely positive caller fps defines the effective rate.
    # The previous `fps is not None` test let fps == 0 divide by zero and
    # let the paddleocr sentinel fps = -1 record a negative fps/duration.
    if fps is not None and fps > 0:
        meta["fps"] = fps
        meta["duration"] = len(frame_indices) / fps
    else:
        meta["fps"] = len(frame_indices) / meta["duration"]
    frames = []
    for idx in frame_indices:
        frame = reader[idx].asnumpy()
        frames.append(Image.fromarray(frame, "RGB"))
    frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
    return frames, meta
+314
View File
@@ -0,0 +1,314 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen-family (qwen_vl / qwen3_vl) encoding strategy."""
import numpy as np
import paddle
from PIL import Image
from fastdeploy.engine.request import ImagePosition
from fastdeploy.input.encodings.base_encoding import BaseEncoding
from fastdeploy.input.encodings.registry import EncodingRegistry
from fastdeploy.input.mm_model_config import QWEN3_VL, QWEN_VL
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.utils.video import read_video_decord
from fastdeploy.input.utils.video import sample_frames_qwen as _sample_qwen
from fastdeploy.multimodal.hasher import MultimodalHasher
@EncodingRegistry.register(QWEN_VL, QWEN3_VL)
class QwenEncoding(BaseEncoding):
    """Encoding strategy for qwen_vl and qwen3_vl.

    Converts raw or pre-processed images/videos plus text into model inputs:
    placeholder ``input_ids``, ``token_type_ids``, pixel features, grid
    shapes, and 3D (t, h, w) rope position ids in qwen's 3xN layout.
    """

    # Sampled frame counts are rounded to a multiple of this factor
    # (qwen consumes frames in temporal pairs).
    FRAME_FACTOR = 2

    def _make_outputs(self) -> dict:
        """Extend the base outputs dict with a per-mm-item ``fps`` list."""
        outputs = super()._make_outputs()
        outputs["fps"] = []
        return outputs

    def add_image(self, img, outputs, uuid, token_len=None):
        """Preprocess a raw PIL image and append its tokens/features.

        Raises ValueError when *token_len* (placeholder count seen in the
        prompt) disagrees with the preprocessor's token count.
        """
        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")
        # Record the placeholder span BEFORE extending input_ids.
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += int(num_tokens)
        outputs["images"].append(ret["pixel_values"])
        if not uuid:
            # No caller-supplied uuid: derive a content hash for dedup/caching.
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].append(0)
        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1
        outputs["fps"].append(0)

    def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
        """Append an already-preprocessed image ``(pixels, meta)`` to outputs."""
        img, meta = img_cache
        num_tokens = img.shape[0] // self.image_processor.merge_size**2
        if token_len is not None and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += num_tokens
        _, h, w = meta["thw"]
        # Processed single images are treated as t == 1.
        pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1
        outputs["images"].append(img)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[1, h, w]]))
        outputs["image_type_ids"].append(0)
        outputs["fps"].append(0)

    def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
        """Preprocess raw video frames and append their tokens/features."""
        preprocess_kwargs = {}
        # qwen3_vl passes min/max pixels for video
        if self.cfg.video_min_pixels is not None:
            preprocess_kwargs["min_pixels"] = self.cfg.video_min_pixels
            preprocess_kwargs["max_pixels"] = self.cfg.video_max_pixels
        ret = self.image_processor.preprocess(images=frames, **preprocess_kwargs)
        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        # NOTE(review): video placeholders reuse image_token_id here; the
        # prompt scan in prompt_token_ids2outputs also matches image_token_id
        # for videos, so this appears intentional for the qwen family.
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += int(num_tokens)
        outputs["images"].append(ret["pixel_values"])
        if not uuid:
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].extend([1] * grid_thw[0])
        fps = meta["fps"] if meta else 0
        second_per_grid_t = self.temporal_conv_size / fps if fps else 0
        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1
        outputs["fps"].append(fps)

    def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
        """Append an already-preprocessed video ``(frames, meta)`` to outputs."""
        frames, meta = frames_cache
        num_tokens = frames.shape[0] // self.image_processor.merge_size**2
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")
        t, h, w = meta["thw"]
        outputs["images"].append(frames)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[t, h, w]]))
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += num_tokens
        outputs["image_type_ids"].extend([1] * t)
        fps = meta["fps"]
        # Guard fps == 0 to avoid ZeroDivisionError; parity with add_video().
        second_per_grid_t = self.temporal_conv_size / fps if fps else 0
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1
        outputs["fps"].append(fps)

    def load_video(self, url, item):
        """Decode a video, sample frames, and return ``(frames, meta)``.

        ``item`` may override the processor-level sampling defaults
        (``fps``, ``target_frames``, ``min_frames``, ``max_frames``).
        """
        reader, meta, _ = read_video_decord(url, save_to_disk=False)
        fps = item.get("fps", self.fps)
        num_frames = item.get("target_frames", self.target_frames)
        frame_indices = list(range(meta["num_of_frame"]))
        if fps > 0 or num_frames > 0:
            min_frames = item.get("min_frames", self.min_frames)
            max_frames = item.get("max_frames", self.max_frames)
            frame_indices = _sample_qwen(
                frame_factor=self.FRAME_FACTOR,
                min_frames=min_frames,
                max_frames=max_frames,
                metadata=meta,
                # An explicit target frame count takes precedence over fps.
                fps=-1 if num_frames > 0 else fps,
                num_frames=num_frames,
            )
        meta["num_of_frame"] = len(frame_indices)
        # Truthiness check: the previous `fps is not None` let fps == 0 fall
        # through to a ZeroDivisionError; 0/None now derive the effective
        # rate from the sampled frame count instead.
        if fps:
            meta["fps"] = fps
            meta["duration"] = len(frame_indices) / fps
        else:
            meta["fps"] = len(frame_indices) / meta["duration"]
        frames = []
        for idx in frame_indices:
            frame = reader[idx].asnumpy()
            frames.append(Image.fromarray(frame, "RGB"))
        frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
        return frames, meta

    def add_text_positions(self, outputs, num_tokens):
        """Write text position IDs in qwen 3xN ndarray format."""
        pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
        """Append completion tokens (text) to an existing multimodal input dict."""
        num_tokens = len(completion_token_ids)
        multimodal_inputs["input_ids"].extend(completion_token_ids)
        multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
        pos_ids = self._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
        multimodal_inputs["position_ids"].append(pos_ids)
        multimodal_inputs["cur_position"] += num_tokens

    def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None):
        """Build outputs from prompt_token_ids. Only qwen3_vl supports this.

        Walks the token stream, mapping each contiguous run of
        ``image_token_id`` placeholders to the next item in *mm_items*
        (images and videos both use image_token_id placeholders here).
        """
        outputs = self._make_outputs()
        prompt_token_ids_len = len(prompt_token_ids)
        if mm_items is None:
            self._add_text_tokens(prompt_token_ids, outputs)
            return outputs
        st, mm_idx = 0, 0
        while st < prompt_token_ids_len:
            if prompt_token_ids[st] != self.image_token_id:
                # Consume a run of plain text tokens.
                cur_idx = st
                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.image_token_id:
                    cur_idx += 1
                self._add_text_tokens(prompt_token_ids[st:cur_idx], outputs)
                st = cur_idx
                continue
            if mm_idx >= len(mm_items):
                raise ValueError("prompt token ids has more multimodal placeholder than in messages")
            # Consume a run of multimodal placeholder tokens.
            cur_idx = st
            while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] == self.image_token_id:
                cur_idx += 1
            item = mm_items[mm_idx]
            uuid = item.get("uuid")
            token_len = cur_idx - st
            if item.get("type") == "image":
                image = item.get("data")
                # A tuple marks pre-processed (pixels, meta) from the cache.
                if not isinstance(image, tuple):
                    self.add_image(image, outputs, uuid, token_len)
                else:
                    self.add_processed_image(image, outputs, uuid, token_len)
            elif item.get("type") == "video":
                video = item.get("data")
                if not isinstance(video, tuple):
                    if isinstance(video, dict):
                        frames, meta = self.load_video(video["video"], video)
                    else:
                        frames, meta = self.load_video(video, {})
                    self.add_video(frames, outputs, uuid, token_len=token_len, meta=meta)
                else:
                    self.add_processed_video(video, outputs, uuid, token_len)
            else:
                raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
            mm_idx += 1
            st = cur_idx
        if mm_idx != len(mm_items):
            raise ValueError("number of multimodal items does not match prompt token ids")
        return outputs

    def _add_text_tokens(self, tokens, outputs):
        """Helper: add text tokens with position IDs."""
        if not tokens:
            return
        num_tokens = len(tokens)
        outputs["input_ids"].extend(tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
        self.add_text_positions(outputs, num_tokens)

    def _compute_text_positions(self, start_pos, num_tokens):
        """3xN ndarray for qwen-family text positions (t == h == w index)."""
        text_array = np.arange(num_tokens).reshape(1, -1)
        text_index = np.broadcast_to(text_array, (3, num_tokens))
        return text_index + start_pos

    def _compute_vision_positions(self, start_pos, t, h, w, second_per_grid_t):
        """3D position IDs as 3xN ndarray for qwen-family.

        h/w are given in patch units and reduced by the spatial merge size.
        """
        h //= self.spatial_conv_size
        w //= self.spatial_conv_size
        tn = np.arange(t).reshape(-1, 1)
        tn = np.broadcast_to(tn, (t, h * w))
        # NOTE(review): int() truncates fractional second_per_grid_t (e.g.
        # 0.5 -> 0) before scaling by tokens_per_second — confirm this matches
        # the reference qwen2-vl M-RoPE implementation before changing it.
        tn = tn * int(second_per_grid_t) * self.tokens_per_second
        t_index = tn.flatten()
        hn = np.arange(h).reshape(1, -1, 1)
        h_index = np.broadcast_to(hn, (t, h, w)).flatten()
        wn = np.arange(w).reshape(1, 1, -1)
        w_index = np.broadcast_to(wn, (t, h, w)).flatten()
        return np.stack([t_index, h_index, w_index]) + start_pos

    @staticmethod
    def mm_num_tokens(grid_thw):
        """Qwen mm_num_tokens: t * h * w // 4 (2x2 spatial merge).

        Accepts one (t, h, w) triple or a sequence of them; returns an int
        or list of ints accordingly.
        """
        if isinstance(grid_thw, paddle.Tensor):
            grid_thw = grid_thw.numpy()
        if len(grid_thw) == 0:
            return 0

        def calc_one(thw):
            t, h, w = map(int, thw)
            return t * h * w // 4

        if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
            return [calc_one(x) for x in grid_thw]
        return calc_one(grid_thw)

    def pack_position_ids(self, outputs):
        """Qwen: concatenate 3xN arrays, then transpose to Nx3."""
        outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
        outputs["image_patch_id"] = self.image_token_id
        outputs["video_patch_id"] = self.video_token_id
        outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
+54
View File
@@ -0,0 +1,54 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Registry for multimodal encoding strategy classes."""
from typing import Dict, Type
class EncodingRegistry:
    """Maps model_type strings to encoding strategy classes.

    Encoding classes register themselves via the ``register`` decorator
    at import time. ``MultiModalProcessor`` queries this registry by
    *model_type* instead of using string-based dynamic imports.
    """

    _registry: Dict[str, Type] = {}

    @classmethod
    def register(cls, *model_types: str):
        """Decorator that registers an encoding class for one or more model types."""

        def decorator(enc_cls):
            for model_type in model_types:
                existing = cls._registry.get(model_type)
                if existing is not None:
                    raise ValueError(
                        f"Encoding for '{model_type}' already registered "
                        f"as {existing.__name__}, "
                        f"cannot re-register as {enc_cls.__name__}"
                    )
                cls._registry[model_type] = enc_cls
            # Return the class unchanged so the decorator is transparent.
            return enc_cls

        return decorator

    @classmethod
    def get(cls, model_type: str) -> Type:
        """Look up the encoding class for a given *model_type*."""
        try:
            return cls._registry[model_type]
        except KeyError:
            raise ValueError(
                f"No encoding registered for '{model_type}'. " f"Available: {sorted(cls._registry.keys())}"
            ) from None
@@ -539,6 +539,7 @@ class DataProcessor(MMBaseDataProcessor):
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
outputs["num_input_image_tokens"] += num_tokens
_, h, w = meta["thw"]
pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
@@ -605,6 +606,7 @@ class DataProcessor(MMBaseDataProcessor):
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
outputs["num_input_video_tokens"] += num_tokens
outputs["image_type_ids"].extend([1] * t)
pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
@@ -25,3 +25,6 @@ from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
ImageProcessor as QwenImageProcessor,
)
from fastdeploy.input.image_processors.registry import ( # noqa: F401
ImageProcessorRegistry,
)
@@ -46,6 +46,8 @@ from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
from fastdeploy.input.mm_model_config import ERNIE4_5_VL
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
@@ -116,6 +118,7 @@ def make_batched_videos(videos) -> List[VideoInput]:
raise ValueError(f"Could not make batched video from {videos}")
@ImageProcessorRegistry.register(ERNIE4_5_VL)
class AdaptiveImageProcessor(BaseImageProcessor):
r"""
Constructs a adaptive image processor that dynamically resizes images based on the original images.
@@ -33,6 +33,8 @@ from paddleformers.transformers.image_utils import (
from fastdeploy.input.image_processors.common import (
smart_resize_paddleocr as smart_resize,
)
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
from fastdeploy.input.mm_model_config import PADDLEOCR_VL
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
@@ -66,6 +68,7 @@ def adjust_size(size, patch_size):
return num_patches * patch_size
@ImageProcessorRegistry.register(PADDLEOCR_VL)
class ImageProcessor(BaseImageProcessor):
model_input_names = [
"pixel_values",
@@ -41,6 +41,8 @@ from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
from fastdeploy.input.mm_model_config import QWEN3_VL
from fastdeploy.utils import data_processor_logger
IMAGE_MEAN = [0.5, 0.5, 0.5]
@@ -62,6 +64,7 @@ VideoInput = Union[
]
@ImageProcessorRegistry.register(QWEN3_VL)
class ImageProcessor(BaseImageProcessor):
"""
Adaptive image processor for dynamic image resizing and preprocessing.
@@ -41,6 +41,8 @@ from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
from fastdeploy.input.mm_model_config import QWEN_VL
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
@@ -62,6 +64,7 @@ VideoInput = Union[
]
@ImageProcessorRegistry.register(QWEN_VL)
class ImageProcessor(BaseImageProcessor):
"""
Adaptive image processor for dynamic image resizing and preprocessing.
@@ -0,0 +1,54 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Registry for multimodal image processor classes."""
from typing import Dict, Type
class ImageProcessorRegistry:
    """Maps model_type strings to image processor classes.

    Image processors register themselves via the ``register`` decorator
    at import time. ``MultiModalProcessor`` queries this registry by
    *model_type* instead of using string-based dynamic imports.
    """

    _registry: Dict[str, Type] = {}

    @classmethod
    def register(cls, *model_types: str):
        """Decorator that registers an image processor class for one or more model types."""

        def decorator(proc_cls):
            for model_type in model_types:
                existing = cls._registry.get(model_type)
                if existing is not None:
                    raise ValueError(
                        f"Image processor for '{model_type}' already registered "
                        f"as {existing.__name__}, "
                        f"cannot re-register as {proc_cls.__name__}"
                    )
                cls._registry[model_type] = proc_cls
            # Return the class unchanged so the decorator is transparent.
            return proc_cls

        return decorator

    @classmethod
    def get(cls, model_type: str) -> Type:
        """Look up the image processor class for a given *model_type*."""
        try:
            return cls._registry[model_type]
        except KeyError:
            raise ValueError(
                f"No image processor registered for '{model_type}'. " f"Available: {sorted(cls._registry.keys())}"
            ) from None
+143
View File
@@ -0,0 +1,143 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Per-model-type configuration for the unified MultiModalProcessor."""
from dataclasses import dataclass, field
from typing import Dict, Optional
# Canonical model_type identifiers, shared by MODEL_CONFIGS and the
# encoding/image-processor registries.
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"
@dataclass(frozen=True)
class MMModelConfig:
    """Frozen per-model-type feature flags and defaults consumed by
    the unified ``MultiModalProcessor``."""

    # Placeholder strings searched for in raw prompt text.
    image_placeholder: str
    video_placeholder: str
    tokenizer_type: str = "auto"  # "auto" | "ernie4_5"
    # Video frame-sampling defaults (overridable per request item).
    default_min_frames: int = 4
    default_max_frames: int = 768
    default_target_frames: int = -1
    default_fps: float = 2.0
    default_frames_sample: str = "leading"
    has_bad_words: bool = True
    has_tool_role: bool = False  # ernie: role_prefixes includes "tool"
    default_thinking: bool = False  # ernie: default enable_thinking=True
    force_disable_thinking: bool = False  # qwen_vl, qwen3_vl: force enable_thinking=False
    set_default_reasoning_max_tokens: bool = False  # ernie: auto-set reasoning_max_tokens
    cap_response_max_tokens: bool = False  # ernie: cap max_tokens by response_max_tokens
    has_logits_processor_think: bool = False  # ernie: _prepare_think_stop_sentence
    chat_template_pass_request: bool = False  # ernie: pass full request obj
    supports_prompt_token_ids: bool = False  # qwen3, ernie
    preserve_prompt_token_ids: bool = False  # qwen3, ernie: don't overwrite existing
    stop_tokens_variant: str = "default"  # "default" | "qwen3"
    # Token strings exposed to callers; may differ from the placeholders above.
    image_token_str: str = ""
    video_token_str: str = ""
    # Allowed mm_processor_kwargs keys -> expected value types (validated at init).
    expected_kwargs: Dict[str, type] = field(default_factory=dict)
    video_min_pixels: Optional[int] = None
    video_max_pixels: Optional[int] = None
    # ---- Conv params source ----
    conv_params_from_kwargs: bool = False  # ernie: from processor_kwargs; else: from image_processor
    # ---- tokens_per_second ----
    has_tokens_per_second: bool = True  # qwen-family: read from config; ernie: False
# mm_processor_kwargs accepted by the qwen-family (and paddleocr) processors,
# mapping each key to the type it is validated against.
_QWEN_KWARGS = {
    "video_max_frames": int,
    "video_min_frames": int,
}

# ernie4_5_vl additionally sources conv/pixel parameters from kwargs
# (see MMModelConfig.conv_params_from_kwargs).
_ERNIE_KWARGS = {
    "spatial_conv_size": int,
    "temporal_conv_size": int,
    "image_min_pixels": int,
    "image_max_pixels": int,
    "video_min_pixels": int,
    "video_max_pixels": int,
    "video_target_frames": int,
    "video_frames_sample": str,
    "video_max_frames": int,
    "video_min_frames": int,
    "video_fps": int,
}
# One MMModelConfig per supported VL model_type. MultiModalProcessor rejects
# any model_type not present here.
MODEL_CONFIGS: Dict[str, MMModelConfig] = {
    QWEN_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        expected_kwargs=_QWEN_KWARGS,
    ),
    QWEN3_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        stop_tokens_variant="qwen3",
        # Video pixel bounds forwarded to the image processor (28 = patch*merge).
        video_min_pixels=128 * 28 * 28,
        video_max_pixels=768 * 28 * 28,
        expected_kwargs=_QWEN_KWARGS,
    ),
    PADDLEOCR_VL: MMModelConfig(
        image_placeholder="<|IMAGE_PLACEHOLDER|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        video_token_str="<|video_pad|>",
        has_bad_words=False,
        default_fps=-1.0,  # -1 sentinel: no fixed sampling rate by default
        expected_kwargs=_QWEN_KWARGS,
    ),
    ERNIE4_5_VL: MMModelConfig(
        image_placeholder="<|image@placeholder|>",
        video_placeholder="<|video@placeholder|>",
        tokenizer_type="ernie4_5",
        default_min_frames=16,
        default_max_frames=180,
        default_fps=2.0,
        default_frames_sample="leading",
        has_tool_role=True,
        default_thinking=True,
        set_default_reasoning_max_tokens=True,
        cap_response_max_tokens=True,
        has_logits_processor_think=True,
        chat_template_pass_request=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        # NOTE(review): video_token_str equals the image token here — ernie
        # appears to reuse the image placeholder for video; confirm intended.
        video_token_str="<|IMAGE_PLACEHOLDER|>",
        conv_params_from_kwargs=True,
        has_tokens_per_second=False,
        expected_kwargs=_ERNIE_KWARGS,
    ),
}
+330 -252
View File
@@ -16,46 +16,25 @@
"""Unified multimodal processor for all VL model types.
Consolidates the four separate VL processor wrappers (QwenVLProcessor,
Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a
single class that dispatches per ``model_type``.
Consolidates the four separate VL processor wrappers and four separate
DataProcessor classes into a single class with pluggable Encoding strategies.
"""
import pickle
from collections.abc import Mapping
from typing import Any, Dict, Optional
import numpy as np
import zmq
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.base_processor import BaseTextProcessor
from fastdeploy.input.encodings import EncodingRegistry
from fastdeploy.input.image_processors import ImageProcessorRegistry
from fastdeploy.input.mm_model_config import MODEL_CONFIGS
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
from fastdeploy.utils import data_processor_logger
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"
_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL}
_QWEN_EXPECTED_KWARGS = {
"video_max_frames": int,
"video_min_frames": int,
}
_ERNIE_EXPECTED_KWARGS = {
"spatial_conv_size": int,
"temporal_conv_size": int,
"image_min_pixels": int,
"image_max_pixels": int,
"video_min_pixels": int,
"video_max_pixels": int,
"video_target_frames": int,
"video_frames_sample": str,
"video_max_frames": int,
"video_min_frames": int,
"video_fps": int,
}
_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}
_SAMPLING_EPS = 1e-5
@@ -64,8 +43,9 @@ _SAMPLING_EPS = 1e-5
class MultiModalProcessor(BaseTextProcessor):
"""Unified multimodal processor for all supported VL model types.
Dispatches image-processor creation, config initialisation, and
encoding logic based on ``model_type``.
Uses a composition pattern: model-type-specific encoding logic is
delegated to ``self.enc`` (an Encoding instance), while common logic
(tokenization loop, request processing, caching) lives here.
"""
def __init__(
@@ -79,19 +59,16 @@ class MultiModalProcessor(BaseTextProcessor):
tool_parser_obj=None,
enable_processor_cache: bool = False,
):
if model_type not in _SUPPORTED_MODEL_TYPES:
raise ValueError(
f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}."
)
if model_type not in MODEL_CONFIGS:
raise ValueError(f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(MODEL_CONFIGS)}.")
self.model_type = model_type
self.config = config
self.cfg = MODEL_CONFIGS[model_type]
self.enable_processor_cache = enable_processor_cache
tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto"
super().__init__(
model_name_or_path,
tokenizer_type=tokenizer_type,
tokenizer_type=self.cfg.tokenizer_type,
reasoning_parser_obj=reasoning_parser_obj,
tool_parser_obj=tool_parser_obj,
)
@@ -99,8 +76,13 @@ class MultiModalProcessor(BaseTextProcessor):
data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
self._init_mm_processor(processor_kwargs)
self._init_mm_config()
self._init_image_processor()
self._init_role_prefixes()
# Composition: create encoding strategy via registry
enc_cls = EncodingRegistry.get(self.model_type)
self.enc = enc_cls(self, processor_kwargs)
self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
def _load_tokenizer(self):
@@ -122,76 +104,30 @@ class MultiModalProcessor(BaseTextProcessor):
tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
return tokenizer
def _init_mm_processor(self, processor_kwargs: dict):
"""Create the model-type-specific internal DataProcessor."""
if self.model_type == QWEN_VL:
from fastdeploy.input.qwen_vl_processor.process import DataProcessor
def _init_image_processor(self):
"""Create the appropriate image processor."""
cls = ImageProcessorRegistry.get(self.model_type)
self.image_processor = cls.from_pretrained(self.model_name_or_path)
tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
self.processor = DataProcessor(
model_path=self.model_name_or_path,
enable_processor_cache=self.enable_processor_cache,
tokens_per_second=tokens_per_second,
tokenizer=self.tokenizer,
**processor_kwargs,
)
elif self.model_type == QWEN3_VL:
from fastdeploy.input.qwen3_vl_processor.process import DataProcessor
self.processor = DataProcessor(
model_path=self.model_name_or_path,
enable_processor_cache=self.enable_processor_cache,
tokenizer=self.tokenizer,
**processor_kwargs,
)
elif self.model_type == PADDLEOCR_VL:
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
self.processor = DataProcessor(
model_path=self.model_name_or_path,
enable_processor_cache=self.enable_processor_cache,
tokens_per_second=tokens_per_second,
tokenizer=self.tokenizer,
**processor_kwargs,
)
elif self.model_type == ERNIE4_5_VL:
from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor
self.processor = DataProcessor(
tokenizer_name=self.model_name_or_path,
image_preprocessor_name=self.model_name_or_path,
enable_processor_cache=self.enable_processor_cache,
**processor_kwargs,
)
self.processor.eval()
def _init_mm_config(self):
"""Set model-type-specific multimodal configuration attributes."""
if self.model_type in (QWEN_VL, QWEN3_VL):
self.image_patch_id = self.processor.image_token_id
elif self.model_type == PADDLEOCR_VL:
self.image_patch_id = self.processor.image_patch_id
elif self.model_type == ERNIE4_5_VL:
self.image_patch_id = self.processor.image_patch_id
self.spatial_conv_size = self.processor.spatial_conv_size
def _init_role_prefixes(self):
    """Build the role -> transcript-prefix map used when flattening messages.

    Roles absent from this map are rejected by message parsing.
    """
    prefixes = {}
    prefixes["system"] = ""
    prefixes["user"] = "User: "
    # "bot" and "assistant" are aliases for the model's own turns.
    prefixes["bot"] = "Assistant: "
    prefixes["assistant"] = "Assistant: "
    if self.cfg.has_tool_role:
        # ernie-family models additionally support a tool role.
        prefixes["tool"] = "Tool: "
    self.role_prefixes = prefixes
def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
"""Parse and validate multimodal processor kwargs."""
if not kwargs:
return {}
try:
if not isinstance(kwargs, dict):
raise ValueError("mm-processor-kwargs must be a dictionary")
data_processor_logger.info(f"Processing kwargs: {kwargs}")
if self.model_type == ERNIE4_5_VL:
expected_types = _ERNIE_EXPECTED_KWARGS
else:
expected_types = _QWEN_EXPECTED_KWARGS
expected_types = self.cfg.expected_kwargs
for key, value in kwargs.items():
if key in expected_types and not isinstance(value, expected_types[key]):
raise ValueError(
@@ -199,16 +135,13 @@ class MultiModalProcessor(BaseTextProcessor):
f"{expected_types[key].__name__}, got {type(value).__name__}"
)
return kwargs
except Exception as e:
data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
return {}
def _parse_limits(self, limits: Optional[dict]) -> dict:
"""Parse multimodal input limits, merging with defaults."""
if not limits:
return dict(_DEFAULT_MM_LIMITS)
try:
if not isinstance(limits, dict):
raise ValueError("limit-mm-per-prompt must be a dictionary")
@@ -219,7 +152,6 @@ class MultiModalProcessor(BaseTextProcessor):
return dict(_DEFAULT_MM_LIMITS)
def _check_mm_limits(self, item):
"""Validate multimodal inputs against configured limits."""
if isinstance(item, dict):
mm_data = item
else:
@@ -232,7 +164,6 @@ class MultiModalProcessor(BaseTextProcessor):
mm_data["image"].append(part)
elif part_type in ("video_url", "video"):
mm_data["video"].append(part)
for modality, data in mm_data.items():
if modality in self.limit_mm_per_prompt:
limit = self.limit_mm_per_prompt[modality]
@@ -240,86 +171,201 @@ class MultiModalProcessor(BaseTextProcessor):
raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
"""Return per-modality max token counts, if available."""
if self.model_type == ERNIE4_5_VL:
return self.processor.get_mm_max_tokens_per_item(seq_len)
return None
return self.enc.get_mm_max_tokens_per_item(seq_len)
    def _extract_mm_items(self, request):
        """Extract images/videos from request messages, handling processor cache.

        Walks every message in ``request["messages"]``, collects content parts
        whose ``type`` is ``image`` or ``video``, and resolves items that carry
        only a ``uuid`` (no inline ``data``) by fetching them from the external
        processor cache over a ZeroMQ DEALER socket.

        Returns:
            tuple: ``(images, videos, image_uuid, video_uuid, dealer,
            missing_idx, mm_items)``. ``dealer`` is the open ZMQ socket (or
            ``None`` when the processor cache is disabled); ``missing_idx``
            lists indices of items that were fetched from the cache.

        Raises:
            ValueError: on an unsupported role, a data-less item while the
                processor cache is disabled, a cache miss, or an unknown
                multimodal item type.
        """
        messages = parse_chat_messages(request.get("messages"))
        mm_items = []
        for msg in messages:
            role = msg.get("role")
            if role not in self.role_prefixes:
                raise ValueError(f"Unsupported role: {role}")
            # Normalise scalar/None content to a list of parts.
            content = msg.get("content")
            if not isinstance(content, list):
                content = [content]
            for item in content:
                if item.get("type") in ["image", "video"]:
                    mm_items.append(item)
        # Items with no inline payload must be recovered from the cache.
        missing_hashes, missing_idx = [], []
        for idx, item in enumerate(mm_items):
            if not item.get("data"):
                missing_hashes.append(item.get("uuid"))
                missing_idx.append(idx)
        if len(missing_hashes) > 0 and not self.enable_processor_cache:
            raise ValueError("Missing items cannot be retrieved without processor cache.")
        dealer = None
        if self.enable_processor_cache:
            # NOTE(review): the socket is handed back to the caller and not
            # closed here — caller appears to own its lifetime; confirm.
            context = zmq.Context()
            dealer = context.socket(zmq.DEALER)
            dealer.connect("ipc:///dev/shm/processor_cache.ipc")
            missing_items = self.get_processor_cache(dealer, missing_hashes)
            for idx in range(len(missing_items)):
                if not missing_items[idx]:
                    raise ValueError(f"Missing item {idx} not found in processor cache")
                mm_items[missing_idx[idx]]["data"] = missing_items[idx]
        # Split resolved items by modality, keeping uuids aligned with data.
        images, videos = [], []
        image_uuid, video_uuid = [], []
        for item in mm_items:
            if item.get("type") == "image":
                images.append(item["data"])
                image_uuid.append(item["uuid"])
            elif item.get("type") == "video":
                videos.append(item["data"])
                video_uuid.append(item["uuid"])
            else:
                raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
        return images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items
    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
        """Convert text with image/video placeholders into model inputs.

        Scans *text* left-to-right for the configured image/video placeholder
        strings. Plain-text segments are tokenised via ``self._add_text``;
        each placeholder consumes the next entry from *images*/*videos* and is
        encoded by the encoding strategy ``self.enc``.

        Returns:
            dict: the outputs structure created by ``self.enc._make_outputs()``
            with input ids, token type ids, positions and media accumulated.
        """
        outputs = self.enc._make_outputs()
        IMAGE_PLACEHOLDER = self.cfg.image_placeholder
        VIDEO_PLACEHOLDER = self.cfg.video_placeholder
        IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
        VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
        st, image_idx, video_idx = 0, 0, 0
        while st < len(text):
            # Find the next placeholder of either kind; len(text) acts as an
            # "absent" sentinel so min() picks whichever occurs first.
            image_pos = text.find(IMAGE_PLACEHOLDER, st)
            image_pos = len(text) if image_pos == -1 else image_pos
            video_pos = text.find(VIDEO_PLACEHOLDER, st)
            video_pos = len(text) if video_pos == -1 else video_pos
            ed = min(image_pos, video_pos)
            # Tokenise the plain-text run before the placeholder (may be "").
            self._add_text(text[st:ed], outputs)
            if ed == len(text):
                break
            if ed == image_pos:
                image = images[image_idx]
                uuid = image_uuid[image_idx] if image_uuid else None
                # NOTE(review): a tuple appears to mark an already-processed
                # item (e.g. retrieved from the processor cache) — confirm.
                if not isinstance(image, tuple):
                    self.enc.add_image(image, outputs, uuid)
                else:
                    self.enc.add_processed_image(image, outputs, uuid)
                image_idx += 1
                st = ed + IMAGE_PLACEHOLDER_LEN
            else:
                item = videos[video_idx]
                uuid = video_uuid[video_idx] if video_uuid else None
                if not isinstance(item, tuple):
                    # Raw video: a dict carries the payload under "video" plus
                    # per-item options; anything else is passed to load_video
                    # with empty options.
                    if isinstance(item, dict):
                        frames, meta = self.enc.load_video(item["video"], item)
                    else:
                        frames, meta = self.enc.load_video(item, {})
                    self.enc.add_video(frames, outputs, uuid, meta=meta)
                else:
                    self.enc.add_processed_video(item, outputs, uuid)
                video_idx += 1
                st = ed + VIDEO_PLACEHOLDER_LEN
        return outputs
    def request2ids(self, request):
        """Convert a chat request with multimodal messages into model inputs.

        Extracts image/video items (resolving missing ones via the processor
        cache), renders the chat template into a prompt string, and delegates
        placeholder-aware tokenisation to :meth:`text2ids`. Newly processed
        items are written back to the cache when it is enabled.

        Raises:
            ValueError: if the tokenizer has no chat template.
        """
        images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self._extract_mm_items(request)
        if self.tokenizer.chat_template is None:
            raise ValueError("This model does not support chat template.")
        chat_template_kwargs = request.get("chat_template_kwargs", {})
        if self.cfg.chat_template_pass_request:
            # ernie: pass full request to apply_chat_template
            prompt = self.tokenizer.apply_chat_template(
                request,
                tokenize=False,
                add_generation_prompt=request.get("add_generation_prompt", True),
                **chat_template_kwargs,
            )
        else:
            # Other models template only the parsed message list.
            messages = parse_chat_messages(request.get("messages"))
            prompt = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=request.get("add_generation_prompt", True),
                **chat_template_kwargs,
            )
        # NOTE(review): the rendered prompt is stored back on the request —
        # presumably consumed downstream; confirm against callers.
        request["prompt_tokens"] = prompt
        outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
        if self.enable_processor_cache:
            self._update_mm_cache(dealer, missing_idx, mm_items, outputs)
        return outputs
def _process_prompt_token_ids(self, request):
"""Handle the prompt_token_ids tokenisation path.
Mirrors ``request2ids`` in structure: Processor owns extract/cache,
Encoding only does pure encoding.
"""
prompt_token_ids = request.get("prompt_token_ids", [])
if not request.get("messages"):
return self.enc.prompt_token_ids2outputs(prompt_token_ids)
images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self._extract_mm_items(request)
outputs = self.enc.prompt_token_ids2outputs(prompt_token_ids, mm_items)
if self.enable_processor_cache:
self._update_mm_cache(dealer, missing_idx, mm_items, outputs)
return outputs
    def _update_mm_cache(self, dealer, missing_idx, mm_items, outputs):
        """Write newly-processed multimodal items to the processor cache.

        Items whose index appears in *missing_idx* were themselves fetched
        from the cache, so only the remaining (freshly processed) items are
        written back, keyed by ``outputs["mm_hashes"]``.

        NOTE(review): assumes ``outputs["grid_thw"]``, ``outputs["images"]``
        and ``outputs["mm_hashes"]`` are index-aligned with *mm_items* —
        confirm against the encoding strategies.

        Returns:
            The *outputs* dict, unchanged (both visible callers ignore it).
        """
        missing_idx_set = set(missing_idx)
        hashes_to_cache, items_to_cache = [], []
        for idx in range(len(mm_items)):
            if idx in missing_idx_set:
                continue
            meta = {}
            grid_thw = np.asarray(outputs["grid_thw"][idx])
            # grid_thw may be (1, 3) or (3,); either way take one (t, h, w).
            if grid_thw.ndim > 1:
                t, h, w = grid_thw[0]
            else:
                t, h, w = grid_thw
            meta["thw"] = (int(t), int(h), int(w))
            if "fps" in outputs:
                meta["fps"] = outputs["fps"][idx]
            hashes_to_cache.append(outputs["mm_hashes"][idx])
            items_to_cache.append((outputs["images"][idx], meta))
        if hashes_to_cache:
            self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
        return outputs
def _add_text(self, tokens, outputs):
"""Add text tokens to outputs, delegating position logic to enc."""
if not tokens:
return
if isinstance(tokens, str):
tokens_str = self.tokenizer.tokenize(tokens)
tokens = self.tokenizer.convert_tokens_to_ids(tokens_str)
num_tokens = len(tokens)
outputs["input_ids"].extend(tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
self.enc.add_text_positions(outputs, num_tokens)
def process_request_dict(self, request, max_model_len=None):
"""Process a request dictionary into model inputs.
Unified template-method flow for all VL model types. Per-model
differences are handled by small conditional branches rather than
duplicating the entire pipeline.
"""
"""Process a request dictionary into model inputs."""
cfg = self.cfg
request = self._apply_default_parameters(request)
if not request.get("eos_token_ids"):
request["eos_token_ids"] = self.eos_token_ids
self._process_stop_tokens(request)
if self.model_type != PADDLEOCR_VL:
self._process_bad_words(request)
if self.model_type == ERNIE4_5_VL:
logits_processors_args = self._prepare_think_stop_sentence(
request.get("logits_processors_args") or {}, max_model_len
)
request["logits_processors_args"] = logits_processors_args
outputs = self._tokenize_request(request)
self._process_post_tokens(request, outputs)
if self.model_type in (QWEN_VL, QWEN3_VL):
request["enable_thinking"] = False
outputs = self.pack_outputs(outputs)
if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"):
pass # preserve existing prompt_token_ids
else:
request["prompt_token_ids"] = outputs["input_ids"].tolist()
request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
request["multimodal_inputs"] = outputs
if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
if self.model_type == ERNIE4_5_VL:
logits_processors_args = self._update_thinking_prompt_state(
request["prompt_token_ids"], request.get("logits_processors_args") or {}
)
request["logits_processors_args"] = logits_processors_args
max_tokens = max_model_len - len(request["prompt_token_ids"])
if request.get("max_tokens") is None:
request["max_tokens"] = max(1, max_tokens)
else:
request["max_tokens"] = min(max_tokens, request["max_tokens"])
if self.model_type == ERNIE4_5_VL and request.get("reasoning_max_tokens") is None:
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL):
if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
request["top_p"] = _SAMPLING_EPS
request["top_k"] = 1
if self.model_type != QWEN3_VL and self.reasoning_parser:
self._apply_reasoning_parser(request)
if self.model_type == ERNIE4_5_VL:
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
data_processor_logger.info(f"Processed request {request}")
return request
def _process_stop_tokens(self, request):
"""Handle stop token processing based on model type."""
if self.model_type == QWEN3_VL:
# Stop tokens
if cfg.stop_tokens_variant == "qwen3":
stop_sequences = request.get("stop", [])
if stop_sequences:
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
@@ -328,34 +374,102 @@ class MultiModalProcessor(BaseTextProcessor):
else:
process_stop_token_ids(request, self.update_stop_seq)
def _process_bad_words(self, request):
"""Process bad_words into token ids."""
bad_words = request.get("bad_words")
bad_words_token_ids = request.get("bad_words_token_ids")
if bad_words:
bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
request["bad_words_token_ids"] = bad_words_token_ids
# Bad words
if cfg.has_bad_words:
bad_words = request.get("bad_words")
bad_words_token_ids = request.get("bad_words_token_ids")
if bad_words:
bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
request["bad_words_token_ids"] = bad_words_token_ids
# Logits processor (ernie think)
if cfg.has_logits_processor_think:
logits_processors_args = self._prepare_think_stop_sentence(
request.get("logits_processors_args") or {}, max_model_len
)
request["logits_processors_args"] = logits_processors_args
# Tokenize
outputs = self._tokenize_request(request)
# Post-token handling
self._process_post_tokens(request, outputs)
# Force disable thinking for qwen_vl / qwen3_vl
if cfg.force_disable_thinking:
request["enable_thinking"] = False
# Pack outputs
outputs = self.pack_outputs(outputs)
# Assign prompt_token_ids
if cfg.preserve_prompt_token_ids and request.get("prompt_token_ids"):
pass # preserve existing
else:
request["prompt_token_ids"] = outputs["input_ids"].tolist()
request["multimodal_inputs"] = outputs
# Truncation
if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
# Ernie: update thinking prompt state
if cfg.has_logits_processor_think:
logits_processors_args = self._update_thinking_prompt_state(
request["prompt_token_ids"],
request.get("logits_processors_args") or {},
)
request["logits_processors_args"] = logits_processors_args
# max_tokens
max_tokens = max_model_len - len(request["prompt_token_ids"])
if request.get("max_tokens") is None:
request["max_tokens"] = max(1, max_tokens)
else:
request["max_tokens"] = min(max_tokens, request["max_tokens"])
# Ernie: default reasoning_max_tokens
if cfg.set_default_reasoning_max_tokens and request.get("reasoning_max_tokens") is None:
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
# Clamp top_p
if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
request["top_p"] = _SAMPLING_EPS
request["top_k"] = 1
# Reasoning parser
if self.reasoning_parser:
self._apply_reasoning_parser(request)
# Ernie: cap response_max_tokens
if cfg.cap_response_max_tokens:
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
data_processor_logger.info(f"Processed request {request}")
return request
def _tokenize_request(self, request):
"""Core tokenization dispatch: prompt_token_ids > prompt > messages."""
default_thinking = True if self.model_type == ERNIE4_5_VL else False
cfg = self.cfg
default_thinking = cfg.default_thinking
if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL):
if request.get("prompt_token_ids") and cfg.supports_prompt_token_ids:
messages = request.get("messages")
if messages:
self._check_mm_limits(messages)
request.setdefault("enable_thinking", default_thinking)
return self.processor.prompt_token_ids2outputs(request)
return self._process_prompt_token_ids(request)
elif request.get("prompt"):
multimodal_data = request.get("multimodal_data") or {}
self._check_mm_limits(multimodal_data)
images = multimodal_data.get("image", None)
videos = multimodal_data.get("video", None)
if self.model_type == ERNIE4_5_VL:
request["prompt_tokens"] = request.get("prompt")
request["prompt_tokens"] = request.get("prompt")
request.setdefault("enable_thinking", default_thinking)
return self.processor.text2ids(request["prompt"], images, videos)
return self.text2ids(request["prompt"], images, videos)
elif request.get("messages"):
messages = request["messages"]
@@ -369,65 +483,22 @@ class MultiModalProcessor(BaseTextProcessor):
else:
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
request.setdefault("enable_thinking", default_thinking)
return self.processor.request2ids(request)
return self.request2ids(request)
else:
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
def _process_post_tokens(self, request, outputs):
"""Handle post-tokenization token appending."""
if self.model_type == PADDLEOCR_VL:
metadata = request.get("metadata")
if metadata and metadata.get("generated_token_ids"):
self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"])
else:
if request.get("completion_token_ids"):
self.append_completion_tokens(outputs, request["completion_token_ids"])
def _apply_reasoning_parser(self, request):
"""Apply reasoning parser and update model status dict."""
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
parts = request["request_id"].split("_")
if len(parts) > 1:
real_req_id = parts[0]
index = int(parts[1])
n = request.get("n", 1)
for idx in range(index * n, (index + 1) * n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
else:
self.model_status_dict[request["request_id"]] = model_status
request["enable_thinking"] = model_status == "think_start"
completion_token_ids = request.get("completion_token_ids") or request.get("generated_token_ids")
if completion_token_ids:
self.enc.append_completion_tokens(outputs, completion_token_ids)
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
"""Append completion tokens to existing multimodal outputs."""
if self.model_type == ERNIE4_5_VL:
self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids)
else:
self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids)
def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids):
"""Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl."""
num_tokens = len(completion_token_ids)
multimodal_inputs["input_ids"].extend(completion_token_ids)
multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
multimodal_inputs["position_ids"].append(pos_ids)
multimodal_inputs["cur_position"] += num_tokens
def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids):
"""Append completion tokens for ernie4_5_vl."""
num_tokens = len(completion_token_ids)
multimodal_inputs["input_ids"].extend(completion_token_ids)
multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
start = multimodal_inputs["cur_position"]
for i in range(num_tokens):
multimodal_inputs["position_ids"].append([start + i] * 3)
multimodal_inputs["cur_position"] += num_tokens
"""Append completion tokens — delegates to enc."""
self.enc.append_completion_tokens(multimodal_inputs, completion_token_ids)
def pack_outputs(self, outputs):
"""Convert intermediate processing outputs to final format."""
"""Convert intermediate outputs to final packed format."""
if not outputs["images"]:
outputs["images"] = None
outputs["grid_thw"] = None
@@ -439,15 +510,22 @@ class MultiModalProcessor(BaseTextProcessor):
outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
outputs["mm_num_token_func"] = self.processor.mm_num_tokens
outputs["mm_num_token_func"] = self.enc.mm_num_tokens
if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
outputs["image_patch_id"] = self.processor.image_token_id
outputs["video_patch_id"] = self.processor.video_token_id
outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
else:
outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
outputs["image_patch_id"] = self.image_patch_id
# Position IDs: delegate to encoding strategy
self.enc.pack_position_ids(outputs)
return outputs
    def get_processor_cache(self, socket, mm_hashes):
        """Fetch cached multimodal items for *mm_hashes* over a ZMQ socket.

        Sends a pickled request and blocks until the cache service replies.
        NOTE(review): ``pickle.loads`` on socket data is only acceptable
        because the peer is a local, trusted IPC endpoint — never reuse this
        with an untrusted transport.
        """
        req = pickle.dumps(mm_hashes)
        # Leading empty frame matches the framing the cache service expects
        # on its end of the DEALER socket — confirm against the server side.
        socket.send_multipart([b"", req])
        _, resp = socket.recv_multipart()
        mm_items = pickle.loads(resp)
        data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
        return mm_items
def update_processor_cache(self, socket, mm_hashes, mm_items):
req = pickle.dumps((mm_hashes, mm_items))
socket.send_multipart([b"", req])
data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
from fastdeploy.input.utils.video import read_video_decord
from fastdeploy.input.utils.video import sample_frames_paddleocr as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
+2 -2
View File
@@ -94,13 +94,13 @@ class InputPreprocessor:
tool_parser_obj=tool_parser_obj,
)
else:
from fastdeploy.input.multimodal_processor import (
from fastdeploy.input.mm_model_config import (
ERNIE4_5_VL,
PADDLEOCR_VL,
QWEN3_VL,
QWEN_VL,
MultiModalProcessor,
)
from fastdeploy.input.multimodal_processor import MultiModalProcessor
if ErnieArchitectures.contains_ernie_arch(architecture):
model_type = ERNIE4_5_VL
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
from fastdeploy.input.utils.video import read_video_decord
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
from fastdeploy.input.utils.video import read_video_decord
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
Binary file not shown.
+41
View File
@@ -0,0 +1,41 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility package for fastdeploy.input — re-exports from sub-modules."""
from fastdeploy.input.utils.common import (
IDS_TYPE_FLAG,
MAX_IMAGE_DIMENSION,
process_stop_token_ids,
validate_model_path,
)
from fastdeploy.input.utils.video import (
VideoReaderWrapper,
read_video_decord,
sample_frames,
sample_frames_paddleocr,
sample_frames_qwen,
)
__all__ = [
"IDS_TYPE_FLAG",
"MAX_IMAGE_DIMENSION",
"process_stop_token_ids",
"validate_model_path",
"VideoReaderWrapper",
"read_video_decord",
"sample_frames",
"sample_frames_paddleocr",
"sample_frames_qwen",
]
@@ -0,0 +1,94 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Render timestamps onto video frames."""
import os
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
def render_single_image_with_timestamp(image: Image, number: str, rate: float, font_path: str = FONT_PATH):
    """Draw *number* (a timestamp string) onto *image* in place.

    The font size is ``min(width, height) * rate``; the text is drawn at the
    top-left corner in black with a white outline whose width is 10% of the
    font size. Returns the same (mutated) image.
    """
    w, h = image.size
    size = int(min(w, h) * rate)
    stroke = int(size * 0.1)
    pen = ImageDraw.Draw(image)
    pen.text(
        (0, 0),
        number,
        font=ImageFont.truetype(font_path, size),
        fill=(0, 0, 0),
        stroke_width=stroke,
        stroke_fill=(255, 255, 255),
    )
    return image
def timestamp_converting(time_stamp_in_seconds):
    """Convert a timestamp from seconds to ``HH:MM:SS.ss`` format.

    Args:
        time_stamp_in_seconds (int | float): non-negative time in seconds.

    Returns:
        str: ``HH:MM:SS.ss`` with two-digit hours/minutes (hours are not
        capped at 24) and seconds with two decimal places.
    """
    # divmod replaces the original repeated-subtraction loops: O(1) instead
    # of O(hours + minutes), and no accumulated float error on large inputs.
    hours, remainder = divmod(time_stamp_in_seconds, 3600)
    mins, secs = divmod(remainder, 60)
    return f"{int(hours):02d}:{int(mins):02d}:{secs:05.02f}"
def get_timestamp_for_uniform_frame_extraction(num_frames, frame_id, duration):
    """Return the timestamp (in seconds) of frame *frame_id* when
    *num_frames* frames are extracted uniformly from a video of length
    *duration* seconds."""
    return frame_id * duration / num_frames
def render_frame_timestamp(frame, timestamp, font_rate=0.1):
    """Overlay a human-readable timestamp on a video frame.

    Args:
        frame: the video frame (PIL.Image).
        timestamp: time in seconds.
        font_rate: font size as a fraction of ``min(width, height)``.

    Returns:
        The frame with ``time: HH:MM:SS.ss`` rendered onto it.
    """
    label = "time: " + timestamp_converting(timestamp)
    return render_single_image_with_timestamp(frame, label, font_rate)
+470
View File
@@ -0,0 +1,470 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared video utilities: VideoReaderWrapper, read_video_decord, sample_frames, read_frames_decord."""
import datetime
import hashlib
import io
import math
import os
import random
import threading
import uuid
from tempfile import NamedTemporaryFile as ntf
from typing import Optional, Union
import numpy as np
from PIL import Image
from fastdeploy.input.image_processors.common import ceil_by_factor, floor_by_factor
from fastdeploy.utils import data_processor_logger
__all__ = [
"VideoReaderWrapper",
"read_video_decord",
"sample_frames",
"sample_frames_qwen",
"sample_frames_paddleocr",
"get_frame_indices",
"read_frames_decord",
"EXTRACTED_FRAME_DIR",
"get_filename",
]
# ---------------------------------------------------------------------------
# VideoReaderWrapper
# ---------------------------------------------------------------------------
def _is_gif(data: bytes) -> bool:
"""Check if bytes represent a GIF based on magic header."""
return data[:6] in (b"GIF87a", b"GIF89a")
class VideoReaderWrapper:
    """decord.VideoReader wrapper that fixes a memory leak and adds GIF support.

    GIF inputs (path, bytes, or BytesIO) are transcoded to a temporary MP4 via
    moviepy before being handed to decord. After every read the reader seeks
    back to frame 0, which works around a decord memory leak.
    Reference: https://github.com/dmlc/decord/issues/208
    """

    def __init__(self, video_path, *args, **kwargs):
        # Lazy imports: decord/moviepy are only needed when a reader is
        # actually constructed — presumably to keep them optional; confirm.
        import decord

        try:
            # moviepy 1.0
            import moviepy.editor as mp
        except Exception:
            # moviepy 2.0
            import moviepy as mp

        # The temp .gif file lives only for the duration of this block; the
        # transcoded .mp4 (if any) outlives it and is removed in __del__.
        with ntf(delete=True, suffix=".gif") as gif_file:
            gif_input = None
            self.original_file = None  # only set when we create a temp file
            # Detect GIF input in any of the three accepted forms.
            if isinstance(video_path, str):
                if video_path.lower().endswith(".gif"):
                    gif_input = video_path
            elif isinstance(video_path, bytes):
                if _is_gif(video_path):
                    gif_file.write(video_path)
                    gif_file.flush()
                    gif_input = gif_file.name
            elif isinstance(video_path, io.BytesIO):
                video_path.seek(0)
                tmp_bytes = video_path.read()
                video_path.seek(0)
                if _is_gif(tmp_bytes):
                    gif_file.write(tmp_bytes)
                    gif_file.flush()
                    gif_input = gif_file.name
            if gif_input is not None:
                # Transcode GIF -> MP4 so decord can read it.
                clip = mp.VideoFileClip(gif_input)
                mp4_file = ntf(delete=False, suffix=".mp4")
                mp4_path = mp4_file.name
                mp4_file.close()  # close before moviepy writes
                clip.write_videofile(mp4_path, verbose=False, logger=None)
                clip.close()
                video_path = mp4_path
                self.original_file = video_path  # temp mp4, cleaned up in __del__
            self._reader = decord.VideoReader(video_path, *args, **kwargs)
            self._reader.seek(0)

    def __len__(self):
        # Number of frames in the video.
        return len(self._reader)

    def __getitem__(self, key):
        frames = self._reader[key]
        # Seek back to the start after each read — the decord leak workaround.
        self._reader.seek(0)
        return frames

    def get_avg_fps(self):
        """Average frames-per-second as reported by decord."""
        return self._reader.get_avg_fps()

    def seek(self, pos):
        """Seek the underlying decord reader to frame *pos*."""
        return self._reader.seek(pos)

    def __del__(self):
        # Best-effort removal of the temporary mp4, if we created one.
        # getattr guards against __init__ having failed before the attribute
        # was set.
        original_file = getattr(self, "original_file", None)
        if original_file:
            try:
                os.remove(original_file)
            except OSError:
                pass
# ---------------------------------------------------------------------------
# read_video_decord
# ---------------------------------------------------------------------------
def read_video_decord(video_path, save_to_disk: bool = False):
    """Open *video_path* and return ``(video_reader, video_meta, video_path)``.

    ``video_meta`` carries the keys ``"fps"``, ``"duration"`` (seconds) and
    ``"num_of_frame"``. Accepts an already-constructed VideoReaderWrapper,
    raw bytes, or anything decord can open (path / file-like object).
    """
    if isinstance(video_path, VideoReaderWrapper):
        reader = video_path
    else:
        if isinstance(video_path, bytes):
            video_path = io.BytesIO(video_path)
        reader = VideoReaderWrapper(video_path, num_threads=1)
    frame_count = len(reader)
    avg_fps = reader.get_avg_fps()
    meta = {
        "fps": avg_fps,
        "duration": frame_count / float(avg_fps),
        "num_of_frame": frame_count,
    }
    return reader, meta, video_path
# ---------------------------------------------------------------------------
# sample_frames — qwen_vl variant
# ---------------------------------------------------------------------------
def sample_frames_qwen(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
) -> np.ndarray:
    """Sample frame indices — qwen_vl variant.

    Sentinel defaults are -1; ``None`` is also accepted and treated as
    "unset" (matching :func:`sample_frames_paddleocr`). Applies
    ``ceil_by_factor`` on ``min_frames`` and ensures ``num_frames`` is
    divisible by 4.

    Args:
        frame_factor: the sampled frame count is snapped to a multiple of this.
        min_frames: lower bound on the number of sampled frames.
        max_frames: upper bound on the number of sampled frames.
        metadata: required; must contain ``"num_of_frame"`` and ``"fps"``.
        fps: desired sampling rate; mutually exclusive with ``num_frames``.
        num_frames: desired frame count; mutually exclusive with ``fps``.

    Returns:
        np.ndarray: int32 frame indices into the source video.

    Raises:
        ValueError: when both ``fps`` and ``num_frames`` are set, when
            ``metadata`` is missing, or when the inferred frame count exceeds
            the video length.
    """
    # Fix: the signature advertises Optional, but `None > 0` raises TypeError
    # on Python 3. Normalise None sentinels to -1 so callers may pass either,
    # consistent with sample_frames_paddleocr's `fps or 0` handling.
    fps = -1 if fps is None else fps
    num_frames = -1 if num_frames is None else num_frames
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    if metadata is None:
        raise ValueError("metadata is required for sample_frames_qwen")
    total_num_frames = metadata["num_of_frame"]
    if num_frames > 0:
        # Snap the requested count to the nearest multiple of frame_factor.
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
        num_frames = total_num_frames / metadata["fps"] * fps
        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)
    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )
    # num_frames must be divisible by 4
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
    if num_frames > 0:
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # No constraint given: keep every frame.
        indices = np.arange(0, total_num_frames).astype(np.int32)
    return indices
# ---------------------------------------------------------------------------
# sample_frames — paddleocr_vl / ernie4_5_vl variant
# ---------------------------------------------------------------------------
def sample_frames_paddleocr(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
) -> np.ndarray:
    """Sample frame indices — paddleocr_vl / ernie4_5_vl variant.

    Sentinel defaults are ``None``. Uses plain ``math.floor`` snapping and,
    unlike the qwen variant, applies no divisible-by-4 correction.
    """
    requested_fps = fps or 0
    requested_frames = num_frames or 0
    if requested_fps > 0 and requested_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    if metadata is None:
        raise ValueError("metadata is required for sample_frames_paddleocr")
    total = metadata["num_of_frame"]
    if requested_frames > 0:
        # Snap the requested count to the nearest multiple of frame_factor.
        requested_frames = round(requested_frames / frame_factor) * frame_factor
    elif requested_fps > 0:
        # Derive a frame count from the requested sampling rate, clamped to
        # [min_frames, max_frames] and the video length.
        max_frames = math.floor(min(max_frames, total) / frame_factor) * frame_factor
        requested_frames = total / metadata["fps"] * requested_fps
        requested_frames = min(max(requested_frames, min_frames), max_frames, total)
        requested_frames = math.floor(requested_frames / frame_factor) * frame_factor
    if requested_frames > total:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={requested_frames}` exceeds "
            f"`total_num_frames={total}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )
    if requested_frames > 0:
        return np.arange(0, total, total / requested_frames).astype(np.int32)
    # No constraint given: keep every frame.
    return np.arange(0, total).astype(np.int32)
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
    variant: str = "paddleocr",
) -> np.ndarray:
    """Dispatch frame-index sampling to the qwen or paddleocr implementation."""
    if variant == "paddleocr":
        return sample_frames_paddleocr(frame_factor, min_frames, max_frames, metadata, fps, num_frames)
    if variant == "qwen":
        # The qwen variant uses -1 as its "unset" sentinel instead of None.
        qwen_fps = fps if fps is not None else -1
        qwen_num_frames = num_frames if num_frames is not None else -1
        return sample_frames_qwen(frame_factor, min_frames, max_frames, metadata, qwen_fps, qwen_num_frames)
    raise ValueError(f"Unknown variant {variant!r}. Expected 'paddleocr' or 'qwen'.")
# ---------------------------------------------------------------------------
# IO helpers (migrated from ernie4_5_vl_processor/utils/io_utils.py)
# ---------------------------------------------------------------------------
EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
def get_filename(url=None):
    """Generate a unique filename, optionally based on a URL hash.

    With no ``url`` a random 32-character hex string (UUID4) is returned.
    Otherwise the name combines today's date, the process id, the thread id
    and the MD5 digest of ``url``, so repeated calls for the same URL in the
    same process/thread on the same day produce the same name.
    """
    if url is None:
        return uuid.uuid4().hex
    raw = url if isinstance(url, bytes) else url.encode("utf-8")
    # MD5 is used only as a cheap, stable fingerprint of the URL, not for security.
    digest = hashlib.md5(raw).hexdigest()
    today = datetime.datetime.now()
    return f"{today.year}-{today.month:02d}-{today.day:02d}-{os.getpid()}-{threading.get_ident()}-{digest}"
# ---------------------------------------------------------------------------
# get_frame_indices / read_frames_decord
# (migrated from ernie4_5_vl_processor/process_video.py)
# ---------------------------------------------------------------------------
def get_frame_indices(
    vlen,
    target_frames=-1,
    target_fps=-1,
    frames_sample="middle",
    fix_start=None,
    input_fps=-1,
):
    """Get frame indices for sampling from a video.

    Exactly one of ``target_frames`` (fixed count) or ``target_fps`` (fixed
    rate, requires ``input_fps``) must be positive. ``frames_sample`` picks
    where inside each interval the frame is taken: "rand", "middle" or
    "leading"; ``fix_start`` overrides it with a fixed per-interval offset.
    """
    assert frames_sample in ["rand", "middle", "leading"]
    if target_frames > 0:
        assert target_fps <= 0, "target_fps must be negative if target_frames is given."
        if target_frames > vlen:
            acc_samples = vlen
            data_processor_logger.info(
                f"target_frames={target_frames} is larger than video length {vlen}, "
                f"will sample {acc_samples} frames."
            )
        else:
            acc_samples = target_frames
        data_processor_logger.debug(f"sampling at target_frames={target_frames}, frames_sample={frames_sample}")
        # Split [0, vlen) into acc_samples contiguous, near-equal inclusive intervals.
        bounds = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = [(lo, hi - 1) for lo, hi in zip(bounds[:-1], bounds[1:])]
        if frames_sample == "rand":
            try:
                frame_indices = [random.choice(range(lo, hi)) for lo, hi in ranges]
            except Exception:
                # An interval can be empty when acc_samples is close to vlen;
                # fall back to a sorted random permutation of frame indices.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [lo + fix_start for lo, _ in ranges]
        elif frames_sample == "leading":
            frame_indices = [lo for lo, _ in ranges]
        elif frames_sample == "middle":
            frame_indices = [(lo + hi) // 2 for lo, hi in ranges]
        else:
            raise NotImplementedError
    elif target_fps > 0:
        assert target_frames <= 0, "target_frames must be negative if target_fps is given."
        assert input_fps > 0, "input_fps must be provided if target_fps is given."
        data_processor_logger.info(f"sampling at fps={target_fps}, frames_sample={frames_sample}")
        duration = float(vlen) / input_fps
        delta = 1 / target_fps
        if frames_sample == "middle":
            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        elif frames_sample == "leading":
            frame_seconds = np.arange(0, duration, delta)
        if frames_sample == "rand":
            # Jitter each midpoint by up to +/- half an interval.
            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
            rand_offset = np.random.rand(*(frame_seconds.shape)) - 0.5
            frame_seconds += rand_offset * delta
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
    else:
        raise ValueError("Must provide either positive target_fps or positive target_frames.")
    return frame_indices
def read_frames_decord(
    video_path,
    video_reader,
    video_meta,
    target_frames=-1,
    target_fps=-1,
    frames_sample="middle",
    fix_start=None,
    save_to_disk=False,
    cache_dir=None,
    frame_indices=None,
    tol=10,
):
    """Read frames from a video using decord, with retry logic for corrupt frames.

    Args:
        video_path: Source video path. NOTE(review): not referenced in the
            body — presumably kept for API compatibility; confirm with callers.
        video_reader: Indexable decord-style reader whose frames expose
            ``.asnumpy()``.
        video_meta: Dict with at least ``num_of_frame``, ``fps`` and ``duration``.
        target_frames: Frame count to sample (only used when ``frame_indices``
            is None; mutually exclusive with ``target_fps``).
        target_fps: Sampling rate in frames/second (see ``get_frame_indices``).
        frames_sample: One of "rand", "middle", "leading".
        fix_start: Fixed offset within each sampling interval.
        save_to_disk: If True, write each frame as a PNG under ``cache_dir``
            and return file paths instead of PIL images.
        cache_dir: Directory for saved frames; defaults to EXTRACTED_FRAME_DIR.
        frame_indices: Pre-computed indices; skips get_frame_indices when given.
        tol: Max neighbor distance to try on each side when a frame fails to
            decode (doubled for the first/last frame).

    Returns:
        Tuple of ``(frames, frame_indices, time_stamps)``: frames is a list of
        PIL images (or file paths when ``save_to_disk``); ``frame_indices``
        may have been patched in place to the substituted neighbor indices;
        ``time_stamps`` are frame times in seconds derived from ``video_meta``.
    """
    if cache_dir is None:
        cache_dir = EXTRACTED_FRAME_DIR
    if frame_indices is None:
        frame_indices = get_frame_indices(
            video_meta["num_of_frame"],
            target_frames=target_frames,
            target_fps=target_fps,
            frames_sample=frames_sample,
            fix_start=fix_start,
            input_fps=video_meta["fps"],
        )
    frames = []
    for frame_indice_index in range(0, len(frame_indices)):
        frame_indice = frame_indices[frame_indice_index]
        try:
            frames.append(video_reader[frame_indice].asnumpy())
        except Exception as e:
            # Decoding failed: search alternately backwards/forwards for the
            # nearest readable neighbor frame and substitute it.
            data_processor_logger.debug(f"encounter error when get frame: {frame_indice}, error: {e}")
            previous_counter = 1
            later_counter = 1
            previous_after_flag = True  # True: try an earlier frame next; False: a later one
            if frame_indice == 0 or frame_indice == len(video_reader) - 1:
                # Edge frames can only be replaced from one side, so allow
                # twice the normal search distance.
                cur_tol = tol * 2
            else:
                cur_tol = tol
            while previous_counter < cur_tol or later_counter < cur_tol:
                if previous_after_flag:
                    if frame_indice - previous_counter < 0:
                        # Ran off the start of the video; switch direction.
                        previous_counter += 1
                        previous_after_flag = not previous_after_flag
                        continue
                    try:
                        frames.append(video_reader[frame_indice - previous_counter].asnumpy())
                        data_processor_logger.info(
                            f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame"
                        )
                        # Record the index actually used so time_stamps stay consistent.
                        frame_indices[frame_indice_index] = frame_indice - previous_counter
                        break
                    except Exception as e:
                        previous_counter += 1
                        data_processor_logger.info(f"error: {e}")
                else:
                    if frame_indice + later_counter >= len(video_reader):
                        # Ran off the end of the video; switch direction.
                        later_counter += 1
                        previous_after_flag = not previous_after_flag
                        continue
                    try:
                        frames.append(video_reader[frame_indice + later_counter].asnumpy())
                        data_processor_logger.info(
                            f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame"
                        )
                        frame_indices[frame_indice_index] = frame_indice + later_counter
                        break
                    except Exception:
                        later_counter += 1
                # Alternate the search direction after every attempt.
                previous_after_flag = not previous_after_flag
    frames = np.stack(frames, axis=0)
    assert len(frames) == len(frame_indices), f"len(frames): {len(frames)} != len(frame_indices): {len(frame_indices)}"
    ret = []
    # Random directory name for this batch of frames (uuid-based despite the name).
    url_sha1 = get_filename()
    for idx, frame in enumerate(frames):
        tmp = Image.fromarray(frame, "RGB")
        if save_to_disk:
            save_path = os.path.join(cache_dir, f"{url_sha1}", f"{idx}.png")
            if not os.path.exists(os.path.dirname(save_path)):
                os.makedirs(os.path.dirname(save_path))
            tmp.save(save_path)
            tmp = save_path
        ret.append(tmp)
    time_stamps = [frame_idx * video_meta["duration"] / video_meta["num_of_frame"] for frame_idx in frame_indices]
    return ret, frame_indices, time_stamps
+2 -7
View File
@@ -85,7 +85,7 @@ import zmq
from fastdeploy import envs
from fastdeploy.engine.tasks import PoolingTask
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
from fastdeploy.logger.deterministic_logger import DeterministicLogger
from fastdeploy.model_executor.forward_meta import ForwardMeta
@@ -2867,12 +2867,7 @@ class GPUModelRunner(ModelRunnerBase):
return
def _init_image_preprocess(self) -> None:
processor = DataProcessor(
tokenizer_name=self.model_config.model,
image_preprocessor_name=str(self.model_config.model),
)
processor.eval()
image_preprocess = processor.image_preprocessor
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
[1, 3, 1, 1]
)
+2 -7
View File
@@ -33,7 +33,7 @@ from fastdeploy.config import FDConfig
from fastdeploy.engine.pooling_params import PoolingParams
from fastdeploy.engine.request import ImagePosition, Request, RequestType
from fastdeploy.engine.tasks import PoolingTask
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.utils import (
@@ -2566,12 +2566,7 @@ class MetaxModelRunner(ModelRunnerBase):
return
def _init_image_preprocess(self) -> None:
processor = DataProcessor(
tokenizer_name=self.model_config.model,
image_preprocessor_name=str(self.model_config.model),
)
processor.eval()
image_preprocess = processor.image_preprocessor
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
[1, 3, 1, 1]
)
+2 -7
View File
@@ -31,7 +31,7 @@ from paddle import nn
from fastdeploy import envs
from fastdeploy.config import FDConfig
from fastdeploy.engine.request import ImagePosition, Request, RequestType
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.utils import (
@@ -1842,12 +1842,7 @@ class XPUModelRunner(ModelRunnerBase):
self.forward_meta.clear_caches()
def _init_image_preprocess(self) -> None:
processor = DataProcessor(
tokenizer_name=self.model_config.model,
image_preprocessor_name=str(self.model_config.model),
)
processor.eval()
image_preprocess = processor.image_preprocessor
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
[1, 3, 1, 1]
)
+1
View File
@@ -303,6 +303,7 @@ setup(
"model_executor/models/*",
"model_executor/layers/*",
"input/ernie4_5_vl_processor/utils/*",
"input/utils/Roboto-Regular.ttf",
"model_executor/ops/gcu/*",
"model_executor/ops/gcu/fastdeploy_ops/*",
"cache_manager/transfer_factory/get_rdma_nics.sh",
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+1 -1
View File
@@ -30,7 +30,7 @@ from fastdeploy.input.paddleocr_vl_processor.paddleocr_vl_processor import (
PaddleOCRVLProcessor,
)
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
from fastdeploy.input.utils.video import sample_frames_paddleocr as sample_frames
MODULE_PATH = "fastdeploy.input.paddleocr_vl_processor.process"
+2 -2
View File
@@ -24,8 +24,8 @@ from unittest.mock import patch
import numpy as np
from PIL import Image as PILImage
import fastdeploy.input.ernie4_5_vl_processor.process_video as process_video_module
from fastdeploy.input.ernie4_5_vl_processor.process_video import (
import fastdeploy.input.utils.video as process_video_module
from fastdeploy.input.utils.video import (
get_frame_indices,
read_frames_decord,
read_video_decord,
+1 -1
View File
@@ -21,7 +21,7 @@ import numpy as np
from PIL import Image
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
def mock_pil_image(height, width):
+5 -5
View File
@@ -41,16 +41,16 @@ class TestValidateModelPath(unittest.TestCase):
def _patch_console_logger(self):
"""Patch console_logger.warning to capture warnings."""
import fastdeploy.input.utils as utils_mod
import fastdeploy.input.utils.common as common_mod
self._orig_warning = utils_mod.console_logger.warning
utils_mod.console_logger.warning = self._capture_warning
self._orig_warning = common_mod.console_logger.warning
common_mod.console_logger.warning = self._capture_warning
def _unpatch_console_logger(self):
import fastdeploy.input.utils as utils_mod
import fastdeploy.input.utils.common as common_mod
if self._orig_warning is not None:
utils_mod.console_logger.warning = self._orig_warning
common_mod.console_logger.warning = self._orig_warning
def tearDown(self):
self._unpatch_console_logger()
+16 -16
View File
@@ -18,7 +18,7 @@ from unittest.mock import MagicMock, patch
import numpy as np
from fastdeploy.input.video_utils import (
from fastdeploy.input.utils.video import (
_is_gif,
read_video_decord,
sample_frames,
@@ -74,7 +74,7 @@ class TestIsGif(unittest.TestCase):
class TestVideoReaderWrapper(unittest.TestCase):
def _make_wrapper(self, video_path, mock_reader=None):
"""Construct a VideoReaderWrapper with decord mocked out."""
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
if mock_reader is None:
mock_reader = _make_mock_reader()
@@ -112,7 +112,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
def test_del_no_original_file(self):
"""__del__ should be a no-op when original_file is None."""
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
wrapper = object.__new__(VideoReaderWrapper)
wrapper.original_file = None
@@ -125,7 +125,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
import os
import tempfile
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
with tempfile.NamedTemporaryFile(delete=False) as f:
tmp_path = f.name
@@ -138,7 +138,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
def test_non_gif_string_path_does_not_set_original_file(self):
"""Passing a non-GIF string path must NOT set original_file (bug fix)."""
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
mock_reader = _make_mock_reader()
mock_decord = MagicMock()
@@ -151,7 +151,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
def test_bytesio_non_gif_path_does_not_set_original_file(self):
"""Passing a BytesIO that is NOT a GIF must not set original_file."""
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
mock_reader = _make_mock_reader()
mock_decord = MagicMock()
@@ -172,16 +172,16 @@ class TestVideoReaderWrapper(unittest.TestCase):
class TestReadVideoDecord(unittest.TestCase):
def _patch_wrapper(self, num_frames=100, fps=25.0):
"""Return a context manager that replaces VideoReaderWrapper with a mock."""
from fastdeploy.input import video_utils
from fastdeploy.input.utils import video
mock_wrapper = MagicMock()
mock_wrapper.__len__ = MagicMock(return_value=num_frames)
mock_wrapper.get_avg_fps = MagicMock(return_value=fps)
return patch.object(video_utils, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper
return patch.object(video, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper
def test_existing_wrapper_passthrough(self):
"""Already-wrapped reader is returned as-is."""
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
mock_wrapper = MagicMock(spec=VideoReaderWrapper)
mock_wrapper.__len__ = MagicMock(return_value=50)
@@ -196,7 +196,7 @@ class TestReadVideoDecord(unittest.TestCase):
def test_bytes_input_converted_to_bytesio(self):
"""bytes input is converted to BytesIO before creating VideoReaderWrapper."""
from fastdeploy.input import video_utils
from fastdeploy.input.utils import video
captured = []
@@ -210,14 +210,14 @@ class TestReadVideoDecord(unittest.TestCase):
def get_avg_fps(self):
return 10.0
with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
with patch.object(video, "VideoReaderWrapper", FakeWrapper):
reader, meta, path = read_video_decord(b"fake_video_bytes")
self.assertIsInstance(captured[0], io.BytesIO)
def test_string_path_input(self):
"""String path is passed through to VideoReaderWrapper."""
from fastdeploy.input import video_utils
from fastdeploy.input.utils import video
class FakeWrapper:
def __init__(self, path, *args, **kwargs):
@@ -229,7 +229,7 @@ class TestReadVideoDecord(unittest.TestCase):
def get_avg_fps(self):
return 30.0
with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
with patch.object(video, "VideoReaderWrapper", FakeWrapper):
reader, meta, path = read_video_decord("/fake/path.mp4")
self.assertEqual(meta["num_of_frame"], 60)
@@ -333,18 +333,18 @@ class TestSampleFramesDispatcher(unittest.TestCase):
META = {"num_of_frame": 100, "fps": 25.0}
def test_default_variant_is_paddleocr(self):
with patch("fastdeploy.input.video_utils.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
with patch("fastdeploy.input.utils.video.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
sample_frames(1, 4, 100, self.META, num_frames=8)
mock_fn.assert_called_once()
def test_qwen_variant_dispatched(self):
with patch("fastdeploy.input.video_utils.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
with patch("fastdeploy.input.utils.video.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
mock_fn.assert_called_once()
def test_qwen_none_fps_converted_to_sentinel(self):
"""None fps/num_frames → converted to -1 before calling sample_frames_qwen."""
with patch("fastdeploy.input.video_utils.sample_frames_qwen", return_value=np.array([])) as mock_fn:
with patch("fastdeploy.input.utils.video.sample_frames_qwen", return_value=np.array([])) as mock_fn:
sample_frames(2, 4, 100, self.META, fps=None, num_frames=None, variant="qwen")
args = mock_fn.call_args[0]
self.assertEqual(args[4], -1) # fps sentinel