[DataProcessor] Refactor multimodal processor: extract encoding strategies and unify MM processing pipeline (#7298)

* merge mm processor
This commit is contained in:
luukunn
2026-04-15 19:01:06 +08:00
committed by GitHub
parent a218d29488
commit 3f84d8d893
36 changed files with 4016 additions and 681 deletions
+15 -11
View File
@@ -435,17 +435,7 @@ class BaseTextProcessor(ABC):
request["top_k"] = 1
if self.reasoning_parser:
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
parts = request["request_id"].split("_")
if len(parts) > 1:
real_req_id = parts[0]
index = int(parts[1])
n = request.get("n", 1)
for idx in range(index * n, (index + 1) * n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
else:
self.model_status_dict[request["request_id"]] = model_status
request["enable_thinking"] = model_status == "think_start"
self._apply_reasoning_parser(request)
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
@@ -453,6 +443,20 @@ class BaseTextProcessor(ABC):
data_processor_logger.info(f"Processed request dict: {request}")
return request
def _apply_reasoning_parser(self, request):
"""Apply reasoning parser to determine model thinking status."""
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
parts = request["request_id"].split("_")
if len(parts) > 1:
real_req_id = parts[0]
index = int(parts[1])
n = request.get("n", 1)
for idx in range(index * n, (index + 1) * n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
else:
self.model_status_dict[request["request_id"]] = model_status
request["enable_thinking"] = model_status == "think_start"
def clear_request_status(self, task_id):
"""Clear all per-request decode state and return the accumulated text."""
results_all = ""
+23
View File
@@ -0,0 +1,23 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Multimodal encoding strategies for VL model families."""
from fastdeploy.input.encodings.base_encoding import BaseEncoding
from fastdeploy.input.encodings.ernie_encoding import ErnieEncoding
from fastdeploy.input.encodings.paddleocr_encoding import PaddleOCREncoding
from fastdeploy.input.encodings.qwen_encoding import QwenEncoding
from fastdeploy.input.encodings.registry import EncodingRegistry
__all__ = ["BaseEncoding", "EncodingRegistry", "ErnieEncoding", "PaddleOCREncoding", "QwenEncoding"]
+189
View File
@@ -0,0 +1,189 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Abstract base class for multimodal encoding strategies.
Each encoding strategy handles model-family-specific logic such as
position ID computation, image/video preprocessing, and token counting.
New model families should subclass ``BaseEncoding`` and implement all
abstract methods.
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, Optional, Tuple
class BaseEncoding(ABC):
    """Interface that every multimodal encoding strategy implements.

    The abstract methods cover the core encoding pipeline (image/video
    ingestion, position IDs, token counting).  Optional hooks
    (``init_extra``, ``get_mm_max_tokens_per_item``) ship with no-op
    defaults, so a subclass overrides them only when it needs to.
    """

    def __init__(self, processor, processor_kwargs=None):
        kwargs = {} if processor_kwargs is None else processor_kwargs
        cfg = processor.cfg

        # Objects owned by the processor and shared with this strategy.
        self.cfg = cfg
        self.image_processor = processor.image_processor
        self.tokenizer = processor.tokenizer

        # Convolution parameters come either from request kwargs or from the
        # image processor itself, depending on the model config.
        if cfg.conv_params_from_kwargs:
            self.spatial_conv_size = kwargs.get("spatial_conv_size", 2)
            self.temporal_conv_size = kwargs.get("temporal_conv_size", 2)
        else:
            self.spatial_conv_size = self.image_processor.merge_size
            self.temporal_conv_size = self.image_processor.temporal_patch_size

        # Placeholder token IDs for image / video content.
        self.image_token_id = self.tokenizer.convert_tokens_to_ids(cfg.image_token_str)
        self.video_token_id = self.tokenizer.convert_tokens_to_ids(cfg.video_token_str)

        if cfg.has_tokens_per_second:
            vision_config = getattr(getattr(processor, "config", None), "vision_config", None)
            self.tokens_per_second = getattr(vision_config, "tokens_per_second", 2)
        else:
            self.tokens_per_second = 2

        # Default video frame-sampling parameters (per-request values win).
        self.fps = kwargs.get("video_fps", cfg.default_fps)
        self.min_frames = kwargs.get("video_min_frames", cfg.default_min_frames)
        self.max_frames = kwargs.get("video_max_frames", cfg.default_max_frames)
        self.target_frames = kwargs.get("video_target_frames", cfg.default_target_frames)

        # Give the concrete strategy a chance to set up model-specific state.
        self.init_extra(kwargs)

    # ------------------------------------------------------------------
    # Image
    # ------------------------------------------------------------------
    @abstractmethod
    def add_image(self, img, outputs: dict, uuid, token_len=None):
        """Encode a raw image and accumulate the results into *outputs*."""

    @abstractmethod
    def add_processed_image(self, img_cache, outputs: dict, uuid, token_len=None):
        """Accumulate an already-preprocessed (cached) image into *outputs*."""

    # ------------------------------------------------------------------
    # Video
    # ------------------------------------------------------------------
    @abstractmethod
    def add_video(self, frames, outputs: dict, uuid, token_len=None, meta: Optional[dict] = None):
        """Encode decoded video frames and accumulate the results into *outputs*.

        Parameters
        ----------
        frames : array-like
            Decoded video frames.
        outputs : dict
            Mutable accumulator for input_ids, position_ids, etc.
        uuid : str | None
            Unique identifier for cache lookup.
        token_len : int | None
            Expected token count (for validation against pre-tokenised prompts).
        meta : dict | None
            Video metadata (fps, duration, ...).  Strategies that need it
            (e.g. Qwen) read from this dict; those that don't (e.g. Ernie)
            simply ignore it.
        """

    @abstractmethod
    def add_processed_video(self, frames_cache, outputs: dict, uuid, token_len=None):
        """Accumulate an already-preprocessed (cached) video into *outputs*."""

    @abstractmethod
    def load_video(self, url, item: dict) -> Tuple[Any, dict]:
        """Decode the video at *url* and return a ``(frames, meta)`` pair.

        Every implementation must return a 2-tuple so the caller
        (``MultiModalProcessor.text2ids``) can unpack uniformly.
        """

    # ------------------------------------------------------------------
    # Text / position helpers
    # ------------------------------------------------------------------
    @abstractmethod
    def add_text_positions(self, outputs: dict, num_tokens: int):
        """Accumulate position IDs for *num_tokens* text tokens into *outputs*."""

    @abstractmethod
    def append_completion_tokens(self, multimodal_inputs: dict, completion_token_ids):
        """Accumulate completion token IDs (and positions) into *multimodal_inputs*."""

    # ------------------------------------------------------------------
    # Prompt-token-ids path (optional — only models with
    # supports_prompt_token_ids=True need to implement this)
    # ------------------------------------------------------------------
    def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None) -> dict:
        """Build the outputs dict from pre-tokenised ``prompt_token_ids``.

        Parameters
        ----------
        prompt_token_ids : list[int]
            Pre-tokenised token IDs.
        mm_items : list[dict] | None
            Already-extracted multimodal items (each has 'type', 'data',
            'uuid').  ``None`` means text-only.
        """
        raise NotImplementedError(f"{type(self).__name__} does not support prompt_token_ids path")

    # ------------------------------------------------------------------
    # Token counting & packing
    # ------------------------------------------------------------------
    @staticmethod
    @abstractmethod
    def mm_num_tokens(grid_thw):
        """Return how many multimodal tokens a given grid_thw produces."""

    @abstractmethod
    def pack_position_ids(self, outputs: dict):
        """Turn the intermediate position ID lists into the final packed format."""

    # ------------------------------------------------------------------
    # Outputs initialisation
    # ------------------------------------------------------------------
    def _make_outputs(self) -> dict:
        """Build the mutable accumulator dict that encoding results go into.

        Subclasses override to add model-specific fields (e.g. fps, vit fields).
        """
        return {
            "input_ids": [],
            "token_type_ids": [],
            "position_ids": [],
            "images": [],
            "grid_thw": [],
            "image_type_ids": [],
            "labels": [],
            "cur_position": 0,
            "video_cnt": 0,
            "num_input_image_tokens": 0,
            "num_input_video_tokens": 0,
            "mm_positions": [],
            "mm_hashes": [],
        }

    # ------------------------------------------------------------------
    # Optional hooks — subclasses override only when needed
    # ------------------------------------------------------------------
    def init_extra(self, processor_kwargs: dict):
        """Model-specific extra initialisation, invoked once at the end of ``__init__``."""

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Dict[str, int]]:
        """Per-modality max token counts for the scheduler; ``None`` = not applicable."""
        return None
@@ -0,0 +1,424 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Ernie4.5-VL encoding strategy for MultiModalProcessor."""
import copy
from collections import defaultdict
import numpy as np
import paddle
from paddleformers.transformers.image_utils import ChannelDimension
from fastdeploy.engine.request import ImagePosition
from fastdeploy.input.encodings.base_encoding import BaseEncoding
from fastdeploy.input.encodings.registry import EncodingRegistry
from fastdeploy.input.mm_model_config import ERNIE4_5_VL
from fastdeploy.input.utils import IDS_TYPE_FLAG, MAX_IMAGE_DIMENSION
from fastdeploy.multimodal.hasher import MultimodalHasher
@EncodingRegistry.register(ERNIE4_5_VL)
class ErnieEncoding(BaseEncoding):
    """Encoding strategy for Ernie4.5-VL models."""

    # Boundary token constants (string form; converted to IDs via the tokenizer
    # where needed, e.g. in prompt_token_ids2outputs).
    IMG_START = "<|IMAGE_START|>"
    IMG_END = "<|IMAGE_END|>"
    VID_START = "<|VIDEO_START|>"
    VID_END = "<|VIDEO_END|>"

    def init_extra(self, processor_kwargs):
        """Ernie-specific extra initialisation (pixel params, token type mapping, etc.)."""
        # Smart-resize pixel-count bounds for still images and video frames
        # (expressed in multiples of 28x28 patches).
        self.image_min_pixels = processor_kwargs.get("image_min_pixels", 4 * 28 * 28)
        self.image_max_pixels = processor_kwargs.get("image_max_pixels", 6177 * 28 * 28)
        self.video_min_pixels = processor_kwargs.get("video_min_pixels", 299 * 28 * 28)
        self.video_max_pixels = processor_kwargs.get("video_max_pixels", 1196 * 28 * 28)
        self.frames_sample = processor_kwargs.get("video_frames_sample", self.cfg.default_frames_sample)
        # Build token-type mapping for ernie boundary tokens
        self.token_type_mapping = self._build_token_type_mapping()

    def _build_token_type_mapping(self):
        """Map boundary tokens and the image placeholder to the image type flag.

        Unknown tokens default to the text flag via the defaultdict factory.
        """
        mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"])
        for token in (self.IMG_START, self.IMG_END, self.VID_START, self.VID_END):
            mapping[token] = IDS_TYPE_FLAG["image"]
        mapping[self.image_token_id] = IDS_TYPE_FLAG["image"]
        return mapping

    def add_image(self, img, outputs, uuid, token_len=None):
        """Preprocess a raw image and append tokens/positions/features to *outputs*.

        Raises ValueError when *token_len* (expected placeholder count from a
        pre-tokenised prompt) disagrees with the computed token count.
        """
        # Smart resize chooses the patch grid; index [1] selects the patch counts.
        patches_h, patches_w = self.image_processor.get_smarted_resize(
            img.height,
            img.width,
            min_pixels=self.image_min_pixels,
            max_pixels=self.image_max_pixels,
        )[1]
        # One token per spatial_conv_size x spatial_conv_size patch block.
        num_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
        if token_len and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += num_tokens
        # 3-D (t, h, w) positions; t == 1 for a still image.
        pos_ids = self._compute_3d_positions(1, patches_h, patches_w, outputs["cur_position"])
        outputs["position_ids"].extend(pos_ids)
        outputs["cur_position"] = np.max(pos_ids) + 1
        ret = self.image_processor.preprocess(
            images=[img.convert("RGB")],
            do_normalize=False,
            do_rescale=False,
            predetermined_grid_thw=np.array([[patches_h, patches_w]]),
            do_convert_rgb=True,
            input_data_format=ChannelDimension.LAST,
        )
        outputs["images"].append(ret["pixel_values"])
        # Hash the features only when the caller supplied no uuid for the item.
        if not uuid:
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(ret["image_grid_thw"])
        outputs["image_type_ids"].append(0)

    def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
        """Append a cached (already-preprocessed) image to *outputs*.

        *img_cache* is a ``(features, meta)`` pair; ``meta["thw"]`` carries the
        (t, h, w) patch grid — assumes t == 1 for images (TODO confirm).
        """
        img, meta = img_cache
        # Cached features carry one row per patch; conv collapses them into tokens.
        num_tokens = img.shape[0] // (self.spatial_conv_size**2)
        if token_len and num_tokens != token_len:
            raise ValueError("image tokens num not match the size")
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += num_tokens
        _, h, w = meta["thw"]
        pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
        outputs["position_ids"].extend(pos_ids)
        outputs["cur_position"] = np.max(pos_ids) + 1
        outputs["images"].append(img)
        # Cached items always carry their uuid as the hash.
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[1, h, w]]))
        outputs["image_type_ids"].append(0)

    def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
        """Preprocess decoded video frames and append tokens/positions to *outputs*.

        *meta* is accepted for interface parity with other strategies but is
        not used by Ernie.
        """
        # All frames share one smart-resized patch grid, taken from frame 0.
        patches_h, patches_w = self.image_processor.get_smarted_resize(
            frames[0].height,
            frames[0].width,
            min_pixels=self.video_min_pixels,
            max_pixels=self.video_max_pixels,
        )[1]
        num_frames = len(frames)
        # Spatial conv merges patch blocks; temporal conv additionally merges frames.
        num_tokens = (num_frames * patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
        if token_len and num_tokens != token_len:
            raise ValueError("video tokens num not match the size")
        pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
        ret = self.image_processor.preprocess(
            images=None,
            videos=pixel_stack,
            do_normalize=False,
            do_rescale=False,
            predetermined_grid_thw=np.array([[patches_h, patches_w]] * num_frames),
            do_convert_rgb=True,
            input_data_format=ChannelDimension.LAST,
        )
        outputs["images"].append(ret["pixel_values_videos"])
        if not uuid:
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values_videos"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(ret["video_grid_thw"])
        # One image_type_id (1 == video frame) per input frame.
        outputs["image_type_ids"].extend([1] * num_frames)
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += num_tokens
        pos_ids = self._compute_3d_positions(num_frames, patches_h, patches_w, outputs["cur_position"])
        outputs["position_ids"].extend(pos_ids)
        outputs["cur_position"] = np.max(pos_ids) + 1

    def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
        """Append a cached (already-preprocessed) video to *outputs*.

        *frames_cache* is a ``(features, meta)`` pair; ``meta["thw"]`` carries
        the (t, h, w) patch grid of the cached tensor.
        """
        frames, meta = frames_cache
        num_tokens = frames.shape[0] // (self.spatial_conv_size**2 * self.temporal_conv_size)
        if token_len and num_tokens != token_len:
            raise ValueError("video tokens num not match the size")
        t, h, w = meta["thw"]
        outputs["images"].append(frames)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[t, h, w]]))
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += num_tokens
        outputs["image_type_ids"].extend([1] * t)
        pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
        outputs["position_ids"].extend(pos_ids)
        outputs["cur_position"] = np.max(pos_ids) + 1

    def load_video(self, url, item):
        """Decode a video from *url*, sample + timestamp frames, return ``(frames, {})``.

        Ernie ignores video metadata downstream, hence the empty meta dict.
        """
        from fastdeploy.input.utils.render_timestamp import render_frame_timestamp
        from fastdeploy.input.utils.video import read_frames_decord, read_video_decord

        reader, meta, path = read_video_decord(url, save_to_disk=False)
        # Per-item values override the strategy defaults configured at init.
        video_frame_args = {
            "fps": item.get("fps", self.fps),
            "min_frames": item.get("min_frames", self.min_frames),
            "max_frames": item.get("max_frames", self.max_frames),
            "target_frames": item.get("target_frames", self.target_frames),
            "frames_sample": item.get("frames_sample", self.frames_sample),
        }
        video_frame_args = self.set_video_frame_args(video_frame_args, meta)
        frames_data, _, timestamps = read_frames_decord(
            path,
            reader,
            meta,
            target_frames=video_frame_args["target_frames"],
            target_fps=video_frame_args["fps"],
            frames_sample=video_frame_args["frames_sample"],
            save_to_disk=False,
        )
        # Burn each frame's timestamp into the frame image.
        frames = []
        for img_array, ts in zip(frames_data, timestamps):
            frames.append(render_frame_timestamp(img_array, ts))
        # Ensure even number of frames for temporal conv
        if len(frames) % 2 != 0:
            frames.append(copy.deepcopy(frames[-1]))
        return frames, {}

    def set_video_frame_args(self, video_frame_args, video_meta):
        """Set final frame sampling args based on priorities."""
        if video_frame_args["target_frames"] > 0:
            # Explicit target_frames: fps must be disabled and the target must
            # lie within the configured [min_frames, max_frames] bounds.
            if video_frame_args["fps"] >= 0:
                raise ValueError("fps must be negative if target_frames is given")
            if (
                video_frame_args["min_frames"] > 0
                and video_frame_args["target_frames"] < video_frame_args["min_frames"]
            ):
                raise ValueError("target_frames must be larger than min_frames")
            if (
                video_frame_args["max_frames"] > 0
                and video_frame_args["target_frames"] > video_frame_args["max_frames"]
            ):
                raise ValueError("target_frames must be smaller than max_frames")
        else:
            # fps-driven sampling: derive the frame count from duration, then
            # clamp into [min_frames, max_frames] by switching to target_frames.
            if video_frame_args["fps"] < 0:
                raise ValueError("Must provide either positive target_fps or positive target_frames.")
            frames_to_extract = int(video_meta["duration"] * video_frame_args["fps"])
            if (
                video_frame_args["min_frames"] > 0
                and video_frame_args["max_frames"] > 0
                and video_frame_args["min_frames"] > video_frame_args["max_frames"]
            ):
                raise ValueError("min_frames must be smaller than max_frames")
            if video_frame_args["min_frames"] > 0 and frames_to_extract < video_frame_args["min_frames"]:
                # Too few frames at this fps: sample exactly min_frames instead.
                video_frame_args["target_frames"] = video_frame_args["min_frames"]
                video_frame_args["fps"] = -1
            if video_frame_args["max_frames"] > 0 and frames_to_extract > video_frame_args["max_frames"]:
                # Too many frames at this fps: cap at max_frames.
                video_frame_args["target_frames"] = video_frame_args["max_frames"]
                video_frame_args["fps"] = -1
        return video_frame_args

    def add_text_positions(self, outputs, num_tokens):
        """Write text position IDs in ernie [pos, pos, pos] format."""
        start = outputs["cur_position"]
        for i in range(num_tokens):
            outputs["position_ids"].append([start + i] * 3)
        outputs["cur_position"] += num_tokens

    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
        """Append completion token IDs, text type flags, and 3-D positions."""
        num_tokens = len(completion_token_ids)
        multimodal_inputs["input_ids"].extend(completion_token_ids)
        multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
        start = multimodal_inputs["cur_position"]
        for i in range(num_tokens):
            multimodal_inputs["position_ids"].append([start + i] * 3)
        multimodal_inputs["cur_position"] += num_tokens

    def _compute_3d_positions(self, t, h, w, start_idx):
        """Compute 3D position IDs as list-of-lists for ernie format."""
        # Still images keep t_eff == 1; videos collapse frames temporally.
        t_eff = t // self.temporal_conv_size if t != 1 else 1
        gh, gw = h // self.spatial_conv_size, w // self.spatial_conv_size
        # Enumerate (time, row, col) coordinates of the post-conv token grid.
        time_idx = np.repeat(np.arange(t_eff), gh * gw)
        h_idx = np.tile(np.repeat(np.arange(gh), gw), t_eff)
        w_idx = np.tile(np.arange(gw), t_eff * gh)
        coords = list(zip(time_idx, h_idx, w_idx))
        # Offset every axis by the running position counter.
        return [[start_idx + ti, start_idx + hi, start_idx + wi] for ti, hi, wi in coords]

    def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None):
        """Build the outputs dict from pre-tokenised ids, expanding mm placeholders.

        Walks the token stream; runs delimited by IMG_START/IMG_END (or
        VID_START/VID_END) are replaced by the encoded image/video from
        *mm_items*, validated against the placeholder length.  Raises
        ValueError on placeholder/item count mismatches or unterminated spans.
        """
        outputs = self._make_outputs()
        prompt_token_ids_len = len(prompt_token_ids)
        # Text-only fast path: every token is text with replicated 3-D positions.
        if mm_items is None:
            outputs["input_ids"].extend(prompt_token_ids)
            outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * prompt_token_ids_len)
            for i in range(prompt_token_ids_len):
                outputs["position_ids"].append([i] * 3)
            outputs["cur_position"] += prompt_token_ids_len
            return outputs
        # Split items by modality, keeping uuids aligned by index.
        images, videos = [], []
        image_uuid, video_uuid = [], []
        for item in mm_items:
            if item.get("type") == "image":
                images.append(item["data"])
                image_uuid.append(item.get("uuid"))
            elif item.get("type") == "video":
                videos.append(item["data"])
                video_uuid.append(item.get("uuid"))
        image_start_id = self.tokenizer.convert_tokens_to_ids(self.IMG_START)
        image_end_id = self.tokenizer.convert_tokens_to_ids(self.IMG_END)
        video_start_id = self.tokenizer.convert_tokens_to_ids(self.VID_START)
        video_end_id = self.tokenizer.convert_tokens_to_ids(self.VID_END)
        st, image_idx, video_idx = 0, 0, 0
        while st < prompt_token_ids_len:
            cur_token_id = prompt_token_ids[st]
            if cur_token_id == image_start_id:
                if image_idx >= len(images):
                    raise ValueError("prompt token ids has more image placeholder than in messages")
                # append image_start_id
                outputs["input_ids"].extend([cur_token_id])
                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]])
                outputs["position_ids"].append([outputs["cur_position"]] * 3)
                outputs["cur_position"] += 1
                st += 1
                # process placeholder token ids
                cur_idx = st
                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != image_end_id:
                    cur_idx += 1
                if cur_idx >= prompt_token_ids_len:
                    raise ValueError("image token ids not complete")
                image = images[image_idx]
                uuid = image_uuid[image_idx] if image_uuid else None
                token_len = cur_idx - st
                # A tuple marks a cached (features, meta) entry; otherwise raw.
                if not isinstance(image, tuple):
                    self.add_image(image, outputs, uuid, token_len)
                else:
                    self.add_processed_image(image, outputs, uuid, token_len)
                image_idx += 1
                # Leave st on the end token so the else-branch records it.
                st = cur_idx
            elif cur_token_id == video_start_id:
                if video_idx >= len(videos):
                    raise ValueError("prompt token ids has more video placeholder than in messages")
                # append video_start_id
                outputs["input_ids"].extend([cur_token_id])
                outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]])
                outputs["position_ids"].append([outputs["cur_position"]] * 3)
                outputs["cur_position"] += 1
                st += 1
                # process placeholder token ids
                cur_idx = st
                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != video_end_id:
                    cur_idx += 1
                if cur_idx >= prompt_token_ids_len:
                    raise ValueError("video token ids not complete")
                video = videos[video_idx]
                uuid = video_uuid[video_idx] if video_uuid else None
                token_len = cur_idx - st
                if not isinstance(video, tuple):
                    # Raw videos may arrive as a dict with sampling params or a bare url.
                    if isinstance(video, dict):
                        frames, _ = self.load_video(video["video"], video)
                    else:
                        frames, _ = self.load_video(video, {})
                    self.add_video(frames, outputs, uuid, token_len=token_len)
                else:
                    self.add_processed_video(video, outputs, uuid, token_len)
                video_idx += 1
                st = cur_idx
            else:
                # Plain token (including the end boundary tokens reached above).
                outputs["input_ids"].extend([cur_token_id])
                type_flag = (
                    IDS_TYPE_FLAG["image"] if cur_token_id in (image_end_id, video_end_id) else IDS_TYPE_FLAG["text"]
                )
                outputs["token_type_ids"].extend([type_flag])
                outputs["position_ids"].append([outputs["cur_position"]] * 3)
                outputs["cur_position"] += 1
                st += 1
        if image_idx != len(images):
            raise ValueError("number of images does not match")
        if video_idx != len(videos):
            raise ValueError("number of videos does not match")
        return outputs

    @staticmethod
    def mm_num_tokens(grid_thw):
        """Ernie mm_num_tokens: video (t>1) divides by an extra 2."""
        if isinstance(grid_thw, paddle.Tensor):
            grid_thw = grid_thw.numpy()
        if len(grid_thw) == 0:
            return 0

        def calc_one(thw):
            # // 4 corresponds to the 2x2 spatial merge; videos divide by an
            # extra 2 for the temporal merge.
            t, h, w = map(int, thw)
            if t == 1:
                return t * h * w // 4
            else:
                return t * h * w // 4 // 2

        # Accept either a single (t, h, w) triple or a batch of them.
        if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
            return [calc_one(x) for x in grid_thw]
        return calc_one(grid_thw)

    def pack_position_ids(self, outputs):
        """Ernie: position_ids is np.array (list-of-lists -> ndarray)."""
        outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
        outputs["image_patch_id"] = self.image_token_id

    def get_mm_max_tokens_per_item(self, seq_len):
        """Per-modality max token counts for ernie."""
        target_height, target_width = self._get_image_size_with_most_features()
        # image
        patches_h, patches_w = self.image_processor.get_smarted_resize(
            height=target_height,
            width=target_width,
            min_pixels=self.image_min_pixels,
            max_pixels=self.image_max_pixels,
        )[1]
        max_image_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2)
        # Never report more tokens than the sequence can hold.
        max_image_tokens = min(max_image_tokens, seq_len)
        # video
        patches_h, patches_w = self.image_processor.get_smarted_resize(
            height=target_height,
            width=target_width,
            min_pixels=self.video_min_pixels,
            max_pixels=self.video_max_pixels,
        )[1]
        max_video_tokens = (patches_h * patches_w) // (self.spatial_conv_size**2 * self.temporal_conv_size)
        max_video_tokens = min(max_video_tokens, seq_len)
        return {"image": max_image_tokens, "video": max_video_tokens}

    def _get_image_size_with_most_features(self):
        """Return the resized (h, w) a maximally large input image would get."""
        # Index [0] of get_smarted_resize is the resized pixel size (not the
        # patch grid) — NOTE(review): inferred from usage; confirm.
        resized_height, resized_width = self.image_processor.get_smarted_resize(
            height=MAX_IMAGE_DIMENSION,
            width=MAX_IMAGE_DIMENSION,
            min_pixels=self.image_min_pixels,
            max_pixels=self.image_max_pixels,
        )[0]
        return (resized_height, resized_width)
@@ -0,0 +1,190 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PaddleOCR-VL encoding strategy."""
import numpy as np
from PIL import Image
from fastdeploy.engine.request import ImagePosition
from fastdeploy.input.encodings.qwen_encoding import QwenEncoding
from fastdeploy.input.encodings.registry import EncodingRegistry
from fastdeploy.input.mm_model_config import PADDLEOCR_VL
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.utils.video import read_video_decord
from fastdeploy.input.utils.video import sample_frames_paddleocr as _sample_paddleocr
from fastdeploy.multimodal.hasher import MultimodalHasher
@EncodingRegistry.register(PADDLEOCR_VL)
class PaddleOCREncoding(QwenEncoding):
"""Encoding strategy for paddleocr_vl.
Inherits from QwenEncoding and overrides methods that differ:
- _make_outputs: add vit_seqlen / vit_position_ids
- add_image / add_video: append vit_fields (vit_seqlen, vit_position_ids)
- add_video / add_processed_video: use video_token_id instead of image_token_id
- load_video: use sample_frames_paddleocr instead of sample_frames_qwen
"""
def _make_outputs(self) -> dict:
outputs = super()._make_outputs()
outputs["vit_seqlen"] = []
outputs["vit_position_ids"] = []
return outputs
def add_image(self, img, outputs, uuid, token_len=None):
ret = self.image_processor.preprocess(images=[img.convert("RGB")])
num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
grid_thw = ret["grid_thw"].tolist()
if token_len is not None and token_len != num_tokens:
raise ValueError("image tokens num not match the size")
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
outputs["input_ids"].extend([self.image_token_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
outputs["num_input_image_tokens"] += int(num_tokens)
outputs["images"].append(ret["pixel_values"])
if not uuid:
outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
else:
outputs["mm_hashes"].append(uuid)
outputs["grid_thw"].append(grid_thw)
outputs["image_type_ids"].append(0)
t, h, w = grid_thw
pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
outputs["position_ids"].append(pos_ids)
outputs["cur_position"] = pos_ids.max() + 1
outputs["fps"].append(0)
# paddleocr vit fields
numel = h * w
outputs["vit_seqlen"].append(numel)
outputs["vit_position_ids"].append(np.arange(numel) % numel)
def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
super().add_processed_image(img_cache, outputs, uuid, token_len)
_, h, w = img_cache[1]["thw"]
numel = h * w
outputs["vit_seqlen"].append(numel)
outputs["vit_position_ids"].append(np.arange(numel) % numel)
def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
preprocess_kwargs = {}
if self.cfg.video_min_pixels is not None:
preprocess_kwargs["min_pixels"] = self.cfg.video_min_pixels
preprocess_kwargs["max_pixels"] = self.cfg.video_max_pixels
ret = self.image_processor.preprocess(images=frames, **preprocess_kwargs)
num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
grid_thw = ret["grid_thw"].tolist()
if token_len is not None and token_len != num_tokens:
raise ValueError("video tokens num not match the size")
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
outputs["input_ids"].extend([self.video_token_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
outputs["num_input_video_tokens"] += int(num_tokens)
outputs["images"].append(ret["pixel_values"])
if not uuid:
outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
else:
outputs["mm_hashes"].append(uuid)
outputs["grid_thw"].append(grid_thw)
outputs["image_type_ids"].extend([1] * grid_thw[0])
fps = meta["fps"] if meta else 0
second_per_grid_t = self.temporal_conv_size / fps if fps else 0
t, h, w = grid_thw
pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
outputs["position_ids"].append(pos_ids)
outputs["cur_position"] = pos_ids.max() + 1
outputs["fps"].append(fps)
# paddleocr vit fields
numel = h * w
outputs["vit_seqlen"].append(numel)
outputs["vit_position_ids"].append(np.arange(numel) % numel)
def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
frames, meta = frames_cache
num_tokens = frames.shape[0] // self.image_processor.merge_size**2
if token_len is not None and token_len != num_tokens:
raise ValueError("video tokens num not match the size")
t, h, w = meta["thw"]
outputs["images"].append(frames)
outputs["mm_hashes"].append(uuid)
outputs["grid_thw"].append(np.array([[t, h, w]]))
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
outputs["input_ids"].extend([self.video_token_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
outputs["num_input_video_tokens"] += num_tokens
outputs["image_type_ids"].extend([1] * t)
fps = meta["fps"]
second_per_grid_t = self.temporal_conv_size / fps
pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
outputs["position_ids"].append(pos_ids)
outputs["cur_position"] = pos_ids.max() + 1
outputs["fps"].append(fps)
# paddleocr vit fields
numel = h * w
outputs["vit_seqlen"].append(numel)
outputs["vit_position_ids"].append(np.arange(numel) % numel)
def load_video(self, url, item):
    """Decode a video, sample frames, and return ``(frames, meta)``.

    Args:
        url: path/URL handed to ``read_video_decord``.
        item: per-request dict that may override the processor-level
            sampling defaults (``fps``, ``target_frames``, ``min_frames``,
            ``max_frames``).

    Returns:
        frames: uint8 ndarray of shape (num_frames, H, W, 3).
        meta: decord metadata updated with the sampled frame count and the
            effective fps/duration.
    """
    reader, meta, _ = read_video_decord(url, save_to_disk=False)
    fps = item.get("fps", self.fps)
    num_frames = item.get("target_frames", self.target_frames)
    frame_indices = list(range(meta["num_of_frame"]))
    if fps > 0 or num_frames > 0:
        min_frames = item.get("min_frames", self.min_frames)
        max_frames = item.get("max_frames", self.max_frames)
        frame_indices = _sample_paddleocr(
            frame_factor=self.temporal_conv_size,
            min_frames=min_frames,
            max_frames=max_frames,
            metadata=meta,
            fps=fps,
            num_frames=num_frames,
        )
    meta["num_of_frame"] = len(frame_indices)
    # Only a genuinely positive caller fps defines the effective rate.
    # The previous `fps is not None` test let fps == 0 divide by zero and
    # let the paddleocr sentinel fps = -1 record a negative fps/duration.
    if fps is not None and fps > 0:
        meta["fps"] = fps
        meta["duration"] = len(frame_indices) / fps
    else:
        meta["fps"] = len(frame_indices) / meta["duration"]
    frames = []
    for idx in frame_indices:
        frame = reader[idx].asnumpy()
        frames.append(Image.fromarray(frame, "RGB"))
    frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
    return frames, meta
+314
View File
@@ -0,0 +1,314 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen-family (qwen_vl / qwen3_vl) encoding strategy."""
import numpy as np
import paddle
from PIL import Image
from fastdeploy.engine.request import ImagePosition
from fastdeploy.input.encodings.base_encoding import BaseEncoding
from fastdeploy.input.encodings.registry import EncodingRegistry
from fastdeploy.input.mm_model_config import QWEN3_VL, QWEN_VL
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.utils.video import read_video_decord
from fastdeploy.input.utils.video import sample_frames_qwen as _sample_qwen
from fastdeploy.multimodal.hasher import MultimodalHasher
@EncodingRegistry.register(QWEN_VL, QWEN3_VL)
class QwenEncoding(BaseEncoding):
    """Encoding strategy for qwen_vl and qwen3_vl.

    Converts raw or pre-processed images/videos plus text into model inputs:
    placeholder ``input_ids``, ``token_type_ids``, pixel features, grid
    shapes, and 3D (t, h, w) rope position ids in qwen's 3xN layout.
    """

    # Sampled frame counts are rounded to a multiple of this factor
    # (qwen consumes frames in temporal pairs).
    FRAME_FACTOR = 2

    def _make_outputs(self) -> dict:
        """Extend the base outputs dict with a per-mm-item ``fps`` list."""
        outputs = super()._make_outputs()
        outputs["fps"] = []
        return outputs

    def add_image(self, img, outputs, uuid, token_len=None):
        """Preprocess a raw PIL image and append its tokens/features.

        Raises ValueError when *token_len* (placeholder count seen in the
        prompt) disagrees with the preprocessor's token count.
        """
        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")
        # Record the placeholder span BEFORE extending input_ids.
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += int(num_tokens)
        outputs["images"].append(ret["pixel_values"])
        if not uuid:
            # No caller-supplied uuid: derive a content hash for dedup/caching.
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].append(0)
        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1
        outputs["fps"].append(0)

    def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
        """Append an already-preprocessed image ``(pixels, meta)`` to outputs."""
        img, meta = img_cache
        num_tokens = img.shape[0] // self.image_processor.merge_size**2
        if token_len is not None and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += num_tokens
        _, h, w = meta["thw"]
        # Processed single images are treated as t == 1.
        pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1
        outputs["images"].append(img)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[1, h, w]]))
        outputs["image_type_ids"].append(0)
        outputs["fps"].append(0)

    def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
        """Preprocess raw video frames and append their tokens/features."""
        preprocess_kwargs = {}
        # qwen3_vl passes min/max pixels for video
        if self.cfg.video_min_pixels is not None:
            preprocess_kwargs["min_pixels"] = self.cfg.video_min_pixels
            preprocess_kwargs["max_pixels"] = self.cfg.video_max_pixels
        ret = self.image_processor.preprocess(images=frames, **preprocess_kwargs)
        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        # NOTE(review): video placeholders reuse image_token_id here; the
        # prompt scan in prompt_token_ids2outputs also matches image_token_id
        # for videos, so this appears intentional for the qwen family.
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += int(num_tokens)
        outputs["images"].append(ret["pixel_values"])
        if not uuid:
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].extend([1] * grid_thw[0])
        fps = meta["fps"] if meta else 0
        second_per_grid_t = self.temporal_conv_size / fps if fps else 0
        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1
        outputs["fps"].append(fps)

    def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
        """Append an already-preprocessed video ``(frames, meta)`` to outputs."""
        frames, meta = frames_cache
        num_tokens = frames.shape[0] // self.image_processor.merge_size**2
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")
        t, h, w = meta["thw"]
        outputs["images"].append(frames)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[t, h, w]]))
        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += num_tokens
        outputs["image_type_ids"].extend([1] * t)
        fps = meta["fps"]
        # Guard fps == 0 to avoid ZeroDivisionError; parity with add_video().
        second_per_grid_t = self.temporal_conv_size / fps if fps else 0
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1
        outputs["fps"].append(fps)

    def load_video(self, url, item):
        """Decode a video, sample frames, and return ``(frames, meta)``.

        ``item`` may override the processor-level sampling defaults
        (``fps``, ``target_frames``, ``min_frames``, ``max_frames``).
        """
        reader, meta, _ = read_video_decord(url, save_to_disk=False)
        fps = item.get("fps", self.fps)
        num_frames = item.get("target_frames", self.target_frames)
        frame_indices = list(range(meta["num_of_frame"]))
        if fps > 0 or num_frames > 0:
            min_frames = item.get("min_frames", self.min_frames)
            max_frames = item.get("max_frames", self.max_frames)
            frame_indices = _sample_qwen(
                frame_factor=self.FRAME_FACTOR,
                min_frames=min_frames,
                max_frames=max_frames,
                metadata=meta,
                # An explicit target frame count takes precedence over fps.
                fps=-1 if num_frames > 0 else fps,
                num_frames=num_frames,
            )
        meta["num_of_frame"] = len(frame_indices)
        # Truthiness check: the previous `fps is not None` let fps == 0 fall
        # through to a ZeroDivisionError; 0/None now derive the effective
        # rate from the sampled frame count instead.
        if fps:
            meta["fps"] = fps
            meta["duration"] = len(frame_indices) / fps
        else:
            meta["fps"] = len(frame_indices) / meta["duration"]
        frames = []
        for idx in frame_indices:
            frame = reader[idx].asnumpy()
            frames.append(Image.fromarray(frame, "RGB"))
        frames = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
        return frames, meta

    def add_text_positions(self, outputs, num_tokens):
        """Write text position IDs in qwen 3xN ndarray format."""
        pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
        """Append completion tokens (text) to an existing multimodal input dict."""
        num_tokens = len(completion_token_ids)
        multimodal_inputs["input_ids"].extend(completion_token_ids)
        multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
        pos_ids = self._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
        multimodal_inputs["position_ids"].append(pos_ids)
        multimodal_inputs["cur_position"] += num_tokens

    def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None):
        """Build outputs from prompt_token_ids. Only qwen3_vl supports this.

        Walks the token stream, mapping each contiguous run of
        ``image_token_id`` placeholders to the next item in *mm_items*
        (images and videos both use image_token_id placeholders here).
        """
        outputs = self._make_outputs()
        prompt_token_ids_len = len(prompt_token_ids)
        if mm_items is None:
            self._add_text_tokens(prompt_token_ids, outputs)
            return outputs
        st, mm_idx = 0, 0
        while st < prompt_token_ids_len:
            if prompt_token_ids[st] != self.image_token_id:
                # Consume a run of plain text tokens.
                cur_idx = st
                while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] != self.image_token_id:
                    cur_idx += 1
                self._add_text_tokens(prompt_token_ids[st:cur_idx], outputs)
                st = cur_idx
                continue
            if mm_idx >= len(mm_items):
                raise ValueError("prompt token ids has more multimodal placeholder than in messages")
            # Consume a run of multimodal placeholder tokens.
            cur_idx = st
            while cur_idx < prompt_token_ids_len and prompt_token_ids[cur_idx] == self.image_token_id:
                cur_idx += 1
            item = mm_items[mm_idx]
            uuid = item.get("uuid")
            token_len = cur_idx - st
            if item.get("type") == "image":
                image = item.get("data")
                # A tuple marks pre-processed (pixels, meta) from the cache.
                if not isinstance(image, tuple):
                    self.add_image(image, outputs, uuid, token_len)
                else:
                    self.add_processed_image(image, outputs, uuid, token_len)
            elif item.get("type") == "video":
                video = item.get("data")
                if not isinstance(video, tuple):
                    if isinstance(video, dict):
                        frames, meta = self.load_video(video["video"], video)
                    else:
                        frames, meta = self.load_video(video, {})
                    self.add_video(frames, outputs, uuid, token_len=token_len, meta=meta)
                else:
                    self.add_processed_video(video, outputs, uuid, token_len)
            else:
                raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
            mm_idx += 1
            st = cur_idx
        if mm_idx != len(mm_items):
            raise ValueError("number of multimodal items does not match prompt token ids")
        return outputs

    def _add_text_tokens(self, tokens, outputs):
        """Helper: add text tokens with position IDs."""
        if not tokens:
            return
        num_tokens = len(tokens)
        outputs["input_ids"].extend(tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
        self.add_text_positions(outputs, num_tokens)

    def _compute_text_positions(self, start_pos, num_tokens):
        """3xN ndarray for qwen-family text positions (t == h == w index)."""
        text_array = np.arange(num_tokens).reshape(1, -1)
        text_index = np.broadcast_to(text_array, (3, num_tokens))
        return text_index + start_pos

    def _compute_vision_positions(self, start_pos, t, h, w, second_per_grid_t):
        """3D position IDs as 3xN ndarray for qwen-family.

        h/w are given in patch units and reduced by the spatial merge size.
        """
        h //= self.spatial_conv_size
        w //= self.spatial_conv_size
        tn = np.arange(t).reshape(-1, 1)
        tn = np.broadcast_to(tn, (t, h * w))
        # NOTE(review): int() truncates fractional second_per_grid_t (e.g.
        # 0.5 -> 0) before scaling by tokens_per_second — confirm this matches
        # the reference qwen2-vl M-RoPE implementation before changing it.
        tn = tn * int(second_per_grid_t) * self.tokens_per_second
        t_index = tn.flatten()
        hn = np.arange(h).reshape(1, -1, 1)
        h_index = np.broadcast_to(hn, (t, h, w)).flatten()
        wn = np.arange(w).reshape(1, 1, -1)
        w_index = np.broadcast_to(wn, (t, h, w)).flatten()
        return np.stack([t_index, h_index, w_index]) + start_pos

    @staticmethod
    def mm_num_tokens(grid_thw):
        """Qwen mm_num_tokens: t * h * w // 4 (2x2 spatial merge).

        Accepts one (t, h, w) triple or a sequence of them; returns an int
        or list of ints accordingly.
        """
        if isinstance(grid_thw, paddle.Tensor):
            grid_thw = grid_thw.numpy()
        if len(grid_thw) == 0:
            return 0

        def calc_one(thw):
            t, h, w = map(int, thw)
            return t * h * w // 4

        if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
            return [calc_one(x) for x in grid_thw]
        return calc_one(grid_thw)

    def pack_position_ids(self, outputs):
        """Qwen: concatenate 3xN arrays, then transpose to Nx3."""
        outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
        outputs["image_patch_id"] = self.image_token_id
        outputs["video_patch_id"] = self.video_token_id
        outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
+54
View File
@@ -0,0 +1,54 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Registry for multimodal encoding strategy classes."""
from typing import Dict, Type
class EncodingRegistry:
    """Maps model_type strings to encoding strategy classes.

    Encoding classes register themselves via the ``register`` decorator
    at import time. ``MultiModalProcessor`` queries this registry by
    *model_type* instead of using string-based dynamic imports.
    """

    _registry: Dict[str, Type] = {}

    @classmethod
    def register(cls, *model_types: str):
        """Decorator that registers an encoding class for one or more model types."""

        def decorator(enc_cls):
            for model_type in model_types:
                existing = cls._registry.get(model_type)
                if existing is not None:
                    raise ValueError(
                        f"Encoding for '{model_type}' already registered "
                        f"as {existing.__name__}, "
                        f"cannot re-register as {enc_cls.__name__}"
                    )
                cls._registry[model_type] = enc_cls
            # Return the class unchanged so the decorator is transparent.
            return enc_cls

        return decorator

    @classmethod
    def get(cls, model_type: str) -> Type:
        """Look up the encoding class for a given *model_type*."""
        try:
            return cls._registry[model_type]
        except KeyError:
            raise ValueError(
                f"No encoding registered for '{model_type}'. " f"Available: {sorted(cls._registry.keys())}"
            ) from None
@@ -539,6 +539,7 @@ class DataProcessor(MMBaseDataProcessor):
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
outputs["num_input_image_tokens"] += num_tokens
_, h, w = meta["thw"]
pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
@@ -605,6 +606,7 @@ class DataProcessor(MMBaseDataProcessor):
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
outputs["num_input_video_tokens"] += num_tokens
outputs["image_type_ids"].extend([1] * t)
pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
@@ -25,3 +25,6 @@ from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
ImageProcessor as QwenImageProcessor,
)
from fastdeploy.input.image_processors.registry import ( # noqa: F401
ImageProcessorRegistry,
)
@@ -46,6 +46,8 @@ from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
from fastdeploy.input.mm_model_config import ERNIE4_5_VL
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
@@ -116,6 +118,7 @@ def make_batched_videos(videos) -> List[VideoInput]:
raise ValueError(f"Could not make batched video from {videos}")
@ImageProcessorRegistry.register(ERNIE4_5_VL)
class AdaptiveImageProcessor(BaseImageProcessor):
r"""
Constructs a adaptive image processor that dynamically resizes images based on the original images.
@@ -33,6 +33,8 @@ from paddleformers.transformers.image_utils import (
from fastdeploy.input.image_processors.common import (
smart_resize_paddleocr as smart_resize,
)
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
from fastdeploy.input.mm_model_config import PADDLEOCR_VL
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
@@ -66,6 +68,7 @@ def adjust_size(size, patch_size):
return num_patches * patch_size
@ImageProcessorRegistry.register(PADDLEOCR_VL)
class ImageProcessor(BaseImageProcessor):
model_input_names = [
"pixel_values",
@@ -41,6 +41,8 @@ from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
from fastdeploy.input.mm_model_config import QWEN3_VL
from fastdeploy.utils import data_processor_logger
IMAGE_MEAN = [0.5, 0.5, 0.5]
@@ -62,6 +64,7 @@ VideoInput = Union[
]
@ImageProcessorRegistry.register(QWEN3_VL)
class ImageProcessor(BaseImageProcessor):
"""
Adaptive image processor for dynamic image resizing and preprocessing.
@@ -41,6 +41,8 @@ from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
from PIL import Image
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
from fastdeploy.input.mm_model_config import QWEN_VL
from fastdeploy.utils import data_processor_logger
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
@@ -62,6 +64,7 @@ VideoInput = Union[
]
@ImageProcessorRegistry.register(QWEN_VL)
class ImageProcessor(BaseImageProcessor):
"""
Adaptive image processor for dynamic image resizing and preprocessing.
@@ -0,0 +1,54 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Registry for multimodal image processor classes."""
from typing import Dict, Type
class ImageProcessorRegistry:
    """Maps model_type strings to image processor classes.

    Image processors register themselves via the ``register`` decorator
    at import time. ``MultiModalProcessor`` queries this registry by
    *model_type* instead of using string-based dynamic imports.
    """

    _registry: Dict[str, Type] = {}

    @classmethod
    def register(cls, *model_types: str):
        """Decorator that registers an image processor class for one or more model types."""

        def decorator(proc_cls):
            for model_type in model_types:
                existing = cls._registry.get(model_type)
                if existing is not None:
                    raise ValueError(
                        f"Image processor for '{model_type}' already registered "
                        f"as {existing.__name__}, "
                        f"cannot re-register as {proc_cls.__name__}"
                    )
                cls._registry[model_type] = proc_cls
            # Return the class unchanged so the decorator is transparent.
            return proc_cls

        return decorator

    @classmethod
    def get(cls, model_type: str) -> Type:
        """Look up the image processor class for a given *model_type*."""
        try:
            return cls._registry[model_type]
        except KeyError:
            raise ValueError(
                f"No image processor registered for '{model_type}'. " f"Available: {sorted(cls._registry.keys())}"
            ) from None
+143
View File
@@ -0,0 +1,143 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Per-model-type configuration for the unified MultiModalProcessor."""
from dataclasses import dataclass, field
from typing import Dict, Optional
# Canonical model_type identifiers, shared by MODEL_CONFIGS and the
# encoding/image-processor registries.
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"
@dataclass(frozen=True)
class MMModelConfig:
    """Frozen per-model-type feature flags and defaults consumed by
    the unified ``MultiModalProcessor``."""

    # Placeholder strings searched for in raw prompt text.
    image_placeholder: str
    video_placeholder: str
    tokenizer_type: str = "auto"  # "auto" | "ernie4_5"
    # Video frame-sampling defaults (overridable per request item).
    default_min_frames: int = 4
    default_max_frames: int = 768
    default_target_frames: int = -1
    default_fps: float = 2.0
    default_frames_sample: str = "leading"
    has_bad_words: bool = True
    has_tool_role: bool = False  # ernie: role_prefixes includes "tool"
    default_thinking: bool = False  # ernie: default enable_thinking=True
    force_disable_thinking: bool = False  # qwen_vl, qwen3_vl: force enable_thinking=False
    set_default_reasoning_max_tokens: bool = False  # ernie: auto-set reasoning_max_tokens
    cap_response_max_tokens: bool = False  # ernie: cap max_tokens by response_max_tokens
    has_logits_processor_think: bool = False  # ernie: _prepare_think_stop_sentence
    chat_template_pass_request: bool = False  # ernie: pass full request obj
    supports_prompt_token_ids: bool = False  # qwen3, ernie
    preserve_prompt_token_ids: bool = False  # qwen3, ernie: don't overwrite existing
    stop_tokens_variant: str = "default"  # "default" | "qwen3"
    # Token strings exposed to callers; may differ from the placeholders above.
    image_token_str: str = ""
    video_token_str: str = ""
    # Allowed mm_processor_kwargs keys -> expected value types (validated at init).
    expected_kwargs: Dict[str, type] = field(default_factory=dict)
    video_min_pixels: Optional[int] = None
    video_max_pixels: Optional[int] = None
    # ---- Conv params source ----
    conv_params_from_kwargs: bool = False  # ernie: from processor_kwargs; else: from image_processor
    # ---- tokens_per_second ----
    has_tokens_per_second: bool = True  # qwen-family: read from config; ernie: False
# mm_processor_kwargs accepted by the qwen-family (and paddleocr) processors,
# mapping each key to the type it is validated against.
_QWEN_KWARGS = {
    "video_max_frames": int,
    "video_min_frames": int,
}

# ernie4_5_vl additionally sources conv/pixel parameters from kwargs
# (see MMModelConfig.conv_params_from_kwargs).
_ERNIE_KWARGS = {
    "spatial_conv_size": int,
    "temporal_conv_size": int,
    "image_min_pixels": int,
    "image_max_pixels": int,
    "video_min_pixels": int,
    "video_max_pixels": int,
    "video_target_frames": int,
    "video_frames_sample": str,
    "video_max_frames": int,
    "video_min_frames": int,
    "video_fps": int,
}
# One MMModelConfig per supported VL model_type. MultiModalProcessor rejects
# any model_type not present here.
MODEL_CONFIGS: Dict[str, MMModelConfig] = {
    QWEN_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        expected_kwargs=_QWEN_KWARGS,
    ),
    QWEN3_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        stop_tokens_variant="qwen3",
        # Video pixel bounds forwarded to the image processor (28 = patch*merge).
        video_min_pixels=128 * 28 * 28,
        video_max_pixels=768 * 28 * 28,
        expected_kwargs=_QWEN_KWARGS,
    ),
    PADDLEOCR_VL: MMModelConfig(
        image_placeholder="<|IMAGE_PLACEHOLDER|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        video_token_str="<|video_pad|>",
        has_bad_words=False,
        default_fps=-1.0,  # -1 sentinel: no fixed sampling rate by default
        expected_kwargs=_QWEN_KWARGS,
    ),
    ERNIE4_5_VL: MMModelConfig(
        image_placeholder="<|image@placeholder|>",
        video_placeholder="<|video@placeholder|>",
        tokenizer_type="ernie4_5",
        default_min_frames=16,
        default_max_frames=180,
        default_fps=2.0,
        default_frames_sample="leading",
        has_tool_role=True,
        default_thinking=True,
        set_default_reasoning_max_tokens=True,
        cap_response_max_tokens=True,
        has_logits_processor_think=True,
        chat_template_pass_request=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        # NOTE(review): video_token_str equals the image token here — ernie
        # appears to reuse the image placeholder for video; confirm intended.
        video_token_str="<|IMAGE_PLACEHOLDER|>",
        conv_params_from_kwargs=True,
        has_tokens_per_second=False,
        expected_kwargs=_ERNIE_KWARGS,
    ),
}
+330 -252
View File
@@ -16,46 +16,25 @@
"""Unified multimodal processor for all VL model types.
Consolidates the four separate VL processor wrappers (QwenVLProcessor,
Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a
single class that dispatches per ``model_type``.
Consolidates the four separate VL processor wrappers and four separate
DataProcessor classes into a single class with pluggable Encoding strategies.
"""
import pickle
from collections.abc import Mapping
from typing import Any, Dict, Optional
import numpy as np
import zmq
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.base_processor import BaseTextProcessor
from fastdeploy.input.encodings import EncodingRegistry
from fastdeploy.input.image_processors import ImageProcessorRegistry
from fastdeploy.input.mm_model_config import MODEL_CONFIGS
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
from fastdeploy.utils import data_processor_logger
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"
_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL}
_QWEN_EXPECTED_KWARGS = {
"video_max_frames": int,
"video_min_frames": int,
}
_ERNIE_EXPECTED_KWARGS = {
"spatial_conv_size": int,
"temporal_conv_size": int,
"image_min_pixels": int,
"image_max_pixels": int,
"video_min_pixels": int,
"video_max_pixels": int,
"video_target_frames": int,
"video_frames_sample": str,
"video_max_frames": int,
"video_min_frames": int,
"video_fps": int,
}
_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}
_SAMPLING_EPS = 1e-5
@@ -64,8 +43,9 @@ _SAMPLING_EPS = 1e-5
class MultiModalProcessor(BaseTextProcessor):
"""Unified multimodal processor for all supported VL model types.
Dispatches image-processor creation, config initialisation, and
encoding logic based on ``model_type``.
Uses a composition pattern: model-type-specific encoding logic is
delegated to ``self.enc`` (an Encoding instance), while common logic
(tokenization loop, request processing, caching) lives here.
"""
def __init__(
@@ -79,19 +59,16 @@ class MultiModalProcessor(BaseTextProcessor):
tool_parser_obj=None,
enable_processor_cache: bool = False,
):
if model_type not in _SUPPORTED_MODEL_TYPES:
raise ValueError(
f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}."
)
if model_type not in MODEL_CONFIGS:
raise ValueError(f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(MODEL_CONFIGS)}.")
self.model_type = model_type
self.config = config
self.cfg = MODEL_CONFIGS[model_type]
self.enable_processor_cache = enable_processor_cache
tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto"
super().__init__(
model_name_or_path,
tokenizer_type=tokenizer_type,
tokenizer_type=self.cfg.tokenizer_type,
reasoning_parser_obj=reasoning_parser_obj,
tool_parser_obj=tool_parser_obj,
)
@@ -99,8 +76,13 @@ class MultiModalProcessor(BaseTextProcessor):
data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
self._init_mm_processor(processor_kwargs)
self._init_mm_config()
self._init_image_processor()
self._init_role_prefixes()
# Composition: create encoding strategy via registry
enc_cls = EncodingRegistry.get(self.model_type)
self.enc = enc_cls(self, processor_kwargs)
self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
def _load_tokenizer(self):
@@ -122,76 +104,30 @@ class MultiModalProcessor(BaseTextProcessor):
tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
return tokenizer
def _init_mm_processor(self, processor_kwargs: dict):
"""Create the model-type-specific internal DataProcessor."""
if self.model_type == QWEN_VL:
from fastdeploy.input.qwen_vl_processor.process import DataProcessor
def _init_image_processor(self):
"""Create the appropriate image processor."""
cls = ImageProcessorRegistry.get(self.model_type)
self.image_processor = cls.from_pretrained(self.model_name_or_path)
tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
self.processor = DataProcessor(
model_path=self.model_name_or_path,
enable_processor_cache=self.enable_processor_cache,
tokens_per_second=tokens_per_second,
tokenizer=self.tokenizer,
**processor_kwargs,
)
elif self.model_type == QWEN3_VL:
from fastdeploy.input.qwen3_vl_processor.process import DataProcessor
self.processor = DataProcessor(
model_path=self.model_name_or_path,
enable_processor_cache=self.enable_processor_cache,
tokenizer=self.tokenizer,
**processor_kwargs,
)
elif self.model_type == PADDLEOCR_VL:
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
self.processor = DataProcessor(
model_path=self.model_name_or_path,
enable_processor_cache=self.enable_processor_cache,
tokens_per_second=tokens_per_second,
tokenizer=self.tokenizer,
**processor_kwargs,
)
elif self.model_type == ERNIE4_5_VL:
from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor
self.processor = DataProcessor(
tokenizer_name=self.model_name_or_path,
image_preprocessor_name=self.model_name_or_path,
enable_processor_cache=self.enable_processor_cache,
**processor_kwargs,
)
self.processor.eval()
def _init_mm_config(self):
"""Set model-type-specific multimodal configuration attributes."""
if self.model_type in (QWEN_VL, QWEN3_VL):
self.image_patch_id = self.processor.image_token_id
elif self.model_type == PADDLEOCR_VL:
self.image_patch_id = self.processor.image_patch_id
elif self.model_type == ERNIE4_5_VL:
self.image_patch_id = self.processor.image_patch_id
self.spatial_conv_size = self.processor.spatial_conv_size
def _init_role_prefixes(self):
    """Build the role -> transcript-prefix map used when flattening messages.

    Roles absent from this map are rejected by message parsing.
    """
    prefixes = {}
    prefixes["system"] = ""
    prefixes["user"] = "User: "
    # "bot" and "assistant" are aliases for the model's own turns.
    prefixes["bot"] = "Assistant: "
    prefixes["assistant"] = "Assistant: "
    if self.cfg.has_tool_role:
        # ernie-family models additionally support a tool role.
        prefixes["tool"] = "Tool: "
    self.role_prefixes = prefixes
def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
"""Parse and validate multimodal processor kwargs."""
if not kwargs:
return {}
try:
if not isinstance(kwargs, dict):
raise ValueError("mm-processor-kwargs must be a dictionary")
data_processor_logger.info(f"Processing kwargs: {kwargs}")
if self.model_type == ERNIE4_5_VL:
expected_types = _ERNIE_EXPECTED_KWARGS
else:
expected_types = _QWEN_EXPECTED_KWARGS
expected_types = self.cfg.expected_kwargs
for key, value in kwargs.items():
if key in expected_types and not isinstance(value, expected_types[key]):
raise ValueError(
@@ -199,16 +135,13 @@ class MultiModalProcessor(BaseTextProcessor):
f"{expected_types[key].__name__}, got {type(value).__name__}"
)
return kwargs
except Exception as e:
data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
return {}
def _parse_limits(self, limits: Optional[dict]) -> dict:
"""Parse multimodal input limits, merging with defaults."""
if not limits:
return dict(_DEFAULT_MM_LIMITS)
try:
if not isinstance(limits, dict):
raise ValueError("limit-mm-per-prompt must be a dictionary")
@@ -219,7 +152,6 @@ class MultiModalProcessor(BaseTextProcessor):
return dict(_DEFAULT_MM_LIMITS)
def _check_mm_limits(self, item):
"""Validate multimodal inputs against configured limits."""
if isinstance(item, dict):
mm_data = item
else:
@@ -232,7 +164,6 @@ class MultiModalProcessor(BaseTextProcessor):
mm_data["image"].append(part)
elif part_type in ("video_url", "video"):
mm_data["video"].append(part)
for modality, data in mm_data.items():
if modality in self.limit_mm_per_prompt:
limit = self.limit_mm_per_prompt[modality]
@@ -240,86 +171,201 @@ class MultiModalProcessor(BaseTextProcessor):
raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
"""Return per-modality max token counts, if available."""
if self.model_type == ERNIE4_5_VL:
return self.processor.get_mm_max_tokens_per_item(seq_len)
return None
return self.enc.get_mm_max_tokens_per_item(seq_len)
    def _extract_mm_items(self, request):
        """Extract images/videos from request messages, handling processor cache.

        Walks every message in ``request["messages"]``, collects content parts
        whose ``type`` is ``image`` or ``video``, and resolves items that carry
        only a ``uuid`` (no inline ``data``) by fetching them from the external
        processor cache over a ZeroMQ DEALER socket.

        Returns:
            tuple: ``(images, videos, image_uuid, video_uuid, dealer,
            missing_idx, mm_items)``. ``dealer`` is the open ZMQ socket (or
            ``None`` when the processor cache is disabled); ``missing_idx``
            lists indices of items that were fetched from the cache.

        Raises:
            ValueError: on an unsupported role, a data-less item while the
                processor cache is disabled, a cache miss, or an unknown
                multimodal item type.
        """
        messages = parse_chat_messages(request.get("messages"))
        mm_items = []
        for msg in messages:
            role = msg.get("role")
            if role not in self.role_prefixes:
                raise ValueError(f"Unsupported role: {role}")
            # Normalise scalar/None content to a list of parts.
            content = msg.get("content")
            if not isinstance(content, list):
                content = [content]
            for item in content:
                if item.get("type") in ["image", "video"]:
                    mm_items.append(item)
        # Items with no inline payload must be recovered from the cache.
        missing_hashes, missing_idx = [], []
        for idx, item in enumerate(mm_items):
            if not item.get("data"):
                missing_hashes.append(item.get("uuid"))
                missing_idx.append(idx)
        if len(missing_hashes) > 0 and not self.enable_processor_cache:
            raise ValueError("Missing items cannot be retrieved without processor cache.")
        dealer = None
        if self.enable_processor_cache:
            # NOTE(review): the socket is handed back to the caller and not
            # closed here — caller appears to own its lifetime; confirm.
            context = zmq.Context()
            dealer = context.socket(zmq.DEALER)
            dealer.connect("ipc:///dev/shm/processor_cache.ipc")
            missing_items = self.get_processor_cache(dealer, missing_hashes)
            for idx in range(len(missing_items)):
                if not missing_items[idx]:
                    raise ValueError(f"Missing item {idx} not found in processor cache")
                mm_items[missing_idx[idx]]["data"] = missing_items[idx]
        # Split resolved items by modality, keeping uuids aligned with data.
        images, videos = [], []
        image_uuid, video_uuid = [], []
        for item in mm_items:
            if item.get("type") == "image":
                images.append(item["data"])
                image_uuid.append(item["uuid"])
            elif item.get("type") == "video":
                videos.append(item["data"])
                video_uuid.append(item["uuid"])
            else:
                raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
        return images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items
    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
        """Convert text with image/video placeholders into model inputs.

        Scans *text* left-to-right for the configured image/video placeholder
        strings. Plain-text segments are tokenised via ``self._add_text``;
        each placeholder consumes the next entry from *images*/*videos* and is
        encoded by the encoding strategy ``self.enc``.

        Returns:
            dict: the outputs structure created by ``self.enc._make_outputs()``
            with input ids, token type ids, positions and media accumulated.
        """
        outputs = self.enc._make_outputs()
        IMAGE_PLACEHOLDER = self.cfg.image_placeholder
        VIDEO_PLACEHOLDER = self.cfg.video_placeholder
        IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
        VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
        st, image_idx, video_idx = 0, 0, 0
        while st < len(text):
            # Find the next placeholder of either kind; len(text) acts as an
            # "absent" sentinel so min() picks whichever occurs first.
            image_pos = text.find(IMAGE_PLACEHOLDER, st)
            image_pos = len(text) if image_pos == -1 else image_pos
            video_pos = text.find(VIDEO_PLACEHOLDER, st)
            video_pos = len(text) if video_pos == -1 else video_pos
            ed = min(image_pos, video_pos)
            # Tokenise the plain-text run before the placeholder (may be "").
            self._add_text(text[st:ed], outputs)
            if ed == len(text):
                break
            if ed == image_pos:
                image = images[image_idx]
                uuid = image_uuid[image_idx] if image_uuid else None
                # NOTE(review): a tuple appears to mark an already-processed
                # item (e.g. retrieved from the processor cache) — confirm.
                if not isinstance(image, tuple):
                    self.enc.add_image(image, outputs, uuid)
                else:
                    self.enc.add_processed_image(image, outputs, uuid)
                image_idx += 1
                st = ed + IMAGE_PLACEHOLDER_LEN
            else:
                item = videos[video_idx]
                uuid = video_uuid[video_idx] if video_uuid else None
                if not isinstance(item, tuple):
                    # Raw video: a dict carries the payload under "video" plus
                    # per-item options; anything else is passed to load_video
                    # with empty options.
                    if isinstance(item, dict):
                        frames, meta = self.enc.load_video(item["video"], item)
                    else:
                        frames, meta = self.enc.load_video(item, {})
                    self.enc.add_video(frames, outputs, uuid, meta=meta)
                else:
                    self.enc.add_processed_video(item, outputs, uuid)
                video_idx += 1
                st = ed + VIDEO_PLACEHOLDER_LEN
        return outputs
    def request2ids(self, request):
        """Convert a chat request with multimodal messages into model inputs.

        Extracts image/video items (resolving missing ones via the processor
        cache), renders the chat template into a prompt string, and delegates
        placeholder-aware tokenisation to :meth:`text2ids`. Newly processed
        items are written back to the cache when it is enabled.

        Raises:
            ValueError: if the tokenizer has no chat template.
        """
        images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self._extract_mm_items(request)
        if self.tokenizer.chat_template is None:
            raise ValueError("This model does not support chat template.")
        chat_template_kwargs = request.get("chat_template_kwargs", {})
        if self.cfg.chat_template_pass_request:
            # ernie: pass full request to apply_chat_template
            prompt = self.tokenizer.apply_chat_template(
                request,
                tokenize=False,
                add_generation_prompt=request.get("add_generation_prompt", True),
                **chat_template_kwargs,
            )
        else:
            # Other models template only the parsed message list.
            messages = parse_chat_messages(request.get("messages"))
            prompt = self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=request.get("add_generation_prompt", True),
                **chat_template_kwargs,
            )
        # NOTE(review): the rendered prompt is stored back on the request —
        # presumably consumed downstream; confirm against callers.
        request["prompt_tokens"] = prompt
        outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
        if self.enable_processor_cache:
            self._update_mm_cache(dealer, missing_idx, mm_items, outputs)
        return outputs
def _process_prompt_token_ids(self, request):
"""Handle the prompt_token_ids tokenisation path.
Mirrors ``request2ids`` in structure: Processor owns extract/cache,
Encoding only does pure encoding.
"""
prompt_token_ids = request.get("prompt_token_ids", [])
if not request.get("messages"):
return self.enc.prompt_token_ids2outputs(prompt_token_ids)
images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self._extract_mm_items(request)
outputs = self.enc.prompt_token_ids2outputs(prompt_token_ids, mm_items)
if self.enable_processor_cache:
self._update_mm_cache(dealer, missing_idx, mm_items, outputs)
return outputs
    def _update_mm_cache(self, dealer, missing_idx, mm_items, outputs):
        """Write newly-processed multimodal items to the processor cache.

        Items whose index appears in *missing_idx* were themselves fetched
        from the cache, so only the remaining (freshly processed) items are
        written back, keyed by ``outputs["mm_hashes"]``.

        NOTE(review): assumes ``outputs["grid_thw"]``, ``outputs["images"]``
        and ``outputs["mm_hashes"]`` are index-aligned with *mm_items* —
        confirm against the encoding strategies.

        Returns:
            The *outputs* dict, unchanged (both visible callers ignore it).
        """
        missing_idx_set = set(missing_idx)
        hashes_to_cache, items_to_cache = [], []
        for idx in range(len(mm_items)):
            if idx in missing_idx_set:
                continue
            meta = {}
            grid_thw = np.asarray(outputs["grid_thw"][idx])
            # grid_thw may be (1, 3) or (3,); either way take one (t, h, w).
            if grid_thw.ndim > 1:
                t, h, w = grid_thw[0]
            else:
                t, h, w = grid_thw
            meta["thw"] = (int(t), int(h), int(w))
            if "fps" in outputs:
                meta["fps"] = outputs["fps"][idx]
            hashes_to_cache.append(outputs["mm_hashes"][idx])
            items_to_cache.append((outputs["images"][idx], meta))
        if hashes_to_cache:
            self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
        return outputs
def _add_text(self, tokens, outputs):
"""Add text tokens to outputs, delegating position logic to enc."""
if not tokens:
return
if isinstance(tokens, str):
tokens_str = self.tokenizer.tokenize(tokens)
tokens = self.tokenizer.convert_tokens_to_ids(tokens_str)
num_tokens = len(tokens)
outputs["input_ids"].extend(tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
self.enc.add_text_positions(outputs, num_tokens)
def process_request_dict(self, request, max_model_len=None):
"""Process a request dictionary into model inputs.
Unified template-method flow for all VL model types. Per-model
differences are handled by small conditional branches rather than
duplicating the entire pipeline.
"""
"""Process a request dictionary into model inputs."""
cfg = self.cfg
request = self._apply_default_parameters(request)
if not request.get("eos_token_ids"):
request["eos_token_ids"] = self.eos_token_ids
self._process_stop_tokens(request)
if self.model_type != PADDLEOCR_VL:
self._process_bad_words(request)
if self.model_type == ERNIE4_5_VL:
logits_processors_args = self._prepare_think_stop_sentence(
request.get("logits_processors_args") or {}, max_model_len
)
request["logits_processors_args"] = logits_processors_args
outputs = self._tokenize_request(request)
self._process_post_tokens(request, outputs)
if self.model_type in (QWEN_VL, QWEN3_VL):
request["enable_thinking"] = False
outputs = self.pack_outputs(outputs)
if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"):
pass # preserve existing prompt_token_ids
else:
request["prompt_token_ids"] = outputs["input_ids"].tolist()
request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
request["multimodal_inputs"] = outputs
if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
if self.model_type == ERNIE4_5_VL:
logits_processors_args = self._update_thinking_prompt_state(
request["prompt_token_ids"], request.get("logits_processors_args") or {}
)
request["logits_processors_args"] = logits_processors_args
max_tokens = max_model_len - len(request["prompt_token_ids"])
if request.get("max_tokens") is None:
request["max_tokens"] = max(1, max_tokens)
else:
request["max_tokens"] = min(max_tokens, request["max_tokens"])
if self.model_type == ERNIE4_5_VL and request.get("reasoning_max_tokens") is None:
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL):
if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
request["top_p"] = _SAMPLING_EPS
request["top_k"] = 1
if self.model_type != QWEN3_VL and self.reasoning_parser:
self._apply_reasoning_parser(request)
if self.model_type == ERNIE4_5_VL:
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
data_processor_logger.info(f"Processed request {request}")
return request
def _process_stop_tokens(self, request):
"""Handle stop token processing based on model type."""
if self.model_type == QWEN3_VL:
# Stop tokens
if cfg.stop_tokens_variant == "qwen3":
stop_sequences = request.get("stop", [])
if stop_sequences:
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
@@ -328,34 +374,102 @@ class MultiModalProcessor(BaseTextProcessor):
else:
process_stop_token_ids(request, self.update_stop_seq)
def _process_bad_words(self, request):
"""Process bad_words into token ids."""
bad_words = request.get("bad_words")
bad_words_token_ids = request.get("bad_words_token_ids")
if bad_words:
bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
request["bad_words_token_ids"] = bad_words_token_ids
# Bad words
if cfg.has_bad_words:
bad_words = request.get("bad_words")
bad_words_token_ids = request.get("bad_words_token_ids")
if bad_words:
bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
request["bad_words_token_ids"] = bad_words_token_ids
# Logits processor (ernie think)
if cfg.has_logits_processor_think:
logits_processors_args = self._prepare_think_stop_sentence(
request.get("logits_processors_args") or {}, max_model_len
)
request["logits_processors_args"] = logits_processors_args
# Tokenize
outputs = self._tokenize_request(request)
# Post-token handling
self._process_post_tokens(request, outputs)
# Force disable thinking for qwen_vl / qwen3_vl
if cfg.force_disable_thinking:
request["enable_thinking"] = False
# Pack outputs
outputs = self.pack_outputs(outputs)
# Assign prompt_token_ids
if cfg.preserve_prompt_token_ids and request.get("prompt_token_ids"):
pass # preserve existing
else:
request["prompt_token_ids"] = outputs["input_ids"].tolist()
request["multimodal_inputs"] = outputs
# Truncation
if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
# Ernie: update thinking prompt state
if cfg.has_logits_processor_think:
logits_processors_args = self._update_thinking_prompt_state(
request["prompt_token_ids"],
request.get("logits_processors_args") or {},
)
request["logits_processors_args"] = logits_processors_args
# max_tokens
max_tokens = max_model_len - len(request["prompt_token_ids"])
if request.get("max_tokens") is None:
request["max_tokens"] = max(1, max_tokens)
else:
request["max_tokens"] = min(max_tokens, request["max_tokens"])
# Ernie: default reasoning_max_tokens
if cfg.set_default_reasoning_max_tokens and request.get("reasoning_max_tokens") is None:
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
# Clamp top_p
if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
request["top_p"] = _SAMPLING_EPS
request["top_k"] = 1
# Reasoning parser
if self.reasoning_parser:
self._apply_reasoning_parser(request)
# Ernie: cap response_max_tokens
if cfg.cap_response_max_tokens:
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
data_processor_logger.info(f"Processed request {request}")
return request
def _tokenize_request(self, request):
"""Core tokenization dispatch: prompt_token_ids > prompt > messages."""
default_thinking = True if self.model_type == ERNIE4_5_VL else False
cfg = self.cfg
default_thinking = cfg.default_thinking
if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL):
if request.get("prompt_token_ids") and cfg.supports_prompt_token_ids:
messages = request.get("messages")
if messages:
self._check_mm_limits(messages)
request.setdefault("enable_thinking", default_thinking)
return self.processor.prompt_token_ids2outputs(request)
return self._process_prompt_token_ids(request)
elif request.get("prompt"):
multimodal_data = request.get("multimodal_data") or {}
self._check_mm_limits(multimodal_data)
images = multimodal_data.get("image", None)
videos = multimodal_data.get("video", None)
if self.model_type == ERNIE4_5_VL:
request["prompt_tokens"] = request.get("prompt")
request["prompt_tokens"] = request.get("prompt")
request.setdefault("enable_thinking", default_thinking)
return self.processor.text2ids(request["prompt"], images, videos)
return self.text2ids(request["prompt"], images, videos)
elif request.get("messages"):
messages = request["messages"]
@@ -369,65 +483,22 @@ class MultiModalProcessor(BaseTextProcessor):
else:
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
request.setdefault("enable_thinking", default_thinking)
return self.processor.request2ids(request)
return self.request2ids(request)
else:
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
def _process_post_tokens(self, request, outputs):
"""Handle post-tokenization token appending."""
if self.model_type == PADDLEOCR_VL:
metadata = request.get("metadata")
if metadata and metadata.get("generated_token_ids"):
self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"])
else:
if request.get("completion_token_ids"):
self.append_completion_tokens(outputs, request["completion_token_ids"])
def _apply_reasoning_parser(self, request):
"""Apply reasoning parser and update model status dict."""
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
parts = request["request_id"].split("_")
if len(parts) > 1:
real_req_id = parts[0]
index = int(parts[1])
n = request.get("n", 1)
for idx in range(index * n, (index + 1) * n):
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
else:
self.model_status_dict[request["request_id"]] = model_status
request["enable_thinking"] = model_status == "think_start"
completion_token_ids = request.get("completion_token_ids") or request.get("generated_token_ids")
if completion_token_ids:
self.enc.append_completion_tokens(outputs, completion_token_ids)
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
"""Append completion tokens to existing multimodal outputs."""
if self.model_type == ERNIE4_5_VL:
self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids)
else:
self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids)
def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids):
"""Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl."""
num_tokens = len(completion_token_ids)
multimodal_inputs["input_ids"].extend(completion_token_ids)
multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
multimodal_inputs["position_ids"].append(pos_ids)
multimodal_inputs["cur_position"] += num_tokens
def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids):
"""Append completion tokens for ernie4_5_vl."""
num_tokens = len(completion_token_ids)
multimodal_inputs["input_ids"].extend(completion_token_ids)
multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
start = multimodal_inputs["cur_position"]
for i in range(num_tokens):
multimodal_inputs["position_ids"].append([start + i] * 3)
multimodal_inputs["cur_position"] += num_tokens
"""Append completion tokens — delegates to enc."""
self.enc.append_completion_tokens(multimodal_inputs, completion_token_ids)
def pack_outputs(self, outputs):
"""Convert intermediate processing outputs to final format."""
"""Convert intermediate outputs to final packed format."""
if not outputs["images"]:
outputs["images"] = None
outputs["grid_thw"] = None
@@ -439,15 +510,22 @@ class MultiModalProcessor(BaseTextProcessor):
outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
outputs["mm_num_token_func"] = self.processor.mm_num_tokens
outputs["mm_num_token_func"] = self.enc.mm_num_tokens
if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
outputs["image_patch_id"] = self.processor.image_token_id
outputs["video_patch_id"] = self.processor.video_token_id
outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
else:
outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
outputs["image_patch_id"] = self.image_patch_id
# Position IDs: delegate to encoding strategy
self.enc.pack_position_ids(outputs)
return outputs
    def get_processor_cache(self, socket, mm_hashes):
        """Fetch cached multimodal items for *mm_hashes* over a ZMQ socket.

        Sends a pickled request and blocks until the cache service replies.
        NOTE(review): ``pickle.loads`` on socket data is only acceptable
        because the peer is a local, trusted IPC endpoint — never reuse this
        with an untrusted transport.
        """
        req = pickle.dumps(mm_hashes)
        # Leading empty frame matches the framing the cache service expects
        # on its end of the DEALER socket — confirm against the server side.
        socket.send_multipart([b"", req])
        _, resp = socket.recv_multipart()
        mm_items = pickle.loads(resp)
        data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
        return mm_items
def update_processor_cache(self, socket, mm_hashes, mm_items):
req = pickle.dumps((mm_hashes, mm_items))
socket.send_multipart([b"", req])
data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
from fastdeploy.input.utils.video import read_video_decord
from fastdeploy.input.utils.video import sample_frames_paddleocr as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
+2 -2
View File
@@ -94,13 +94,13 @@ class InputPreprocessor:
tool_parser_obj=tool_parser_obj,
)
else:
from fastdeploy.input.multimodal_processor import (
from fastdeploy.input.mm_model_config import (
ERNIE4_5_VL,
PADDLEOCR_VL,
QWEN3_VL,
QWEN_VL,
MultiModalProcessor,
)
from fastdeploy.input.multimodal_processor import MultiModalProcessor
if ErnieArchitectures.contains_ernie_arch(architecture):
model_type = ERNIE4_5_VL
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
from fastdeploy.input.utils.video import read_video_decord
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
from fastdeploy.input.utils import IDS_TYPE_FLAG
from fastdeploy.input.video_utils import read_video_decord
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
from fastdeploy.input.utils.video import read_video_decord
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
from fastdeploy.multimodal.hasher import MultimodalHasher
from fastdeploy.utils import data_processor_logger
Binary file not shown.
+41
View File
@@ -0,0 +1,41 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility package for fastdeploy.input — re-exports from sub-modules."""
from fastdeploy.input.utils.common import (
IDS_TYPE_FLAG,
MAX_IMAGE_DIMENSION,
process_stop_token_ids,
validate_model_path,
)
from fastdeploy.input.utils.video import (
VideoReaderWrapper,
read_video_decord,
sample_frames,
sample_frames_paddleocr,
sample_frames_qwen,
)
__all__ = [
"IDS_TYPE_FLAG",
"MAX_IMAGE_DIMENSION",
"process_stop_token_ids",
"validate_model_path",
"VideoReaderWrapper",
"read_video_decord",
"sample_frames",
"sample_frames_paddleocr",
"sample_frames_qwen",
]
@@ -0,0 +1,94 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Render timestamps onto video frames."""
import os
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
def render_single_image_with_timestamp(image: Image, number: str, rate: float, font_path: str = FONT_PATH):
    """Draw *number* (a timestamp string) onto *image* in place.

    The font size is ``min(width, height) * rate``; the text is drawn at the
    top-left corner in black with a white outline whose width is 10% of the
    font size. Returns the same (mutated) image.
    """
    w, h = image.size
    size = int(min(w, h) * rate)
    stroke = int(size * 0.1)
    pen = ImageDraw.Draw(image)
    pen.text(
        (0, 0),
        number,
        font=ImageFont.truetype(font_path, size),
        fill=(0, 0, 0),
        stroke_width=stroke,
        stroke_fill=(255, 255, 255),
    )
    return image
def timestamp_converting(time_stamp_in_seconds):
    """Convert a timestamp from seconds to ``HH:MM:SS.ss`` format.

    Args:
        time_stamp_in_seconds (int | float): non-negative time in seconds.

    Returns:
        str: ``HH:MM:SS.ss`` with two-digit hours/minutes (hours are not
        capped at 24) and seconds with two decimal places.
    """
    # divmod replaces the original repeated-subtraction loops: O(1) instead
    # of O(hours + minutes), and no accumulated float error on large inputs.
    hours, remainder = divmod(time_stamp_in_seconds, 3600)
    mins, secs = divmod(remainder, 60)
    return f"{int(hours):02d}:{int(mins):02d}:{secs:05.02f}"
def get_timestamp_for_uniform_frame_extraction(num_frames, frame_id, duration):
    """Return the timestamp (in seconds) of frame *frame_id* when
    *num_frames* frames are extracted uniformly from a video of length
    *duration* seconds."""
    return frame_id * duration / num_frames
def render_frame_timestamp(frame, timestamp, font_rate=0.1):
    """Overlay a human-readable timestamp on a video frame.

    Args:
        frame: the video frame (PIL.Image).
        timestamp: time in seconds.
        font_rate: font size as a fraction of ``min(width, height)``.

    Returns:
        The frame with ``time: HH:MM:SS.ss`` rendered onto it.
    """
    label = "time: " + timestamp_converting(timestamp)
    return render_single_image_with_timestamp(frame, label, font_rate)
+470
View File
@@ -0,0 +1,470 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared video utilities: VideoReaderWrapper, read_video_decord, sample_frames, read_frames_decord."""
import datetime
import hashlib
import io
import math
import os
import random
import threading
import uuid
from tempfile import NamedTemporaryFile as ntf
from typing import Optional, Union
import numpy as np
from PIL import Image
from fastdeploy.input.image_processors.common import ceil_by_factor, floor_by_factor
from fastdeploy.utils import data_processor_logger
__all__ = [
"VideoReaderWrapper",
"read_video_decord",
"sample_frames",
"sample_frames_qwen",
"sample_frames_paddleocr",
"get_frame_indices",
"read_frames_decord",
"EXTRACTED_FRAME_DIR",
"get_filename",
]
# ---------------------------------------------------------------------------
# VideoReaderWrapper
# ---------------------------------------------------------------------------
def _is_gif(data: bytes) -> bool:
"""Check if bytes represent a GIF based on magic header."""
return data[:6] in (b"GIF87a", b"GIF89a")
class VideoReaderWrapper:
    """decord.VideoReader wrapper that fixes a memory leak and adds GIF support.

    GIF inputs (path, bytes, or BytesIO) are transcoded to a temporary MP4 via
    moviepy before being handed to decord. After every read the reader seeks
    back to frame 0, which works around a decord memory leak.
    Reference: https://github.com/dmlc/decord/issues/208
    """

    def __init__(self, video_path, *args, **kwargs):
        # Lazy imports: decord/moviepy are only needed when a reader is
        # actually constructed — presumably to keep them optional; confirm.
        import decord

        try:
            # moviepy 1.0
            import moviepy.editor as mp
        except Exception:
            # moviepy 2.0
            import moviepy as mp

        # The temp .gif file lives only for the duration of this block; the
        # transcoded .mp4 (if any) outlives it and is removed in __del__.
        with ntf(delete=True, suffix=".gif") as gif_file:
            gif_input = None
            self.original_file = None  # only set when we create a temp file
            # Detect GIF input in any of the three accepted forms.
            if isinstance(video_path, str):
                if video_path.lower().endswith(".gif"):
                    gif_input = video_path
            elif isinstance(video_path, bytes):
                if _is_gif(video_path):
                    gif_file.write(video_path)
                    gif_file.flush()
                    gif_input = gif_file.name
            elif isinstance(video_path, io.BytesIO):
                video_path.seek(0)
                tmp_bytes = video_path.read()
                video_path.seek(0)
                if _is_gif(tmp_bytes):
                    gif_file.write(tmp_bytes)
                    gif_file.flush()
                    gif_input = gif_file.name
            if gif_input is not None:
                # Transcode GIF -> MP4 so decord can read it.
                clip = mp.VideoFileClip(gif_input)
                mp4_file = ntf(delete=False, suffix=".mp4")
                mp4_path = mp4_file.name
                mp4_file.close()  # close before moviepy writes
                clip.write_videofile(mp4_path, verbose=False, logger=None)
                clip.close()
                video_path = mp4_path
                self.original_file = video_path  # temp mp4, cleaned up in __del__
            self._reader = decord.VideoReader(video_path, *args, **kwargs)
            self._reader.seek(0)

    def __len__(self):
        # Number of frames in the video.
        return len(self._reader)

    def __getitem__(self, key):
        frames = self._reader[key]
        # Seek back to the start after each read — the decord leak workaround.
        self._reader.seek(0)
        return frames

    def get_avg_fps(self):
        """Average frames-per-second as reported by decord."""
        return self._reader.get_avg_fps()

    def seek(self, pos):
        """Seek the underlying decord reader to frame *pos*."""
        return self._reader.seek(pos)

    def __del__(self):
        # Best-effort removal of the temporary mp4, if we created one.
        # getattr guards against __init__ having failed before the attribute
        # was set.
        original_file = getattr(self, "original_file", None)
        if original_file:
            try:
                os.remove(original_file)
            except OSError:
                pass
# ---------------------------------------------------------------------------
# read_video_decord
# ---------------------------------------------------------------------------
def read_video_decord(video_path, save_to_disk: bool = False):
    """Open *video_path* and return ``(video_reader, video_meta, video_path)``.

    ``video_meta`` carries the keys ``"fps"``, ``"duration"`` (seconds) and
    ``"num_of_frame"``. Accepts an already-constructed VideoReaderWrapper,
    raw bytes, or anything decord can open (path / file-like object).
    """
    if isinstance(video_path, VideoReaderWrapper):
        reader = video_path
    else:
        if isinstance(video_path, bytes):
            video_path = io.BytesIO(video_path)
        reader = VideoReaderWrapper(video_path, num_threads=1)
    frame_count = len(reader)
    avg_fps = reader.get_avg_fps()
    meta = {
        "fps": avg_fps,
        "duration": frame_count / float(avg_fps),
        "num_of_frame": frame_count,
    }
    return reader, meta, video_path
# ---------------------------------------------------------------------------
# sample_frames — qwen_vl variant
# ---------------------------------------------------------------------------
def sample_frames_qwen(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
) -> np.ndarray:
    """Sample frame indices — qwen_vl variant.

    Sentinel defaults are -1; ``None`` is also accepted and treated as
    "unset" (matching :func:`sample_frames_paddleocr`). Applies
    ``ceil_by_factor`` on ``min_frames`` and ensures ``num_frames`` is
    divisible by 4.

    Args:
        frame_factor: the sampled frame count is snapped to a multiple of this.
        min_frames: lower bound on the number of sampled frames.
        max_frames: upper bound on the number of sampled frames.
        metadata: required; must contain ``"num_of_frame"`` and ``"fps"``.
        fps: desired sampling rate; mutually exclusive with ``num_frames``.
        num_frames: desired frame count; mutually exclusive with ``fps``.

    Returns:
        np.ndarray: int32 frame indices into the source video.

    Raises:
        ValueError: when both ``fps`` and ``num_frames`` are set, when
            ``metadata`` is missing, or when the inferred frame count exceeds
            the video length.
    """
    # Fix: the signature advertises Optional, but `None > 0` raises TypeError
    # on Python 3. Normalise None sentinels to -1 so callers may pass either,
    # consistent with sample_frames_paddleocr's `fps or 0` handling.
    fps = -1 if fps is None else fps
    num_frames = -1 if num_frames is None else num_frames
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    if metadata is None:
        raise ValueError("metadata is required for sample_frames_qwen")
    total_num_frames = metadata["num_of_frame"]
    if num_frames > 0:
        # Snap the requested count to the nearest multiple of frame_factor.
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)
        num_frames = total_num_frames / metadata["fps"] * fps
        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)
    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )
    # num_frames must be divisible by 4
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(min(max(num_frames, min_frames), max_frames), total_num_frames)
    if num_frames > 0:
        indices = np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    else:
        # No constraint given: keep every frame.
        indices = np.arange(0, total_num_frames).astype(np.int32)
    return indices
# ---------------------------------------------------------------------------
# sample_frames — paddleocr_vl / ernie4_5_vl variant
# ---------------------------------------------------------------------------
def sample_frames_paddleocr(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
) -> np.ndarray:
    """Sample frame indices — paddleocr_vl / ernie4_5_vl variant.

    Sentinel defaults are ``None``. Uses plain ``math.floor`` snapping and,
    unlike the qwen variant, applies no divisible-by-4 correction.
    """
    requested_fps = fps or 0
    requested_frames = num_frames or 0
    if requested_fps > 0 and requested_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
    if metadata is None:
        raise ValueError("metadata is required for sample_frames_paddleocr")
    total = metadata["num_of_frame"]
    if requested_frames > 0:
        # Snap the requested count to the nearest multiple of frame_factor.
        requested_frames = round(requested_frames / frame_factor) * frame_factor
    elif requested_fps > 0:
        # Derive a frame count from the requested sampling rate, clamped to
        # [min_frames, max_frames] and the video length.
        max_frames = math.floor(min(max_frames, total) / frame_factor) * frame_factor
        requested_frames = total / metadata["fps"] * requested_fps
        requested_frames = min(max(requested_frames, min_frames), max_frames, total)
        requested_frames = math.floor(requested_frames / frame_factor) * frame_factor
    if requested_frames > total:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={requested_frames}` exceeds "
            f"`total_num_frames={total}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )
    if requested_frames > 0:
        return np.arange(0, total, total / requested_frames).astype(np.int32)
    # No constraint given: keep every frame.
    return np.arange(0, total).astype(np.int32)
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
    variant: str = "paddleocr",
) -> np.ndarray:
    """Dispatch frame-index sampling to the qwen or paddleocr implementation."""
    if variant == "paddleocr":
        return sample_frames_paddleocr(frame_factor, min_frames, max_frames, metadata, fps, num_frames)
    if variant == "qwen":
        # The qwen variant uses -1 as its "unset" sentinel instead of None.
        qwen_fps = fps if fps is not None else -1
        qwen_num_frames = num_frames if num_frames is not None else -1
        return sample_frames_qwen(frame_factor, min_frames, max_frames, metadata, qwen_fps, qwen_num_frames)
    raise ValueError(f"Unknown variant {variant!r}. Expected 'paddleocr' or 'qwen'.")
# ---------------------------------------------------------------------------
# IO helpers (migrated from ernie4_5_vl_processor/utils/io_utils.py)
# ---------------------------------------------------------------------------
EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
def get_filename(url=None):
    """Generate a unique filename, optionally based on a URL hash.

    With no ``url`` a random 32-character hex string (UUID4) is returned.
    Otherwise the name combines today's date, the process id, the thread id
    and the MD5 digest of ``url``, so repeated calls for the same URL in the
    same process/thread on the same day produce the same name.
    """
    if url is None:
        return uuid.uuid4().hex
    raw = url if isinstance(url, bytes) else url.encode("utf-8")
    # MD5 is used only as a cheap, stable fingerprint of the URL, not for security.
    digest = hashlib.md5(raw).hexdigest()
    today = datetime.datetime.now()
    return f"{today.year}-{today.month:02d}-{today.day:02d}-{os.getpid()}-{threading.get_ident()}-{digest}"
# ---------------------------------------------------------------------------
# get_frame_indices / read_frames_decord
# (migrated from ernie4_5_vl_processor/process_video.py)
# ---------------------------------------------------------------------------
def get_frame_indices(
    vlen,
    target_frames=-1,
    target_fps=-1,
    frames_sample="middle",
    fix_start=None,
    input_fps=-1,
):
    """Get frame indices for sampling from a video.

    Exactly one of ``target_frames`` (fixed count) or ``target_fps`` (fixed
    rate, requires ``input_fps``) must be positive. ``frames_sample`` picks
    where inside each interval the frame is taken: "rand", "middle" or
    "leading"; ``fix_start`` overrides it with a fixed per-interval offset.
    """
    assert frames_sample in ["rand", "middle", "leading"]
    if target_frames > 0:
        assert target_fps <= 0, "target_fps must be negative if target_frames is given."
        if target_frames > vlen:
            acc_samples = vlen
            data_processor_logger.info(
                f"target_frames={target_frames} is larger than video length {vlen}, "
                f"will sample {acc_samples} frames."
            )
        else:
            acc_samples = target_frames
        data_processor_logger.debug(f"sampling at target_frames={target_frames}, frames_sample={frames_sample}")
        # Split [0, vlen) into acc_samples contiguous, near-equal inclusive intervals.
        bounds = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = [(lo, hi - 1) for lo, hi in zip(bounds[:-1], bounds[1:])]
        if frames_sample == "rand":
            try:
                frame_indices = [random.choice(range(lo, hi)) for lo, hi in ranges]
            except Exception:
                # An interval can be empty when acc_samples is close to vlen;
                # fall back to a sorted random permutation of frame indices.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [lo + fix_start for lo, _ in ranges]
        elif frames_sample == "leading":
            frame_indices = [lo for lo, _ in ranges]
        elif frames_sample == "middle":
            frame_indices = [(lo + hi) // 2 for lo, hi in ranges]
        else:
            raise NotImplementedError
    elif target_fps > 0:
        assert target_frames <= 0, "target_frames must be negative if target_fps is given."
        assert input_fps > 0, "input_fps must be provided if target_fps is given."
        data_processor_logger.info(f"sampling at fps={target_fps}, frames_sample={frames_sample}")
        duration = float(vlen) / input_fps
        delta = 1 / target_fps
        if frames_sample == "middle":
            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        elif frames_sample == "leading":
            frame_seconds = np.arange(0, duration, delta)
        if frames_sample == "rand":
            # Jitter each midpoint by up to +/- half an interval.
            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
            rand_offset = np.random.rand(*(frame_seconds.shape)) - 0.5
            frame_seconds += rand_offset * delta
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
    else:
        raise ValueError("Must provide either positive target_fps or positive target_frames.")
    return frame_indices
def read_frames_decord(
    video_path,
    video_reader,
    video_meta,
    target_frames=-1,
    target_fps=-1,
    frames_sample="middle",
    fix_start=None,
    save_to_disk=False,
    cache_dir=None,
    frame_indices=None,
    tol=10,
):
    """Read frames from a video using decord, with retry logic for corrupt frames.

    Args:
        video_path: Source video path. NOTE(review): not referenced in the
            body — presumably kept for API compatibility; confirm with callers.
        video_reader: Indexable decord-style reader whose frames expose
            ``.asnumpy()``.
        video_meta: Dict with at least ``num_of_frame``, ``fps`` and ``duration``.
        target_frames: Frame count to sample (only used when ``frame_indices``
            is None; mutually exclusive with ``target_fps``).
        target_fps: Sampling rate in frames/second (see ``get_frame_indices``).
        frames_sample: One of "rand", "middle", "leading".
        fix_start: Fixed offset within each sampling interval.
        save_to_disk: If True, write each frame as a PNG under ``cache_dir``
            and return file paths instead of PIL images.
        cache_dir: Directory for saved frames; defaults to EXTRACTED_FRAME_DIR.
        frame_indices: Pre-computed indices; skips get_frame_indices when given.
        tol: Max neighbor distance to try on each side when a frame fails to
            decode (doubled for the first/last frame).

    Returns:
        Tuple of ``(frames, frame_indices, time_stamps)``: frames is a list of
        PIL images (or file paths when ``save_to_disk``); ``frame_indices``
        may have been patched in place to the substituted neighbor indices;
        ``time_stamps`` are frame times in seconds derived from ``video_meta``.
    """
    if cache_dir is None:
        cache_dir = EXTRACTED_FRAME_DIR
    if frame_indices is None:
        frame_indices = get_frame_indices(
            video_meta["num_of_frame"],
            target_frames=target_frames,
            target_fps=target_fps,
            frames_sample=frames_sample,
            fix_start=fix_start,
            input_fps=video_meta["fps"],
        )
    frames = []
    for frame_indice_index in range(0, len(frame_indices)):
        frame_indice = frame_indices[frame_indice_index]
        try:
            frames.append(video_reader[frame_indice].asnumpy())
        except Exception as e:
            # Decoding failed: search alternately backwards/forwards for the
            # nearest readable neighbor frame and substitute it.
            data_processor_logger.debug(f"encounter error when get frame: {frame_indice}, error: {e}")
            previous_counter = 1
            later_counter = 1
            previous_after_flag = True  # True: try an earlier frame next; False: a later one
            if frame_indice == 0 or frame_indice == len(video_reader) - 1:
                # Edge frames can only be replaced from one side, so allow
                # twice the normal search distance.
                cur_tol = tol * 2
            else:
                cur_tol = tol
            while previous_counter < cur_tol or later_counter < cur_tol:
                if previous_after_flag:
                    if frame_indice - previous_counter < 0:
                        # Ran off the start of the video; switch direction.
                        previous_counter += 1
                        previous_after_flag = not previous_after_flag
                        continue
                    try:
                        frames.append(video_reader[frame_indice - previous_counter].asnumpy())
                        data_processor_logger.info(
                            f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame"
                        )
                        # Record the index actually used so time_stamps stay consistent.
                        frame_indices[frame_indice_index] = frame_indice - previous_counter
                        break
                    except Exception as e:
                        previous_counter += 1
                        data_processor_logger.info(f"error: {e}")
                else:
                    if frame_indice + later_counter >= len(video_reader):
                        # Ran off the end of the video; switch direction.
                        later_counter += 1
                        previous_after_flag = not previous_after_flag
                        continue
                    try:
                        frames.append(video_reader[frame_indice + later_counter].asnumpy())
                        data_processor_logger.info(
                            f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame"
                        )
                        frame_indices[frame_indice_index] = frame_indice + later_counter
                        break
                    except Exception:
                        later_counter += 1
                # Alternate the search direction after every attempt.
                previous_after_flag = not previous_after_flag
    frames = np.stack(frames, axis=0)
    assert len(frames) == len(frame_indices), f"len(frames): {len(frames)} != len(frame_indices): {len(frame_indices)}"
    ret = []
    # Random directory name for this batch of frames (uuid-based despite the name).
    url_sha1 = get_filename()
    for idx, frame in enumerate(frames):
        tmp = Image.fromarray(frame, "RGB")
        if save_to_disk:
            save_path = os.path.join(cache_dir, f"{url_sha1}", f"{idx}.png")
            if not os.path.exists(os.path.dirname(save_path)):
                os.makedirs(os.path.dirname(save_path))
            tmp.save(save_path)
            tmp = save_path
        ret.append(tmp)
    time_stamps = [frame_idx * video_meta["duration"] / video_meta["num_of_frame"] for frame_idx in frame_indices]
    return ret, frame_indices, time_stamps
+2 -7
View File
@@ -85,7 +85,7 @@ import zmq
from fastdeploy import envs
from fastdeploy.engine.tasks import PoolingTask
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
from fastdeploy.logger.deterministic_logger import DeterministicLogger
from fastdeploy.model_executor.forward_meta import ForwardMeta
@@ -2867,12 +2867,7 @@ class GPUModelRunner(ModelRunnerBase):
return
def _init_image_preprocess(self) -> None:
processor = DataProcessor(
tokenizer_name=self.model_config.model,
image_preprocessor_name=str(self.model_config.model),
)
processor.eval()
image_preprocess = processor.image_preprocessor
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
[1, 3, 1, 1]
)
+2 -7
View File
@@ -33,7 +33,7 @@ from fastdeploy.config import FDConfig
from fastdeploy.engine.pooling_params import PoolingParams
from fastdeploy.engine.request import ImagePosition, Request, RequestType
from fastdeploy.engine.tasks import PoolingTask
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.utils import (
@@ -2566,12 +2566,7 @@ class MetaxModelRunner(ModelRunnerBase):
return
def _init_image_preprocess(self) -> None:
processor = DataProcessor(
tokenizer_name=self.model_config.model,
image_preprocessor_name=str(self.model_config.model),
)
processor.eval()
image_preprocess = processor.image_preprocessor
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
[1, 3, 1, 1]
)
+2 -7
View File
@@ -31,7 +31,7 @@ from paddle import nn
from fastdeploy import envs
from fastdeploy.config import FDConfig
from fastdeploy.engine.request import ImagePosition, Request, RequestType
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.utils import (
@@ -1842,12 +1842,7 @@ class XPUModelRunner(ModelRunnerBase):
self.forward_meta.clear_caches()
def _init_image_preprocess(self) -> None:
processor = DataProcessor(
tokenizer_name=self.model_config.model,
image_preprocessor_name=str(self.model_config.model),
)
processor.eval()
image_preprocess = processor.image_preprocessor
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
[1, 3, 1, 1]
)
+1
View File
@@ -303,6 +303,7 @@ setup(
"model_executor/models/*",
"model_executor/layers/*",
"input/ernie4_5_vl_processor/utils/*",
"input/utils/Roboto-Regular.ttf",
"model_executor/ops/gcu/*",
"model_executor/ops/gcu/fastdeploy_ops/*",
"cache_manager/transfer_factory/get_rdma_nics.sh",
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+1 -1
View File
@@ -30,7 +30,7 @@ from fastdeploy.input.paddleocr_vl_processor.paddleocr_vl_processor import (
PaddleOCRVLProcessor,
)
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
from fastdeploy.input.utils.video import sample_frames_paddleocr as sample_frames
MODULE_PATH = "fastdeploy.input.paddleocr_vl_processor.process"
+2 -2
View File
@@ -24,8 +24,8 @@ from unittest.mock import patch
import numpy as np
from PIL import Image as PILImage
import fastdeploy.input.ernie4_5_vl_processor.process_video as process_video_module
from fastdeploy.input.ernie4_5_vl_processor.process_video import (
import fastdeploy.input.utils.video as process_video_module
from fastdeploy.input.utils.video import (
get_frame_indices,
read_frames_decord,
read_video_decord,
+1 -1
View File
@@ -21,7 +21,7 @@ import numpy as np
from PIL import Image
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
def mock_pil_image(height, width):
+5 -5
View File
@@ -41,16 +41,16 @@ class TestValidateModelPath(unittest.TestCase):
def _patch_console_logger(self):
"""Patch console_logger.warning to capture warnings."""
import fastdeploy.input.utils as utils_mod
import fastdeploy.input.utils.common as common_mod
self._orig_warning = utils_mod.console_logger.warning
utils_mod.console_logger.warning = self._capture_warning
self._orig_warning = common_mod.console_logger.warning
common_mod.console_logger.warning = self._capture_warning
def _unpatch_console_logger(self):
import fastdeploy.input.utils as utils_mod
import fastdeploy.input.utils.common as common_mod
if self._orig_warning is not None:
utils_mod.console_logger.warning = self._orig_warning
common_mod.console_logger.warning = self._orig_warning
def tearDown(self):
self._unpatch_console_logger()
+16 -16
View File
@@ -18,7 +18,7 @@ from unittest.mock import MagicMock, patch
import numpy as np
from fastdeploy.input.video_utils import (
from fastdeploy.input.utils.video import (
_is_gif,
read_video_decord,
sample_frames,
@@ -74,7 +74,7 @@ class TestIsGif(unittest.TestCase):
class TestVideoReaderWrapper(unittest.TestCase):
def _make_wrapper(self, video_path, mock_reader=None):
"""Construct a VideoReaderWrapper with decord mocked out."""
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
if mock_reader is None:
mock_reader = _make_mock_reader()
@@ -112,7 +112,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
def test_del_no_original_file(self):
"""__del__ should be a no-op when original_file is None."""
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
wrapper = object.__new__(VideoReaderWrapper)
wrapper.original_file = None
@@ -125,7 +125,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
import os
import tempfile
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
with tempfile.NamedTemporaryFile(delete=False) as f:
tmp_path = f.name
@@ -138,7 +138,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
def test_non_gif_string_path_does_not_set_original_file(self):
"""Passing a non-GIF string path must NOT set original_file (bug fix)."""
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
mock_reader = _make_mock_reader()
mock_decord = MagicMock()
@@ -151,7 +151,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
def test_bytesio_non_gif_path_does_not_set_original_file(self):
"""Passing a BytesIO that is NOT a GIF must not set original_file."""
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
mock_reader = _make_mock_reader()
mock_decord = MagicMock()
@@ -172,16 +172,16 @@ class TestVideoReaderWrapper(unittest.TestCase):
class TestReadVideoDecord(unittest.TestCase):
def _patch_wrapper(self, num_frames=100, fps=25.0):
"""Return a context manager that replaces VideoReaderWrapper with a mock."""
from fastdeploy.input import video_utils
from fastdeploy.input.utils import video
mock_wrapper = MagicMock()
mock_wrapper.__len__ = MagicMock(return_value=num_frames)
mock_wrapper.get_avg_fps = MagicMock(return_value=fps)
return patch.object(video_utils, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper
return patch.object(video, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper
def test_existing_wrapper_passthrough(self):
"""Already-wrapped reader is returned as-is."""
from fastdeploy.input.video_utils import VideoReaderWrapper
from fastdeploy.input.utils.video import VideoReaderWrapper
mock_wrapper = MagicMock(spec=VideoReaderWrapper)
mock_wrapper.__len__ = MagicMock(return_value=50)
@@ -196,7 +196,7 @@ class TestReadVideoDecord(unittest.TestCase):
def test_bytes_input_converted_to_bytesio(self):
"""bytes input is converted to BytesIO before creating VideoReaderWrapper."""
from fastdeploy.input import video_utils
from fastdeploy.input.utils import video
captured = []
@@ -210,14 +210,14 @@ class TestReadVideoDecord(unittest.TestCase):
def get_avg_fps(self):
return 10.0
with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
with patch.object(video, "VideoReaderWrapper", FakeWrapper):
reader, meta, path = read_video_decord(b"fake_video_bytes")
self.assertIsInstance(captured[0], io.BytesIO)
def test_string_path_input(self):
"""String path is passed through to VideoReaderWrapper."""
from fastdeploy.input import video_utils
from fastdeploy.input.utils import video
class FakeWrapper:
def __init__(self, path, *args, **kwargs):
@@ -229,7 +229,7 @@ class TestReadVideoDecord(unittest.TestCase):
def get_avg_fps(self):
return 30.0
with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
with patch.object(video, "VideoReaderWrapper", FakeWrapper):
reader, meta, path = read_video_decord("/fake/path.mp4")
self.assertEqual(meta["num_of_frame"], 60)
@@ -333,18 +333,18 @@ class TestSampleFramesDispatcher(unittest.TestCase):
META = {"num_of_frame": 100, "fps": 25.0}
def test_default_variant_is_paddleocr(self):
with patch("fastdeploy.input.video_utils.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
with patch("fastdeploy.input.utils.video.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
sample_frames(1, 4, 100, self.META, num_frames=8)
mock_fn.assert_called_once()
def test_qwen_variant_dispatched(self):
with patch("fastdeploy.input.video_utils.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
with patch("fastdeploy.input.utils.video.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
mock_fn.assert_called_once()
def test_qwen_none_fps_converted_to_sentinel(self):
"""None fps/num_frames → converted to -1 before calling sample_frames_qwen."""
with patch("fastdeploy.input.video_utils.sample_frames_qwen", return_value=np.array([])) as mock_fn:
with patch("fastdeploy.input.utils.video.sample_frames_qwen", return_value=np.array([])) as mock_fn:
sample_frames(2, 4, 100, self.META, fps=None, num_frames=None, variant="qwen")
args = mock_fn.call_args[0]
self.assertEqual(args[4], -1) # fps sentinel