mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[DataProcessor] Refactor multimodal processor: extract encoding strategies and unify MM processing pipeline (#7298)
* merge mm processor
This commit is contained in:
@@ -435,17 +435,7 @@ class BaseTextProcessor(ABC):
|
|||||||
request["top_k"] = 1
|
request["top_k"] = 1
|
||||||
|
|
||||||
if self.reasoning_parser:
|
if self.reasoning_parser:
|
||||||
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
|
self._apply_reasoning_parser(request)
|
||||||
parts = request["request_id"].split("_")
|
|
||||||
if len(parts) > 1:
|
|
||||||
real_req_id = parts[0]
|
|
||||||
index = int(parts[1])
|
|
||||||
n = request.get("n", 1)
|
|
||||||
for idx in range(index * n, (index + 1) * n):
|
|
||||||
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
|
|
||||||
else:
|
|
||||||
self.model_status_dict[request["request_id"]] = model_status
|
|
||||||
request["enable_thinking"] = model_status == "think_start"
|
|
||||||
|
|
||||||
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
||||||
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
||||||
@@ -453,6 +443,20 @@ class BaseTextProcessor(ABC):
|
|||||||
data_processor_logger.info(f"Processed request dict: {request}")
|
data_processor_logger.info(f"Processed request dict: {request}")
|
||||||
return request
|
return request
|
||||||
|
|
||||||
|
def _apply_reasoning_parser(self, request):
    """Resolve the model's thinking status for *request* and record it.

    Looks up the status via the reasoning parser, fans it out to every
    sub-request id when the request id carries an ``_<index>`` suffix
    (one bookkeeping entry per sampled candidate), and sets the request's
    ``enable_thinking`` flag accordingly.
    """
    status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
    req_id = request["request_id"]
    # NOTE(review): assumes the first "_"-separated field is the base id
    # and the second is an integer batch index — confirm id format.
    pieces = req_id.split("_")
    if len(pieces) <= 1:
        # Plain request id: a single status entry is enough.
        self.model_status_dict[req_id] = status
    else:
        base_id, index = pieces[0], int(pieces[1])
        num_candidates = request.get("n", 1)
        first = index * num_candidates
        for offset in range(num_candidates):
            self.model_status_dict[f"{base_id}_{first + offset}"] = status
    request["enable_thinking"] = status == "think_start"
|
||||||
|
|
||||||
def clear_request_status(self, task_id):
|
def clear_request_status(self, task_id):
|
||||||
"""Clear all per-request decode state and return the accumulated text."""
|
"""Clear all per-request decode state and return the accumulated text."""
|
||||||
results_all = ""
|
results_all = ""
|
||||||
|
|||||||
@@ -0,0 +1,23 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Multimodal encoding strategies for VL model families."""
|
||||||
|
|
||||||
|
from fastdeploy.input.encodings.base_encoding import BaseEncoding
|
||||||
|
from fastdeploy.input.encodings.ernie_encoding import ErnieEncoding
|
||||||
|
from fastdeploy.input.encodings.paddleocr_encoding import PaddleOCREncoding
|
||||||
|
from fastdeploy.input.encodings.qwen_encoding import QwenEncoding
|
||||||
|
from fastdeploy.input.encodings.registry import EncodingRegistry
|
||||||
|
|
||||||
|
__all__ = ["BaseEncoding", "EncodingRegistry", "ErnieEncoding", "PaddleOCREncoding", "QwenEncoding"]
|
||||||
@@ -0,0 +1,189 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Abstract base class for multimodal encoding strategies.
|
||||||
|
|
||||||
|
Each encoding strategy handles model-family-specific logic such as
|
||||||
|
position ID computation, image/video preprocessing, and token counting.
|
||||||
|
New model families should subclass ``BaseEncoding`` and implement all
|
||||||
|
abstract methods.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import Any, Dict, Optional, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
class BaseEncoding(ABC):
    """Contract that every multimodal encoding strategy must fulfil.

    The abstract methods cover the core encoding pipeline (image/video
    ingestion, text positions, token counting, packing). ``init_extra``
    and ``get_mm_max_tokens_per_item`` are optional hooks with no-op
    defaults so subclasses only override what they actually need.
    """

    def __init__(self, processor, processor_kwargs=None):
        """Wire shared objects and common parameters from *processor*.

        Parameters
        ----------
        processor : object
            Owning processor; must expose ``cfg``, ``image_processor``
            and ``tokenizer``.
        processor_kwargs : dict | None
            Optional per-deployment overrides (conv sizes, video sampling).
        """
        kwargs = {} if processor_kwargs is None else processor_kwargs
        cfg = processor.cfg

        # Shared objects created by the processor, reused by the encoding.
        self.cfg = cfg
        self.image_processor = processor.image_processor
        self.tokenizer = processor.tokenizer

        # Convolution params: either from kwargs or from the image processor.
        if cfg.conv_params_from_kwargs:
            self.spatial_conv_size = kwargs.get("spatial_conv_size", 2)
            self.temporal_conv_size = kwargs.get("temporal_conv_size", 2)
        else:
            self.spatial_conv_size = self.image_processor.merge_size
            self.temporal_conv_size = self.image_processor.temporal_patch_size

        # Placeholder token ids for image/video content.
        self.image_token_id = self.tokenizer.convert_tokens_to_ids(cfg.image_token_str)
        self.video_token_id = self.tokenizer.convert_tokens_to_ids(cfg.video_token_str)

        if cfg.has_tokens_per_second:
            vision_config = getattr(getattr(processor, "config", None), "vision_config", None)
            self.tokens_per_second = getattr(vision_config, "tokens_per_second", 2)
        else:
            self.tokens_per_second = 2

        # Video frame-sampling defaults (overridable per item).
        self.fps = kwargs.get("video_fps", cfg.default_fps)
        self.min_frames = kwargs.get("video_min_frames", cfg.default_min_frames)
        self.max_frames = kwargs.get("video_max_frames", cfg.default_max_frames)
        self.target_frames = kwargs.get("video_target_frames", cfg.default_target_frames)

        # Model-family-specific extra initialisation.
        self.init_extra(kwargs)

    # ------------------------------------------------------------------
    # Image
    # ------------------------------------------------------------------
    @abstractmethod
    def add_image(self, img, outputs: dict, uuid, token_len=None):
        """Preprocess a raw image and append its tokens/features to *outputs*."""

    @abstractmethod
    def add_processed_image(self, img_cache, outputs: dict, uuid, token_len=None):
        """Append an already-preprocessed (cached) image to *outputs*."""

    # ------------------------------------------------------------------
    # Video
    # ------------------------------------------------------------------
    @abstractmethod
    def add_video(self, frames, outputs: dict, uuid, token_len=None, meta: Optional[dict] = None):
        """Preprocess decoded video *frames* and append them to *outputs*.

        ``token_len`` is the expected token count, used to validate
        pre-tokenised prompts. ``meta`` carries video metadata (fps,
        duration, ...); strategies that don't need it ignore the argument.
        """

    @abstractmethod
    def add_processed_video(self, frames_cache, outputs: dict, uuid, token_len=None):
        """Append an already-preprocessed (cached) video to *outputs*."""

    @abstractmethod
    def load_video(self, url, item: dict) -> Tuple[Any, dict]:
        """Decode the video at *url* and return a ``(frames, meta)`` pair.

        Every implementation returns a 2-tuple so callers can unpack
        uniformly.
        """

    # ------------------------------------------------------------------
    # Text / position helpers
    # ------------------------------------------------------------------
    @abstractmethod
    def add_text_positions(self, outputs: dict, num_tokens: int):
        """Append position ids for ``num_tokens`` text tokens to *outputs*."""

    @abstractmethod
    def append_completion_tokens(self, multimodal_inputs: dict, completion_token_ids):
        """Append completion token ids (and positions) to *multimodal_inputs*."""

    # ------------------------------------------------------------------
    # Prompt-token-ids path (optional — only strategies advertising
    # supports_prompt_token_ids=True implement this)
    # ------------------------------------------------------------------
    def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None) -> dict:
        """Build an outputs dict from pre-tokenised ``prompt_token_ids``.

        ``mm_items`` is the list of extracted multimodal items (each with
        'type', 'data', 'uuid'); ``None`` means text-only.
        """
        raise NotImplementedError(f"{type(self).__name__} does not support prompt_token_ids path")

    # ------------------------------------------------------------------
    # Token counting & packing
    # ------------------------------------------------------------------
    @staticmethod
    @abstractmethod
    def mm_num_tokens(grid_thw):
        """Number of multimodal tokens represented by *grid_thw*."""

    @abstractmethod
    def pack_position_ids(self, outputs: dict):
        """Convert intermediate position-id lists into the final packed form."""

    # ------------------------------------------------------------------
    # Outputs initialisation
    # ------------------------------------------------------------------
    def _make_outputs(self) -> dict:
        """Fresh accumulator dict for one encoding run.

        Subclasses override to add model-specific fields (e.g. fps).
        Field order is preserved for any caller that iterates the dict.
        """
        fields = (
            ("input_ids", list),
            ("token_type_ids", list),
            ("position_ids", list),
            ("images", list),
            ("grid_thw", list),
            ("image_type_ids", list),
            ("labels", list),
            ("cur_position", int),
            ("video_cnt", int),
            ("num_input_image_tokens", int),
            ("num_input_video_tokens", int),
            ("mm_positions", list),
            ("mm_hashes", list),
        )
        return {name: factory() for name, factory in fields}

    # ------------------------------------------------------------------
    # Optional hooks — subclasses override only when needed
    # ------------------------------------------------------------------
    def init_extra(self, processor_kwargs: dict):
        """Model-specific extra initialisation; the default is a no-op."""

    def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Dict[str, int]]:
        """Per-modality max token counts for the scheduler; ``None`` when N/A."""
        return None
|
||||||
@@ -0,0 +1,424 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Ernie4.5-VL encoding strategy for MultiModalProcessor."""
|
||||||
|
|
||||||
|
import copy
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
from paddleformers.transformers.image_utils import ChannelDimension
|
||||||
|
|
||||||
|
from fastdeploy.engine.request import ImagePosition
|
||||||
|
from fastdeploy.input.encodings.base_encoding import BaseEncoding
|
||||||
|
from fastdeploy.input.encodings.registry import EncodingRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import ERNIE4_5_VL
|
||||||
|
from fastdeploy.input.utils import IDS_TYPE_FLAG, MAX_IMAGE_DIMENSION
|
||||||
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
|
|
||||||
|
|
||||||
|
@EncodingRegistry.register(ERNIE4_5_VL)
|
||||||
|
class ErnieEncoding(BaseEncoding):
|
||||||
|
"""Encoding strategy for Ernie4.5-VL models."""
|
||||||
|
|
||||||
|
# Boundary token constants
|
||||||
|
IMG_START = "<|IMAGE_START|>"
|
||||||
|
IMG_END = "<|IMAGE_END|>"
|
||||||
|
VID_START = "<|VIDEO_START|>"
|
||||||
|
VID_END = "<|VIDEO_END|>"
|
||||||
|
|
||||||
|
def init_extra(self, processor_kwargs):
    """Ernie-specific setup: pixel budgets, frame sampling, token-type map."""
    pixel_defaults = (
        ("image_min_pixels", 4 * 28 * 28),
        ("image_max_pixels", 6177 * 28 * 28),
        ("video_min_pixels", 299 * 28 * 28),
        ("video_max_pixels", 1196 * 28 * 28),
    )
    for key, fallback in pixel_defaults:
        setattr(self, key, processor_kwargs.get(key, fallback))
    self.frames_sample = processor_kwargs.get("video_frames_sample", self.cfg.default_frames_sample)

    # Lookup table mapping ernie boundary tokens to their type flags.
    self.token_type_mapping = self._build_token_type_mapping()
|
||||||
|
|
||||||
|
def _build_token_type_mapping(self):
    """Map boundary tokens and the image placeholder id to the image flag.

    Unknown tokens default to the text flag.
    NOTE(review): keys mix token *strings* (boundary markers) and the
    integer placeholder id — confirm callers look up both forms.
    """
    image_flag = IDS_TYPE_FLAG["image"]
    mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"])
    boundary_tokens = (self.IMG_START, self.IMG_END, self.VID_START, self.VID_END)
    mapping.update({tok: image_flag for tok in boundary_tokens})
    mapping[self.image_token_id] = image_flag
    return mapping
|
||||||
|
|
||||||
|
def add_image(self, img, outputs, uuid, token_len=None):
    """Smart-resize, preprocess and append one raw image to *outputs*.

    Raises ValueError when ``token_len`` (from a pre-tokenised prompt)
    disagrees with the computed token count.
    """
    grid_h, grid_w = self.image_processor.get_smarted_resize(
        img.height,
        img.width,
        min_pixels=self.image_min_pixels,
        max_pixels=self.image_max_pixels,
    )[1]
    num_tokens = (grid_h * grid_w) // (self.spatial_conv_size**2)
    if token_len and token_len != num_tokens:
        raise ValueError("image tokens num not match the size")

    # Bookkeeping: placeholder tokens, type flags, and positions.
    outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
    outputs["input_ids"].extend([self.image_token_id] * num_tokens)
    outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
    outputs["num_input_image_tokens"] += num_tokens

    pos_ids = self._compute_3d_positions(1, grid_h, grid_w, outputs["cur_position"])
    outputs["position_ids"].extend(pos_ids)
    outputs["cur_position"] = np.max(pos_ids) + 1

    processed = self.image_processor.preprocess(
        images=[img.convert("RGB")],
        do_normalize=False,
        do_rescale=False,
        predetermined_grid_thw=np.array([[grid_h, grid_w]]),
        do_convert_rgb=True,
        input_data_format=ChannelDimension.LAST,
    )
    outputs["images"].append(processed["pixel_values"])
    # Hash the features only when no caller-supplied uuid exists.
    outputs["mm_hashes"].append(uuid if uuid else MultimodalHasher.hash_features(processed["pixel_values"]))
    outputs["grid_thw"].append(processed["image_grid_thw"])
    outputs["image_type_ids"].append(0)
|
||||||
|
|
||||||
|
def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
    """Append a cached (already-preprocessed) image to *outputs*.

    ``img_cache`` is a ``(patches, meta)`` pair where ``meta["thw"]``
    holds the (t, h, w) grid.
    """
    patches, meta = img_cache
    num_tokens = patches.shape[0] // (self.spatial_conv_size**2)
    if token_len and num_tokens != token_len:
        raise ValueError("image tokens num not match the size")

    outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
    outputs["input_ids"].extend([self.image_token_id] * num_tokens)
    outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
    outputs["num_input_image_tokens"] += num_tokens

    _, grid_h, grid_w = meta["thw"]
    pos_ids = self._compute_3d_positions(1, grid_h, grid_w, outputs["cur_position"])
    outputs["position_ids"].extend(pos_ids)
    outputs["cur_position"] = np.max(pos_ids) + 1

    outputs["images"].append(patches)
    outputs["mm_hashes"].append(uuid)
    outputs["grid_thw"].append(np.array([[1, grid_h, grid_w]]))
    outputs["image_type_ids"].append(0)
|
||||||
|
|
||||||
|
def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
    """Preprocess decoded *frames* and append video tokens/features to *outputs*.

    ``meta`` is accepted for interface parity but unused by ernie.
    Raises ValueError when ``token_len`` disagrees with the computed count.
    """
    first = frames[0]
    grid_h, grid_w = self.image_processor.get_smarted_resize(
        first.height,
        first.width,
        min_pixels=self.video_min_pixels,
        max_pixels=self.video_max_pixels,
    )[1]
    num_frames = len(frames)
    conv_divisor = self.spatial_conv_size**2 * self.temporal_conv_size
    num_tokens = (num_frames * grid_h * grid_w) // conv_divisor
    if token_len and num_tokens != token_len:
        raise ValueError("video tokens num not match the size")

    pixel_stack = np.stack([np.array(f.convert("RGB")) for f in frames], axis=0)
    processed = self.image_processor.preprocess(
        images=None,
        videos=pixel_stack,
        do_normalize=False,
        do_rescale=False,
        predetermined_grid_thw=np.array([[grid_h, grid_w]] * num_frames),
        do_convert_rgb=True,
        input_data_format=ChannelDimension.LAST,
    )
    outputs["images"].append(processed["pixel_values_videos"])
    # Hash the features only when no caller-supplied uuid exists.
    outputs["mm_hashes"].append(
        uuid if uuid else MultimodalHasher.hash_features(processed["pixel_values_videos"])
    )
    outputs["grid_thw"].append(processed["video_grid_thw"])
    outputs["image_type_ids"].extend([1] * num_frames)

    outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
    outputs["input_ids"].extend([self.image_token_id] * num_tokens)
    outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
    outputs["num_input_video_tokens"] += num_tokens

    pos_ids = self._compute_3d_positions(num_frames, grid_h, grid_w, outputs["cur_position"])
    outputs["position_ids"].extend(pos_ids)
    outputs["cur_position"] = np.max(pos_ids) + 1
|
||||||
|
|
||||||
|
def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
    """Append a cached (already-preprocessed) video to *outputs*.

    ``frames_cache`` is a ``(patches, meta)`` pair where ``meta["thw"]``
    holds the (t, h, w) grid.
    """
    patches, meta = frames_cache
    conv_divisor = self.spatial_conv_size**2 * self.temporal_conv_size
    num_tokens = patches.shape[0] // conv_divisor
    if token_len and num_tokens != token_len:
        raise ValueError("video tokens num not match the size")

    grid_t, grid_h, grid_w = meta["thw"]
    outputs["images"].append(patches)
    outputs["mm_hashes"].append(uuid)
    outputs["grid_thw"].append(np.array([[grid_t, grid_h, grid_w]]))

    outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
    outputs["input_ids"].extend([self.image_token_id] * num_tokens)
    outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
    outputs["num_input_video_tokens"] += num_tokens
    outputs["image_type_ids"].extend([1] * grid_t)

    pos_ids = self._compute_3d_positions(grid_t, grid_h, grid_w, outputs["cur_position"])
    outputs["position_ids"].extend(pos_ids)
    outputs["cur_position"] = np.max(pos_ids) + 1
|
||||||
|
|
||||||
|
def load_video(self, url, item):
    """Decode a video, stamp timestamps onto frames, and return (frames, meta).

    Per-item overrides in *item* take precedence over the encoding's
    defaults; the frame list is padded to an even length so the temporal
    convolution always sees full frame pairs. The returned meta dict is
    empty (ernie does not need video metadata downstream).
    """
    from fastdeploy.input.utils.render_timestamp import render_frame_timestamp
    from fastdeploy.input.utils.video import read_frames_decord, read_video_decord

    reader, meta, path = read_video_decord(url, save_to_disk=False)

    defaults = {
        "fps": self.fps,
        "min_frames": self.min_frames,
        "max_frames": self.max_frames,
        "target_frames": self.target_frames,
        "frames_sample": self.frames_sample,
    }
    frame_args = {key: item.get(key, fallback) for key, fallback in defaults.items()}
    frame_args = self.set_video_frame_args(frame_args, meta)

    frames_data, _, timestamps = read_frames_decord(
        path,
        reader,
        meta,
        target_frames=frame_args["target_frames"],
        target_fps=frame_args["fps"],
        frames_sample=frame_args["frames_sample"],
        save_to_disk=False,
    )

    frames = [render_frame_timestamp(arr, ts) for arr, ts in zip(frames_data, timestamps)]
    if len(frames) % 2:
        # Duplicate the last frame so the temporal conv gets an even count.
        frames.append(copy.deepcopy(frames[-1]))
    return frames, {}
|
||||||
|
|
||||||
|
def set_video_frame_args(self, video_frame_args, video_meta):
    """Finalise frame-sampling arguments, validating their mutual constraints.

    Either a positive ``target_frames`` (with negative ``fps``) or a
    non-negative ``fps`` must be supplied. When sampling by fps, the
    estimated frame count is clamped into ``[min_frames, max_frames]``
    by switching to a fixed ``target_frames`` and disabling fps.
    """
    args = video_frame_args
    target, fps = args["target_frames"], args["fps"]
    lo, hi = args["min_frames"], args["max_frames"]

    if target > 0:
        # Fixed frame count: fps must be disabled and target within bounds.
        if fps >= 0:
            raise ValueError("fps must be negative if target_frames is given")
        if lo > 0 and target < lo:
            raise ValueError("target_frames must be larger than min_frames")
        if hi > 0 and target > hi:
            raise ValueError("target_frames must be smaller than max_frames")
        return args

    # Sampling by fps: estimate the count and clamp into [lo, hi].
    if fps < 0:
        raise ValueError("Must provide either positive target_fps or positive target_frames.")
    estimated = int(video_meta["duration"] * fps)
    if lo > 0 and hi > 0 and lo > hi:
        raise ValueError("min_frames must be smaller than max_frames")
    if lo > 0 and estimated < lo:
        args["target_frames"], args["fps"] = lo, -1
    if hi > 0 and estimated > hi:
        args["target_frames"], args["fps"] = hi, -1
    return args
|
||||||
|
|
||||||
|
def add_text_positions(self, outputs, num_tokens):
    """Append ``num_tokens`` text positions as ernie-style [p, p, p] triples."""
    base = outputs["cur_position"]
    outputs["position_ids"].extend([base + offset] * 3 for offset in range(num_tokens))
    outputs["cur_position"] = base + num_tokens
|
||||||
|
|
||||||
|
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
    """Append completion ids with text type flags and [p, p, p] positions."""
    count = len(completion_token_ids)
    multimodal_inputs["input_ids"].extend(completion_token_ids)
    multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * count)

    base = multimodal_inputs["cur_position"]
    multimodal_inputs["position_ids"].extend([base + k] * 3 for k in range(count))
    multimodal_inputs["cur_position"] = base + count
|
||||||
|
|
||||||
|
def _compute_3d_positions(self, t, h, w, start_idx):
    """Build ernie-format [t, h, w] position triples, offset by ``start_idx``.

    ``t == 1`` (still image) keeps a single temporal step; otherwise the
    temporal axis is reduced by the temporal conv size. Spatial axes are
    reduced by the spatial conv size.
    """
    grid_t = 1 if t == 1 else t // self.temporal_conv_size
    grid_h = h // self.spatial_conv_size
    grid_w = w // self.spatial_conv_size

    axis_t = np.repeat(np.arange(grid_t), grid_h * grid_w)
    axis_h = np.tile(np.repeat(np.arange(grid_h), grid_w), grid_t)
    axis_w = np.tile(np.arange(grid_w), grid_t * grid_h)

    return [[start_idx + a, start_idx + b, start_idx + c] for a, b, c in zip(axis_t, axis_h, axis_w)]
|
||||||
|
|
||||||
|
def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None):
    """Build the multimodal outputs dict from a pre-tokenised prompt.

    Parameters
    ----------
    prompt_token_ids : list[int]
        Pre-tokenised prompt, possibly containing IMAGE/VIDEO_START ...
        IMAGE/VIDEO_END placeholder spans.
    mm_items : list[dict] | None
        Extracted multimodal items ({'type', 'data', 'uuid'}); ``None``
        means a text-only prompt.

    Raises
    ------
    ValueError
        If a placeholder span is unterminated or the number of spans does
        not match the number of supplied images/videos.
    """
    outputs = self._make_outputs()
    total = len(prompt_token_ids)

    if mm_items is None:
        # Text-only fast path: every token is text with a [p, p, p] position.
        outputs["input_ids"].extend(prompt_token_ids)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * total)
        outputs["position_ids"].extend([i] * 3 for i in range(total))
        outputs["cur_position"] += total
        return outputs

    images, image_uuid = [], []
    videos, video_uuid = [], []
    for item in mm_items:
        kind = item.get("type")
        if kind == "image":
            images.append(item["data"])
            image_uuid.append(item.get("uuid"))
        elif kind == "video":
            videos.append(item["data"])
            video_uuid.append(item.get("uuid"))

    image_start_id = self.tokenizer.convert_tokens_to_ids(self.IMG_START)
    image_end_id = self.tokenizer.convert_tokens_to_ids(self.IMG_END)
    video_start_id = self.tokenizer.convert_tokens_to_ids(self.VID_START)
    video_end_id = self.tokenizer.convert_tokens_to_ids(self.VID_END)

    def append_single(token_id, type_flag):
        # Copy one literal token through with its flag and a scalar position.
        outputs["input_ids"].append(token_id)
        outputs["token_type_ids"].append(type_flag)
        outputs["position_ids"].append([outputs["cur_position"]] * 3)
        outputs["cur_position"] += 1

    def scan_placeholder(start, end_id, label):
        # Index of the matching end token; raises if the span never closes.
        idx = start
        while idx < total and prompt_token_ids[idx] != end_id:
            idx += 1
        if idx >= total:
            raise ValueError(f"{label} token ids not complete")
        return idx

    st, image_idx, video_idx = 0, 0, 0
    while st < total:
        cur_token_id = prompt_token_ids[st]
        if cur_token_id == image_start_id:
            if image_idx >= len(images):
                raise ValueError("prompt token ids has more image placeholder than in messages")
            append_single(cur_token_id, IDS_TYPE_FLAG["image"])
            st += 1
            end = scan_placeholder(st, image_end_id, "image")
            image = images[image_idx]
            uuid = image_uuid[image_idx] if image_uuid else None
            token_len = end - st
            if isinstance(image, tuple):
                # Cached pre-processed features.
                self.add_processed_image(image, outputs, uuid, token_len)
            else:
                self.add_image(image, outputs, uuid, token_len)
            image_idx += 1
            st = end
        elif cur_token_id == video_start_id:
            if video_idx >= len(videos):
                raise ValueError("prompt token ids has more video placeholder than in messages")
            append_single(cur_token_id, IDS_TYPE_FLAG["image"])
            st += 1
            end = scan_placeholder(st, video_end_id, "video")
            video = videos[video_idx]
            uuid = video_uuid[video_idx] if video_uuid else None
            token_len = end - st
            if isinstance(video, tuple):
                # Cached pre-processed features.
                self.add_processed_video(video, outputs, uuid, token_len)
            else:
                if isinstance(video, dict):
                    frames, _ = self.load_video(video["video"], video)
                else:
                    frames, _ = self.load_video(video, {})
                self.add_video(frames, outputs, uuid, token_len=token_len)
            video_idx += 1
            st = end
        else:
            # Plain token; end markers carry the image flag, everything
            # else is text.
            flag = IDS_TYPE_FLAG["image"] if cur_token_id in (image_end_id, video_end_id) else IDS_TYPE_FLAG["text"]
            append_single(cur_token_id, flag)
            st += 1

    if image_idx != len(images):
        raise ValueError("number of images does not match")
    if video_idx != len(videos):
        raise ValueError("number of videos does not match")

    return outputs
|
||||||
|
|
||||||
|
@staticmethod
def mm_num_tokens(grid_thw):
    """Token count(s) for *grid_thw*.

    Images (t == 1) yield ``t*h*w // 4``; videos additionally divide by 2
    for the temporal conv. Accepts a single (t, h, w) or a sequence of
    them, returning an int or a list respectively.
    """
    if isinstance(grid_thw, paddle.Tensor):
        grid_thw = grid_thw.numpy()
    if len(grid_thw) == 0:
        return 0

    def tokens_for(thw):
        t, h, w = map(int, thw)
        per_grid = t * h * w // 4
        return per_grid if t == 1 else per_grid // 2

    if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
        return [tokens_for(entry) for entry in grid_thw]
    return tokens_for(grid_thw)
|
||||||
|
|
||||||
|
def pack_position_ids(self, outputs):
    """Ernie packing: convert the accumulated list-of-lists position ids
    into an int64 ndarray and record the image patch token id."""
    packed = np.array(outputs["position_ids"], dtype=np.int64)
    outputs["position_ids"] = packed
    outputs["image_patch_id"] = self.image_token_id
|
||||||
|
|
||||||
|
def get_mm_max_tokens_per_item(self, seq_len):
    """Per-modality upper bound on placeholder tokens for a single item.

    Both bounds derive from the largest feasible resize target and are
    additionally capped by seq_len.
    """
    target_height, target_width = self._get_image_size_with_most_features()

    def _max_tokens(min_pixels, max_pixels, divisor):
        # get_smarted_resize returns (size, (patches_h, patches_w)); we need the patch grid
        patches_h, patches_w = self.image_processor.get_smarted_resize(
            height=target_height,
            width=target_width,
            min_pixels=min_pixels,
            max_pixels=max_pixels,
        )[1]
        return min((patches_h * patches_w) // divisor, seq_len)

    return {
        "image": _max_tokens(
            self.image_min_pixels,
            self.image_max_pixels,
            self.spatial_conv_size**2,
        ),
        "video": _max_tokens(
            self.video_min_pixels,
            self.video_max_pixels,
            self.spatial_conv_size**2 * self.temporal_conv_size,
        ),
    }
|
||||||
|
|
||||||
|
def _get_image_size_with_most_features(self):
    """Resize target (height, width) yielding the most visual features
    under the configured image pixel limits."""
    # index [0] of get_smarted_resize is the resized (height, width) pair
    resized_h, resized_w = self.image_processor.get_smarted_resize(
        height=MAX_IMAGE_DIMENSION,
        width=MAX_IMAGE_DIMENSION,
        min_pixels=self.image_min_pixels,
        max_pixels=self.image_max_pixels,
    )[0]
    return (resized_h, resized_w)
|
||||||
@@ -0,0 +1,190 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""PaddleOCR-VL encoding strategy."""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from fastdeploy.engine.request import ImagePosition
|
||||||
|
from fastdeploy.input.encodings.qwen_encoding import QwenEncoding
|
||||||
|
from fastdeploy.input.encodings.registry import EncodingRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import PADDLEOCR_VL
|
||||||
|
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||||
|
from fastdeploy.input.utils.video import read_video_decord
|
||||||
|
from fastdeploy.input.utils.video import sample_frames_paddleocr as _sample_paddleocr
|
||||||
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
|
|
||||||
|
|
||||||
|
@EncodingRegistry.register(PADDLEOCR_VL)
class PaddleOCREncoding(QwenEncoding):
    """Encoding strategy for paddleocr_vl.

    Inherits from QwenEncoding and overrides the pieces that differ:

    - _make_outputs: adds ``vit_seqlen`` / ``vit_position_ids`` accumulators
    - add_image / add_processed_image / add_video / add_processed_video:
      additionally append the per-item ViT fields
    - add_video / add_processed_video: emit ``video_token_id`` placeholders
      instead of qwen's reuse of ``image_token_id``
    - load_video: samples frames with ``sample_frames_paddleocr`` instead of
      ``sample_frames_qwen``
    """

    def _make_outputs(self) -> dict:
        """Extend the qwen outputs dict with paddleocr-specific ViT fields."""
        outputs = super()._make_outputs()
        outputs["vit_seqlen"] = []
        outputs["vit_position_ids"] = []
        return outputs

    @staticmethod
    def _append_vit_fields(outputs, h, w):
        """Record flattened patch count and per-patch position ids for one item.

        Shared by all four add_* paths (previously duplicated inline).
        """
        numel = h * w
        outputs["vit_seqlen"].append(numel)
        # np.arange(numel) % numel is identically np.arange(numel); keep the
        # simple form (the modulo also warned on numel == 0).
        outputs["vit_position_ids"].append(np.arange(numel))

    def add_image(self, img, outputs, uuid, token_len=None):
        """Preprocess one PIL image; append placeholder tokens, features,
        positions and the paddleocr ViT fields.

        Raises ValueError when token_len (placeholder count found in the
        prompt) disagrees with the processor's computed token count.
        """
        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += int(num_tokens)

        outputs["images"].append(ret["pixel_values"])
        if not uuid:
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].append(0)

        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        # images carry no temporal information
        outputs["fps"].append(0)

        self._append_vit_fields(outputs, h, w)

    def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
        """Append a cached (already preprocessed) image via the qwen path,
        then the paddleocr ViT fields."""
        super().add_processed_image(img_cache, outputs, uuid, token_len)
        _, h, w = img_cache[1]["thw"]
        self._append_vit_fields(outputs, h, w)

    def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
        """Preprocess sampled video frames; append placeholder tokens,
        features, positions and the paddleocr ViT fields."""
        preprocess_kwargs = {}
        if self.cfg.video_min_pixels is not None:
            preprocess_kwargs["min_pixels"] = self.cfg.video_min_pixels
            preprocess_kwargs["max_pixels"] = self.cfg.video_max_pixels

        ret = self.image_processor.preprocess(images=frames, **preprocess_kwargs)

        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        # paddleocr uses a dedicated video placeholder token (qwen reuses the image token)
        outputs["input_ids"].extend([self.video_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += int(num_tokens)

        outputs["images"].append(ret["pixel_values"])
        if not uuid:
            outputs["mm_hashes"].append(MultimodalHasher.hash_features(ret["pixel_values"]))
        else:
            outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].extend([1] * grid_thw[0])

        fps = meta["fps"] if meta else 0
        second_per_grid_t = self.temporal_conv_size / fps if fps else 0
        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        outputs["fps"].append(fps)

        self._append_vit_fields(outputs, h, w)

    def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
        """Append a cached (already preprocessed) video's tokens, positions
        and the paddleocr ViT fields."""
        frames, meta = frames_cache
        num_tokens = frames.shape[0] // self.image_processor.merge_size**2
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")

        t, h, w = meta["thw"]
        outputs["images"].append(frames)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[t, h, w]]))

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.video_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += num_tokens
        outputs["image_type_ids"].extend([1] * t)

        fps = meta["fps"]
        second_per_grid_t = self.temporal_conv_size / fps
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        outputs["fps"].append(fps)

        self._append_vit_fields(outputs, h, w)

    def load_video(self, url, item):
        """Decode a video and sample frames with the paddleocr sampler.

        Returns (frames, meta): frames is a stacked uint8 RGB ndarray of
        shape (num_frames, H, W, 3); meta carries fps / duration /
        num_of_frame for downstream position computation.
        """
        reader, meta, _ = read_video_decord(url, save_to_disk=False)

        fps = item.get("fps", self.fps)
        num_frames = item.get("target_frames", self.target_frames)

        frame_indices = list(range(meta["num_of_frame"]))
        if fps > 0 or num_frames > 0:
            min_frames = item.get("min_frames", self.min_frames)
            max_frames = item.get("max_frames", self.max_frames)

            frame_indices = _sample_paddleocr(
                frame_factor=self.temporal_conv_size,
                min_frames=min_frames,
                max_frames=max_frames,
                metadata=meta,
                fps=fps,
                num_frames=num_frames,
            )

        meta["num_of_frame"] = len(frame_indices)
        # truthiness guard: the previous `is not None` check divided by zero
        # when fps == 0; a zero fps now falls back to duration-derived fps.
        if fps:
            meta["fps"] = fps
            meta["duration"] = len(frame_indices) / fps
        else:
            meta["fps"] = len(frame_indices) / meta["duration"]

        # decord frames are already HxWx3 uint8 RGB; stack directly instead of
        # round-tripping through PIL.
        frames = np.stack([reader[idx].asnumpy() for idx in frame_indices], axis=0)

        return frames, meta
|
||||||
@@ -0,0 +1,314 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Qwen-family (qwen_vl / qwen3_vl) encoding strategy."""
|
||||||
|
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import paddle
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from fastdeploy.engine.request import ImagePosition
|
||||||
|
from fastdeploy.input.encodings.base_encoding import BaseEncoding
|
||||||
|
from fastdeploy.input.encodings.registry import EncodingRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import QWEN3_VL, QWEN_VL
|
||||||
|
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||||
|
from fastdeploy.input.utils.video import read_video_decord
|
||||||
|
from fastdeploy.input.utils.video import sample_frames_qwen as _sample_qwen
|
||||||
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
|
|
||||||
|
|
||||||
|
@EncodingRegistry.register(QWEN_VL, QWEN3_VL)
class QwenEncoding(BaseEncoding):
    """Encoding strategy shared by qwen_vl and qwen3_vl."""

    # Sampled frame counts are rounded to a multiple of this factor.
    FRAME_FACTOR = 2

    def _make_outputs(self) -> dict:
        """Qwen outputs additionally accumulate a per-item fps list."""
        outputs = super()._make_outputs()
        outputs["fps"] = []
        return outputs

    def add_image(self, img, outputs, uuid, token_len=None):
        """Preprocess one PIL image; append placeholder tokens, features and positions.

        Raises ValueError if token_len (placeholder count from the prompt)
        disagrees with the processor's computed token count.
        """
        ret = self.image_processor.preprocess(images=[img.convert("RGB")])
        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += int(num_tokens)

        outputs["images"].append(ret["pixel_values"])
        outputs["mm_hashes"].append(uuid if uuid else MultimodalHasher.hash_features(ret["pixel_values"]))
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].append(0)

        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, 0)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        # images carry no temporal information
        outputs["fps"].append(0)

    def add_processed_image(self, img_cache, outputs, uuid, token_len=None):
        """Append a cached (already preprocessed) image's tokens and positions."""
        img, meta = img_cache
        num_tokens = img.shape[0] // self.image_processor.merge_size**2
        if token_len is not None and token_len != num_tokens:
            raise ValueError("image tokens num not match the size")

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
        outputs["num_input_image_tokens"] += num_tokens

        _, h, w = meta["thw"]
        pos_ids = self._compute_vision_positions(outputs["cur_position"], 1, h, w, 0)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        outputs["images"].append(img)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[1, h, w]]))
        outputs["image_type_ids"].append(0)

        outputs["fps"].append(0)

    def add_video(self, frames, outputs, uuid, token_len=None, meta=None):
        """Preprocess sampled frames of one video; append tokens, features and positions."""
        preprocess_kwargs = {}
        # qwen3_vl configures distinct pixel bounds for video
        if self.cfg.video_min_pixels is not None:
            preprocess_kwargs["min_pixels"] = self.cfg.video_min_pixels
            preprocess_kwargs["max_pixels"] = self.cfg.video_max_pixels

        ret = self.image_processor.preprocess(images=frames, **preprocess_kwargs)

        num_tokens = ret["grid_thw"].prod() // self.image_processor.merge_size**2
        grid_thw = ret["grid_thw"].tolist()
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        # qwen reuses the image placeholder token for video frames; only the
        # token_type_ids distinguish the modality here
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += int(num_tokens)

        outputs["images"].append(ret["pixel_values"])
        outputs["mm_hashes"].append(uuid if uuid else MultimodalHasher.hash_features(ret["pixel_values"]))
        outputs["grid_thw"].append(grid_thw)
        outputs["image_type_ids"].extend([1] * grid_thw[0])

        fps = meta["fps"] if meta else 0
        second_per_grid_t = self.temporal_conv_size / fps if fps else 0
        t, h, w = grid_thw
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        outputs["fps"].append(fps)

    def add_processed_video(self, frames_cache, outputs, uuid, token_len=None):
        """Append a cached (already preprocessed) video's tokens and positions."""
        frames, meta = frames_cache
        num_tokens = frames.shape[0] // self.image_processor.merge_size**2
        if token_len is not None and token_len != num_tokens:
            raise ValueError("video tokens num not match the size")

        t, h, w = meta["thw"]
        outputs["images"].append(frames)
        outputs["mm_hashes"].append(uuid)
        outputs["grid_thw"].append(np.array([[t, h, w]]))

        outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
        outputs["input_ids"].extend([self.image_token_id] * num_tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
        outputs["num_input_video_tokens"] += num_tokens
        outputs["image_type_ids"].extend([1] * t)

        fps = meta["fps"]
        second_per_grid_t = self.temporal_conv_size / fps
        pos_ids = self._compute_vision_positions(outputs["cur_position"], t, h, w, second_per_grid_t)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

        outputs["fps"].append(fps)

    def load_video(self, url, item):
        """Decode a video with decord and sample frames with the qwen sampler.

        Returns (frames, meta): frames is a stacked RGB ndarray; meta carries
        fps / duration / num_of_frame.
        """
        reader, meta, _ = read_video_decord(url, save_to_disk=False)

        fps = item.get("fps", self.fps)
        num_frames = item.get("target_frames", self.target_frames)

        frame_indices = list(range(meta["num_of_frame"]))
        if fps > 0 or num_frames > 0:
            frame_indices = _sample_qwen(
                frame_factor=self.FRAME_FACTOR,
                min_frames=item.get("min_frames", self.min_frames),
                max_frames=item.get("max_frames", self.max_frames),
                metadata=meta,
                # an explicit target frame count overrides fps-based sampling
                fps=-1 if num_frames > 0 else fps,
                num_frames=num_frames,
            )

        meta["num_of_frame"] = len(frame_indices)
        if fps is not None:
            # NOTE(review): fps == 0 would divide by zero here — presumably
            # callers always pass a positive fps or None; confirm.
            meta["fps"] = fps
            meta["duration"] = len(frame_indices) / fps
        else:
            meta["fps"] = len(frame_indices) / meta["duration"]

        pil_frames = [Image.fromarray(reader[idx].asnumpy(), "RGB") for idx in frame_indices]
        frames = np.stack([np.array(f.convert("RGB")) for f in pil_frames], axis=0)

        return frames, meta

    def add_text_positions(self, outputs, num_tokens):
        """Append text position IDs in the qwen 3xN ndarray format."""
        pos_ids = self._compute_text_positions(outputs["cur_position"], num_tokens)
        outputs["position_ids"].append(pos_ids)
        outputs["cur_position"] = pos_ids.max() + 1

    def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
        """Append completion (text) token ids with matching type flags and positions."""
        num_tokens = len(completion_token_ids)
        multimodal_inputs["input_ids"].extend(completion_token_ids)
        multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)

        pos_ids = self._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
        multimodal_inputs["position_ids"].append(pos_ids)
        multimodal_inputs["cur_position"] += num_tokens

    def prompt_token_ids2outputs(self, prompt_token_ids, mm_items=None):
        """Rebuild outputs from existing prompt token ids. Only qwen3_vl supports this."""
        outputs = self._make_outputs()
        total = len(prompt_token_ids)

        if mm_items is None:
            self._add_text_tokens(prompt_token_ids, outputs)
            return outputs

        st, mm_idx = 0, 0
        while st < total:
            if prompt_token_ids[st] != self.image_token_id:
                # consume a maximal run of text tokens
                end = st
                while end < total and prompt_token_ids[end] != self.image_token_id:
                    end += 1
                self._add_text_tokens(prompt_token_ids[st:end], outputs)
                st = end
                continue

            if mm_idx >= len(mm_items):
                raise ValueError("prompt token ids has more multimodal placeholder than in messages")

            # consume a maximal run of multimodal placeholder tokens
            end = st
            while end < total and prompt_token_ids[end] == self.image_token_id:
                end += 1

            item = mm_items[mm_idx]
            uuid = item.get("uuid")
            token_len = end - st
            if item.get("type") == "image":
                image = item.get("data")
                if isinstance(image, tuple):
                    self.add_processed_image(image, outputs, uuid, token_len)
                else:
                    self.add_image(image, outputs, uuid, token_len)
            elif item.get("type") == "video":
                video = item.get("data")
                if isinstance(video, tuple):
                    self.add_processed_video(video, outputs, uuid, token_len)
                else:
                    if isinstance(video, dict):
                        frames, meta = self.load_video(video["video"], video)
                    else:
                        frames, meta = self.load_video(video, {})
                    self.add_video(frames, outputs, uuid, token_len=token_len, meta=meta)
            else:
                raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
            mm_idx += 1
            st = end

        if mm_idx != len(mm_items):
            raise ValueError("number of multimodal items does not match prompt token ids")

        return outputs

    def _add_text_tokens(self, tokens, outputs):
        """Append text tokens plus their type flags and position ids; no-op for empty."""
        if not tokens:
            return
        num_tokens = len(tokens)
        outputs["input_ids"].extend(tokens)
        outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
        self.add_text_positions(outputs, num_tokens)

    def _compute_text_positions(self, start_pos, num_tokens):
        """3xN ndarray for text: identical t/h/w indices per token."""
        idx = np.broadcast_to(np.arange(num_tokens).reshape(1, -1), (3, num_tokens))
        return idx + start_pos

    def _compute_vision_positions(self, start_pos, t, h, w, second_per_grid_t):
        """3xN ndarray of (t, h, w) position ids for one visual item."""
        h //= self.spatial_conv_size
        w //= self.spatial_conv_size

        # NOTE(review): int(second_per_grid_t) truncates fractional grid
        # intervals to whole seconds — confirm this is intentional.
        tn = np.broadcast_to(np.arange(t).reshape(-1, 1), (t, h * w))
        t_index = (tn * int(second_per_grid_t) * self.tokens_per_second).flatten()

        h_index = np.broadcast_to(np.arange(h).reshape(1, -1, 1), (t, h, w)).flatten()
        w_index = np.broadcast_to(np.arange(w).reshape(1, 1, -1), (t, h, w)).flatten()

        return np.stack([t_index, h_index, w_index]) + start_pos

    @staticmethod
    def mm_num_tokens(grid_thw):
        """Qwen token count per grid: t * h * w // 4."""
        if isinstance(grid_thw, paddle.Tensor):
            grid_thw = grid_thw.numpy()
        if len(grid_thw) == 0:
            return 0

        def _count(thw):
            t, h, w = (int(v) for v in thw)
            return t * h * w // 4

        if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
            return [_count(item) for item in grid_thw]
        return _count(grid_thw)

    def pack_position_ids(self, outputs):
        """Qwen packing: concatenate the 3xN arrays, then transpose to Nx3."""
        packed = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
        outputs["position_ids"] = packed
        outputs["image_patch_id"] = self.image_token_id
        outputs["video_patch_id"] = self.video_token_id
        outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Registry for multimodal encoding strategy classes."""
|
||||||
|
|
||||||
|
from typing import Dict, Type
|
||||||
|
|
||||||
|
|
||||||
|
class EncodingRegistry:
    """Maps model_type strings to encoding strategy classes.

    Encoding classes register themselves via the ``register`` decorator
    at import time. ``MultiModalProcessor`` queries this registry by
    *model_type* instead of using string-based dynamic imports.
    """

    _registry: Dict[str, Type] = {}

    @classmethod
    def register(cls, *model_types: str):
        """Decorator that registers an encoding class for one or more model types."""

        def decorator(enc_cls):
            for mt in model_types:
                existing = cls._registry.get(mt)
                if existing is not None:
                    raise ValueError(
                        f"Encoding for '{mt}' already registered "
                        f"as {existing.__name__}, "
                        f"cannot re-register as {enc_cls.__name__}"
                    )
                cls._registry[mt] = enc_cls
            return enc_cls

        return decorator

    @classmethod
    def get(cls, model_type: str) -> Type:
        """Look up the encoding class for a given *model_type*."""
        try:
            return cls._registry[model_type]
        except KeyError:
            raise ValueError(
                f"No encoding registered for '{model_type}'. " f"Available: {sorted(cls._registry.keys())}"
            ) from None
|
||||||
@@ -539,6 +539,7 @@ class DataProcessor(MMBaseDataProcessor):
|
|||||||
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
|
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
|
||||||
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
|
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
|
||||||
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
|
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["image"]] * num_tokens)
|
||||||
|
outputs["num_input_image_tokens"] += num_tokens
|
||||||
|
|
||||||
_, h, w = meta["thw"]
|
_, h, w = meta["thw"]
|
||||||
pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
|
pos_ids = self._compute_3d_positions(1, h, w, outputs["cur_position"])
|
||||||
@@ -605,6 +606,7 @@ class DataProcessor(MMBaseDataProcessor):
|
|||||||
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
|
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
|
||||||
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
|
outputs["input_ids"].extend([self.image_patch_id] * num_tokens)
|
||||||
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
|
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
|
||||||
|
outputs["num_input_video_tokens"] += num_tokens
|
||||||
outputs["image_type_ids"].extend([1] * t)
|
outputs["image_type_ids"].extend([1] * t)
|
||||||
|
|
||||||
pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
|
pos_ids = self._compute_3d_positions(t, h, w, outputs["cur_position"])
|
||||||
|
|||||||
@@ -25,3 +25,6 @@ from fastdeploy.input.image_processors.qwen3_processor import ( # noqa: F401
|
|||||||
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
|
from fastdeploy.input.image_processors.qwen_processor import ( # noqa: F401
|
||||||
ImageProcessor as QwenImageProcessor,
|
ImageProcessor as QwenImageProcessor,
|
||||||
)
|
)
|
||||||
|
from fastdeploy.input.image_processors.registry import ( # noqa: F401
|
||||||
|
ImageProcessorRegistry,
|
||||||
|
)
|
||||||
|
|||||||
@@ -46,6 +46,8 @@ from PIL import Image
|
|||||||
|
|
||||||
from fastdeploy.input.image_processors.common import is_scaled_image
|
from fastdeploy.input.image_processors.common import is_scaled_image
|
||||||
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
from fastdeploy.input.image_processors.common import smart_resize_qwen as smart_resize
|
||||||
|
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import ERNIE4_5_VL
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||||
@@ -116,6 +118,7 @@ def make_batched_videos(videos) -> List[VideoInput]:
|
|||||||
raise ValueError(f"Could not make batched video from {videos}")
|
raise ValueError(f"Could not make batched video from {videos}")
|
||||||
|
|
||||||
|
|
||||||
|
@ImageProcessorRegistry.register(ERNIE4_5_VL)
|
||||||
class AdaptiveImageProcessor(BaseImageProcessor):
|
class AdaptiveImageProcessor(BaseImageProcessor):
|
||||||
r"""
|
r"""
|
||||||
Constructs a adaptive image processor that dynamically resizes images based on the original images.
|
Constructs a adaptive image processor that dynamically resizes images based on the original images.
|
||||||
|
|||||||
@@ -33,6 +33,8 @@ from paddleformers.transformers.image_utils import (
|
|||||||
from fastdeploy.input.image_processors.common import (
|
from fastdeploy.input.image_processors.common import (
|
||||||
smart_resize_paddleocr as smart_resize,
|
smart_resize_paddleocr as smart_resize,
|
||||||
)
|
)
|
||||||
|
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import PADDLEOCR_VL
|
||||||
|
|
||||||
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||||
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
|
||||||
@@ -66,6 +68,7 @@ def adjust_size(size, patch_size):
|
|||||||
return num_patches * patch_size
|
return num_patches * patch_size
|
||||||
|
|
||||||
|
|
||||||
|
@ImageProcessorRegistry.register(PADDLEOCR_VL)
|
||||||
class ImageProcessor(BaseImageProcessor):
|
class ImageProcessor(BaseImageProcessor):
|
||||||
model_input_names = [
|
model_input_names = [
|
||||||
"pixel_values",
|
"pixel_values",
|
||||||
|
|||||||
@@ -41,6 +41,8 @@ from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||||
|
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import QWEN3_VL
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
IMAGE_MEAN = [0.5, 0.5, 0.5]
|
IMAGE_MEAN = [0.5, 0.5, 0.5]
|
||||||
@@ -62,6 +64,7 @@ VideoInput = Union[
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ImageProcessorRegistry.register(QWEN3_VL)
|
||||||
class ImageProcessor(BaseImageProcessor):
|
class ImageProcessor(BaseImageProcessor):
|
||||||
"""
|
"""
|
||||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||||
|
|||||||
@@ -41,6 +41,8 @@ from paddleformers.transformers.legacy.tokenizer_utils_base import TensorType
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
from fastdeploy.input.image_processors.common import is_scaled_image, smart_resize
|
||||||
|
from fastdeploy.input.image_processors.registry import ImageProcessorRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import QWEN_VL
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
|
||||||
@@ -62,6 +64,7 @@ VideoInput = Union[
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@ImageProcessorRegistry.register(QWEN_VL)
|
||||||
class ImageProcessor(BaseImageProcessor):
|
class ImageProcessor(BaseImageProcessor):
|
||||||
"""
|
"""
|
||||||
Adaptive image processor for dynamic image resizing and preprocessing.
|
Adaptive image processor for dynamic image resizing and preprocessing.
|
||||||
|
|||||||
@@ -0,0 +1,54 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Registry for multimodal image processor classes."""
|
||||||
|
|
||||||
|
from typing import Dict, Type
|
||||||
|
|
||||||
|
|
||||||
|
class ImageProcessorRegistry:
|
||||||
|
"""Maps model_type strings to image processor classes.
|
||||||
|
|
||||||
|
Image processors register themselves via the ``register`` decorator
|
||||||
|
at import time. ``MultiModalProcessor`` queries this registry by
|
||||||
|
*model_type* instead of using string-based dynamic imports.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_registry: Dict[str, Type] = {}
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def register(cls, *model_types: str):
|
||||||
|
"""Decorator that registers an image processor class for one or more model types."""
|
||||||
|
|
||||||
|
def decorator(proc_cls):
|
||||||
|
for mt in model_types:
|
||||||
|
if mt in cls._registry:
|
||||||
|
raise ValueError(
|
||||||
|
f"Image processor for '{mt}' already registered "
|
||||||
|
f"as {cls._registry[mt].__name__}, "
|
||||||
|
f"cannot re-register as {proc_cls.__name__}"
|
||||||
|
)
|
||||||
|
cls._registry[mt] = proc_cls
|
||||||
|
return proc_cls
|
||||||
|
|
||||||
|
return decorator
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get(cls, model_type: str) -> Type:
|
||||||
|
"""Look up the image processor class for a given *model_type*."""
|
||||||
|
if model_type not in cls._registry:
|
||||||
|
raise ValueError(
|
||||||
|
f"No image processor registered for '{model_type}'. " f"Available: {sorted(cls._registry.keys())}"
|
||||||
|
)
|
||||||
|
return cls._registry[model_type]
|
||||||
@@ -0,0 +1,143 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Per-model-type configuration for the unified MultiModalProcessor."""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Dict, Optional
|
||||||
|
|
||||||
|
QWEN_VL = "qwen_vl"
|
||||||
|
QWEN3_VL = "qwen3_vl"
|
||||||
|
PADDLEOCR_VL = "paddleocr_vl"
|
||||||
|
ERNIE4_5_VL = "ernie4_5_vl"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class MMModelConfig:
|
||||||
|
image_placeholder: str
|
||||||
|
video_placeholder: str
|
||||||
|
|
||||||
|
tokenizer_type: str = "auto" # "auto" | "ernie4_5"
|
||||||
|
|
||||||
|
default_min_frames: int = 4
|
||||||
|
default_max_frames: int = 768
|
||||||
|
default_target_frames: int = -1
|
||||||
|
default_fps: float = 2.0
|
||||||
|
default_frames_sample: str = "leading"
|
||||||
|
|
||||||
|
has_bad_words: bool = True
|
||||||
|
has_tool_role: bool = False # ernie: role_prefixes includes "tool"
|
||||||
|
default_thinking: bool = False # ernie: default enable_thinking=True
|
||||||
|
force_disable_thinking: bool = False # qwen_vl, qwen3_vl: force enable_thinking=False
|
||||||
|
set_default_reasoning_max_tokens: bool = False # ernie: auto-set reasoning_max_tokens
|
||||||
|
cap_response_max_tokens: bool = False # ernie: cap max_tokens by response_max_tokens
|
||||||
|
has_logits_processor_think: bool = False # ernie: _prepare_think_stop_sentence
|
||||||
|
|
||||||
|
chat_template_pass_request: bool = False # ernie: pass full request obj
|
||||||
|
|
||||||
|
supports_prompt_token_ids: bool = False # qwen3, ernie
|
||||||
|
|
||||||
|
preserve_prompt_token_ids: bool = False # qwen3, ernie: don't overwrite existing
|
||||||
|
|
||||||
|
stop_tokens_variant: str = "default" # "default" | "qwen3"
|
||||||
|
|
||||||
|
image_token_str: str = ""
|
||||||
|
video_token_str: str = ""
|
||||||
|
|
||||||
|
expected_kwargs: Dict[str, type] = field(default_factory=dict)
|
||||||
|
|
||||||
|
video_min_pixels: Optional[int] = None
|
||||||
|
video_max_pixels: Optional[int] = None
|
||||||
|
|
||||||
|
# ---- Conv params source ----
|
||||||
|
conv_params_from_kwargs: bool = False # ernie: from processor_kwargs; else: from image_processor
|
||||||
|
|
||||||
|
# ---- tokens_per_second ----
|
||||||
|
has_tokens_per_second: bool = True # qwen-family: read from config; ernie: False
|
||||||
|
|
||||||
|
|
||||||
|
_QWEN_KWARGS = {
|
||||||
|
"video_max_frames": int,
|
||||||
|
"video_min_frames": int,
|
||||||
|
}
|
||||||
|
|
||||||
|
_ERNIE_KWARGS = {
|
||||||
|
"spatial_conv_size": int,
|
||||||
|
"temporal_conv_size": int,
|
||||||
|
"image_min_pixels": int,
|
||||||
|
"image_max_pixels": int,
|
||||||
|
"video_min_pixels": int,
|
||||||
|
"video_max_pixels": int,
|
||||||
|
"video_target_frames": int,
|
||||||
|
"video_frames_sample": str,
|
||||||
|
"video_max_frames": int,
|
||||||
|
"video_min_frames": int,
|
||||||
|
"video_fps": int,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
MODEL_CONFIGS: Dict[str, MMModelConfig] = {
|
||||||
|
QWEN_VL: MMModelConfig(
|
||||||
|
image_placeholder="<|image_pad|>",
|
||||||
|
video_placeholder="<|video_pad|>",
|
||||||
|
image_token_str="<|image_pad|>",
|
||||||
|
video_token_str="<|video_pad|>",
|
||||||
|
force_disable_thinking=True,
|
||||||
|
expected_kwargs=_QWEN_KWARGS,
|
||||||
|
),
|
||||||
|
QWEN3_VL: MMModelConfig(
|
||||||
|
image_placeholder="<|image_pad|>",
|
||||||
|
video_placeholder="<|video_pad|>",
|
||||||
|
image_token_str="<|image_pad|>",
|
||||||
|
video_token_str="<|video_pad|>",
|
||||||
|
force_disable_thinking=True,
|
||||||
|
supports_prompt_token_ids=True,
|
||||||
|
preserve_prompt_token_ids=True,
|
||||||
|
stop_tokens_variant="qwen3",
|
||||||
|
video_min_pixels=128 * 28 * 28,
|
||||||
|
video_max_pixels=768 * 28 * 28,
|
||||||
|
expected_kwargs=_QWEN_KWARGS,
|
||||||
|
),
|
||||||
|
PADDLEOCR_VL: MMModelConfig(
|
||||||
|
image_placeholder="<|IMAGE_PLACEHOLDER|>",
|
||||||
|
video_placeholder="<|video_pad|>",
|
||||||
|
image_token_str="<|IMAGE_PLACEHOLDER|>",
|
||||||
|
video_token_str="<|video_pad|>",
|
||||||
|
has_bad_words=False,
|
||||||
|
default_fps=-1.0,
|
||||||
|
expected_kwargs=_QWEN_KWARGS,
|
||||||
|
),
|
||||||
|
ERNIE4_5_VL: MMModelConfig(
|
||||||
|
image_placeholder="<|image@placeholder|>",
|
||||||
|
video_placeholder="<|video@placeholder|>",
|
||||||
|
tokenizer_type="ernie4_5",
|
||||||
|
default_min_frames=16,
|
||||||
|
default_max_frames=180,
|
||||||
|
default_fps=2.0,
|
||||||
|
default_frames_sample="leading",
|
||||||
|
has_tool_role=True,
|
||||||
|
default_thinking=True,
|
||||||
|
set_default_reasoning_max_tokens=True,
|
||||||
|
cap_response_max_tokens=True,
|
||||||
|
has_logits_processor_think=True,
|
||||||
|
chat_template_pass_request=True,
|
||||||
|
supports_prompt_token_ids=True,
|
||||||
|
preserve_prompt_token_ids=True,
|
||||||
|
image_token_str="<|IMAGE_PLACEHOLDER|>",
|
||||||
|
video_token_str="<|IMAGE_PLACEHOLDER|>",
|
||||||
|
conv_params_from_kwargs=True,
|
||||||
|
has_tokens_per_second=False,
|
||||||
|
expected_kwargs=_ERNIE_KWARGS,
|
||||||
|
),
|
||||||
|
}
|
||||||
@@ -16,46 +16,25 @@
|
|||||||
|
|
||||||
"""Unified multimodal processor for all VL model types.
|
"""Unified multimodal processor for all VL model types.
|
||||||
|
|
||||||
Consolidates the four separate VL processor wrappers (QwenVLProcessor,
|
Consolidates the four separate VL processor wrappers and four separate
|
||||||
Qwen3VLProcessor, PaddleOCRVLProcessor, Ernie4_5_VLProcessor) into a
|
DataProcessor classes into a single class with pluggable Encoding strategies.
|
||||||
single class that dispatches per ``model_type``.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import pickle
|
||||||
from collections.abc import Mapping
|
from collections.abc import Mapping
|
||||||
from typing import Any, Dict, Optional
|
from typing import Any, Dict, Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
import zmq
|
||||||
|
|
||||||
|
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||||
from fastdeploy.input.base_processor import BaseTextProcessor
|
from fastdeploy.input.base_processor import BaseTextProcessor
|
||||||
|
from fastdeploy.input.encodings import EncodingRegistry
|
||||||
|
from fastdeploy.input.image_processors import ImageProcessorRegistry
|
||||||
|
from fastdeploy.input.mm_model_config import MODEL_CONFIGS
|
||||||
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
|
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
QWEN_VL = "qwen_vl"
|
|
||||||
QWEN3_VL = "qwen3_vl"
|
|
||||||
PADDLEOCR_VL = "paddleocr_vl"
|
|
||||||
ERNIE4_5_VL = "ernie4_5_vl"
|
|
||||||
|
|
||||||
_SUPPORTED_MODEL_TYPES = {QWEN_VL, QWEN3_VL, PADDLEOCR_VL, ERNIE4_5_VL}
|
|
||||||
|
|
||||||
_QWEN_EXPECTED_KWARGS = {
|
|
||||||
"video_max_frames": int,
|
|
||||||
"video_min_frames": int,
|
|
||||||
}
|
|
||||||
|
|
||||||
_ERNIE_EXPECTED_KWARGS = {
|
|
||||||
"spatial_conv_size": int,
|
|
||||||
"temporal_conv_size": int,
|
|
||||||
"image_min_pixels": int,
|
|
||||||
"image_max_pixels": int,
|
|
||||||
"video_min_pixels": int,
|
|
||||||
"video_max_pixels": int,
|
|
||||||
"video_target_frames": int,
|
|
||||||
"video_frames_sample": str,
|
|
||||||
"video_max_frames": int,
|
|
||||||
"video_min_frames": int,
|
|
||||||
"video_fps": int,
|
|
||||||
}
|
|
||||||
|
|
||||||
_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}
|
_DEFAULT_MM_LIMITS = {"image": 1, "video": 1, "audio": 1}
|
||||||
|
|
||||||
_SAMPLING_EPS = 1e-5
|
_SAMPLING_EPS = 1e-5
|
||||||
@@ -64,8 +43,9 @@ _SAMPLING_EPS = 1e-5
|
|||||||
class MultiModalProcessor(BaseTextProcessor):
|
class MultiModalProcessor(BaseTextProcessor):
|
||||||
"""Unified multimodal processor for all supported VL model types.
|
"""Unified multimodal processor for all supported VL model types.
|
||||||
|
|
||||||
Dispatches image-processor creation, config initialisation, and
|
Uses a composition pattern: model-type-specific encoding logic is
|
||||||
encoding logic based on ``model_type``.
|
delegated to ``self.enc`` (an Encoding instance), while common logic
|
||||||
|
(tokenization loop, request processing, caching) lives here.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@@ -79,19 +59,16 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
tool_parser_obj=None,
|
tool_parser_obj=None,
|
||||||
enable_processor_cache: bool = False,
|
enable_processor_cache: bool = False,
|
||||||
):
|
):
|
||||||
if model_type not in _SUPPORTED_MODEL_TYPES:
|
if model_type not in MODEL_CONFIGS:
|
||||||
raise ValueError(
|
raise ValueError(f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(MODEL_CONFIGS)}.")
|
||||||
f"Unsupported model_type '{model_type}'. " f"Must be one of {sorted(_SUPPORTED_MODEL_TYPES)}."
|
|
||||||
)
|
|
||||||
self.model_type = model_type
|
self.model_type = model_type
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.cfg = MODEL_CONFIGS[model_type]
|
||||||
self.enable_processor_cache = enable_processor_cache
|
self.enable_processor_cache = enable_processor_cache
|
||||||
|
|
||||||
tokenizer_type = "ernie4_5" if model_type == ERNIE4_5_VL else "auto"
|
|
||||||
|
|
||||||
super().__init__(
|
super().__init__(
|
||||||
model_name_or_path,
|
model_name_or_path,
|
||||||
tokenizer_type=tokenizer_type,
|
tokenizer_type=self.cfg.tokenizer_type,
|
||||||
reasoning_parser_obj=reasoning_parser_obj,
|
reasoning_parser_obj=reasoning_parser_obj,
|
||||||
tool_parser_obj=tool_parser_obj,
|
tool_parser_obj=tool_parser_obj,
|
||||||
)
|
)
|
||||||
@@ -99,8 +76,13 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
|
data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
|
||||||
|
|
||||||
processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
|
processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
|
||||||
self._init_mm_processor(processor_kwargs)
|
self._init_image_processor()
|
||||||
self._init_mm_config()
|
self._init_role_prefixes()
|
||||||
|
|
||||||
|
# Composition: create encoding strategy via registry
|
||||||
|
enc_cls = EncodingRegistry.get(self.model_type)
|
||||||
|
self.enc = enc_cls(self, processor_kwargs)
|
||||||
|
|
||||||
self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
|
self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
|
||||||
|
|
||||||
def _load_tokenizer(self):
|
def _load_tokenizer(self):
|
||||||
@@ -122,76 +104,30 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
|
tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True)
|
||||||
return tokenizer
|
return tokenizer
|
||||||
|
|
||||||
def _init_mm_processor(self, processor_kwargs: dict):
|
def _init_image_processor(self):
|
||||||
"""Create the model-type-specific internal DataProcessor."""
|
"""Create the appropriate image processor."""
|
||||||
if self.model_type == QWEN_VL:
|
cls = ImageProcessorRegistry.get(self.model_type)
|
||||||
from fastdeploy.input.qwen_vl_processor.process import DataProcessor
|
self.image_processor = cls.from_pretrained(self.model_name_or_path)
|
||||||
|
|
||||||
tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
|
def _init_role_prefixes(self):
|
||||||
self.processor = DataProcessor(
|
"""Set up role prefixes for message parsing."""
|
||||||
model_path=self.model_name_or_path,
|
self.role_prefixes = {
|
||||||
enable_processor_cache=self.enable_processor_cache,
|
"system": "",
|
||||||
tokens_per_second=tokens_per_second,
|
"user": "User: ",
|
||||||
tokenizer=self.tokenizer,
|
"bot": "Assistant: ",
|
||||||
**processor_kwargs,
|
"assistant": "Assistant: ",
|
||||||
)
|
}
|
||||||
elif self.model_type == QWEN3_VL:
|
if self.cfg.has_tool_role:
|
||||||
from fastdeploy.input.qwen3_vl_processor.process import DataProcessor
|
self.role_prefixes["tool"] = "Tool: "
|
||||||
|
|
||||||
self.processor = DataProcessor(
|
|
||||||
model_path=self.model_name_or_path,
|
|
||||||
enable_processor_cache=self.enable_processor_cache,
|
|
||||||
tokenizer=self.tokenizer,
|
|
||||||
**processor_kwargs,
|
|
||||||
)
|
|
||||||
elif self.model_type == PADDLEOCR_VL:
|
|
||||||
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
|
|
||||||
|
|
||||||
tokens_per_second = getattr(getattr(self.config, "vision_config", None), "tokens_per_second", 2)
|
|
||||||
self.processor = DataProcessor(
|
|
||||||
model_path=self.model_name_or_path,
|
|
||||||
enable_processor_cache=self.enable_processor_cache,
|
|
||||||
tokens_per_second=tokens_per_second,
|
|
||||||
tokenizer=self.tokenizer,
|
|
||||||
**processor_kwargs,
|
|
||||||
)
|
|
||||||
elif self.model_type == ERNIE4_5_VL:
|
|
||||||
from fastdeploy.input.ernie4_5_vl_processor.process import DataProcessor
|
|
||||||
|
|
||||||
self.processor = DataProcessor(
|
|
||||||
tokenizer_name=self.model_name_or_path,
|
|
||||||
image_preprocessor_name=self.model_name_or_path,
|
|
||||||
enable_processor_cache=self.enable_processor_cache,
|
|
||||||
**processor_kwargs,
|
|
||||||
)
|
|
||||||
self.processor.eval()
|
|
||||||
|
|
||||||
def _init_mm_config(self):
|
|
||||||
"""Set model-type-specific multimodal configuration attributes."""
|
|
||||||
if self.model_type in (QWEN_VL, QWEN3_VL):
|
|
||||||
self.image_patch_id = self.processor.image_token_id
|
|
||||||
elif self.model_type == PADDLEOCR_VL:
|
|
||||||
self.image_patch_id = self.processor.image_patch_id
|
|
||||||
elif self.model_type == ERNIE4_5_VL:
|
|
||||||
self.image_patch_id = self.processor.image_patch_id
|
|
||||||
self.spatial_conv_size = self.processor.spatial_conv_size
|
|
||||||
|
|
||||||
def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
|
def _parse_processor_kwargs(self, kwargs: Optional[dict]) -> dict:
|
||||||
"""Parse and validate multimodal processor kwargs."""
|
|
||||||
if not kwargs:
|
if not kwargs:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not isinstance(kwargs, dict):
|
if not isinstance(kwargs, dict):
|
||||||
raise ValueError("mm-processor-kwargs must be a dictionary")
|
raise ValueError("mm-processor-kwargs must be a dictionary")
|
||||||
|
|
||||||
data_processor_logger.info(f"Processing kwargs: {kwargs}")
|
data_processor_logger.info(f"Processing kwargs: {kwargs}")
|
||||||
|
expected_types = self.cfg.expected_kwargs
|
||||||
if self.model_type == ERNIE4_5_VL:
|
|
||||||
expected_types = _ERNIE_EXPECTED_KWARGS
|
|
||||||
else:
|
|
||||||
expected_types = _QWEN_EXPECTED_KWARGS
|
|
||||||
|
|
||||||
for key, value in kwargs.items():
|
for key, value in kwargs.items():
|
||||||
if key in expected_types and not isinstance(value, expected_types[key]):
|
if key in expected_types and not isinstance(value, expected_types[key]):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -199,16 +135,13 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
f"{expected_types[key].__name__}, got {type(value).__name__}"
|
f"{expected_types[key].__name__}, got {type(value).__name__}"
|
||||||
)
|
)
|
||||||
return kwargs
|
return kwargs
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
|
data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def _parse_limits(self, limits: Optional[dict]) -> dict:
|
def _parse_limits(self, limits: Optional[dict]) -> dict:
|
||||||
"""Parse multimodal input limits, merging with defaults."""
|
|
||||||
if not limits:
|
if not limits:
|
||||||
return dict(_DEFAULT_MM_LIMITS)
|
return dict(_DEFAULT_MM_LIMITS)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if not isinstance(limits, dict):
|
if not isinstance(limits, dict):
|
||||||
raise ValueError("limit-mm-per-prompt must be a dictionary")
|
raise ValueError("limit-mm-per-prompt must be a dictionary")
|
||||||
@@ -219,7 +152,6 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
return dict(_DEFAULT_MM_LIMITS)
|
return dict(_DEFAULT_MM_LIMITS)
|
||||||
|
|
||||||
def _check_mm_limits(self, item):
|
def _check_mm_limits(self, item):
|
||||||
"""Validate multimodal inputs against configured limits."""
|
|
||||||
if isinstance(item, dict):
|
if isinstance(item, dict):
|
||||||
mm_data = item
|
mm_data = item
|
||||||
else:
|
else:
|
||||||
@@ -232,7 +164,6 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
mm_data["image"].append(part)
|
mm_data["image"].append(part)
|
||||||
elif part_type in ("video_url", "video"):
|
elif part_type in ("video_url", "video"):
|
||||||
mm_data["video"].append(part)
|
mm_data["video"].append(part)
|
||||||
|
|
||||||
for modality, data in mm_data.items():
|
for modality, data in mm_data.items():
|
||||||
if modality in self.limit_mm_per_prompt:
|
if modality in self.limit_mm_per_prompt:
|
||||||
limit = self.limit_mm_per_prompt[modality]
|
limit = self.limit_mm_per_prompt[modality]
|
||||||
@@ -240,86 +171,201 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
|
raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}")
|
||||||
|
|
||||||
def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
|
def get_mm_max_tokens_per_item(self, seq_len: int) -> Optional[Mapping[str, int]]:
|
||||||
"""Return per-modality max token counts, if available."""
|
return self.enc.get_mm_max_tokens_per_item(seq_len)
|
||||||
if self.model_type == ERNIE4_5_VL:
|
|
||||||
return self.processor.get_mm_max_tokens_per_item(seq_len)
|
def _extract_mm_items(self, request):
|
||||||
return None
|
"""Extract images/videos from request messages, handling processor cache."""
|
||||||
|
messages = parse_chat_messages(request.get("messages"))
|
||||||
|
mm_items = []
|
||||||
|
for msg in messages:
|
||||||
|
role = msg.get("role")
|
||||||
|
if role not in self.role_prefixes:
|
||||||
|
raise ValueError(f"Unsupported role: {role}")
|
||||||
|
content = msg.get("content")
|
||||||
|
if not isinstance(content, list):
|
||||||
|
content = [content]
|
||||||
|
for item in content:
|
||||||
|
if item.get("type") in ["image", "video"]:
|
||||||
|
mm_items.append(item)
|
||||||
|
|
||||||
|
missing_hashes, missing_idx = [], []
|
||||||
|
for idx, item in enumerate(mm_items):
|
||||||
|
if not item.get("data"):
|
||||||
|
missing_hashes.append(item.get("uuid"))
|
||||||
|
missing_idx.append(idx)
|
||||||
|
|
||||||
|
if len(missing_hashes) > 0 and not self.enable_processor_cache:
|
||||||
|
raise ValueError("Missing items cannot be retrieved without processor cache.")
|
||||||
|
|
||||||
|
dealer = None
|
||||||
|
if self.enable_processor_cache:
|
||||||
|
context = zmq.Context()
|
||||||
|
dealer = context.socket(zmq.DEALER)
|
||||||
|
dealer.connect("ipc:///dev/shm/processor_cache.ipc")
|
||||||
|
|
||||||
|
missing_items = self.get_processor_cache(dealer, missing_hashes)
|
||||||
|
for idx in range(len(missing_items)):
|
||||||
|
if not missing_items[idx]:
|
||||||
|
raise ValueError(f"Missing item {idx} not found in processor cache")
|
||||||
|
mm_items[missing_idx[idx]]["data"] = missing_items[idx]
|
||||||
|
|
||||||
|
images, videos = [], []
|
||||||
|
image_uuid, video_uuid = [], []
|
||||||
|
for item in mm_items:
|
||||||
|
if item.get("type") == "image":
|
||||||
|
images.append(item["data"])
|
||||||
|
image_uuid.append(item["uuid"])
|
||||||
|
elif item.get("type") == "video":
|
||||||
|
videos.append(item["data"])
|
||||||
|
video_uuid.append(item["uuid"])
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported multimodal type: {item.get('type')}")
|
||||||
|
|
||||||
|
return images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items
|
||||||
|
|
||||||
|
def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
|
||||||
|
"""Convert text with image/video placeholders into model inputs."""
|
||||||
|
outputs = self.enc._make_outputs()
|
||||||
|
|
||||||
|
IMAGE_PLACEHOLDER = self.cfg.image_placeholder
|
||||||
|
VIDEO_PLACEHOLDER = self.cfg.video_placeholder
|
||||||
|
IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
|
||||||
|
VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
|
||||||
|
|
||||||
|
st, image_idx, video_idx = 0, 0, 0
|
||||||
|
while st < len(text):
|
||||||
|
image_pos = text.find(IMAGE_PLACEHOLDER, st)
|
||||||
|
image_pos = len(text) if image_pos == -1 else image_pos
|
||||||
|
video_pos = text.find(VIDEO_PLACEHOLDER, st)
|
||||||
|
video_pos = len(text) if video_pos == -1 else video_pos
|
||||||
|
ed = min(image_pos, video_pos)
|
||||||
|
|
||||||
|
self._add_text(text[st:ed], outputs)
|
||||||
|
if ed == len(text):
|
||||||
|
break
|
||||||
|
|
||||||
|
if ed == image_pos:
|
||||||
|
image = images[image_idx]
|
||||||
|
uuid = image_uuid[image_idx] if image_uuid else None
|
||||||
|
if not isinstance(image, tuple):
|
||||||
|
self.enc.add_image(image, outputs, uuid)
|
||||||
|
else:
|
||||||
|
self.enc.add_processed_image(image, outputs, uuid)
|
||||||
|
image_idx += 1
|
||||||
|
st = ed + IMAGE_PLACEHOLDER_LEN
|
||||||
|
else:
|
||||||
|
item = videos[video_idx]
|
||||||
|
uuid = video_uuid[video_idx] if video_uuid else None
|
||||||
|
if not isinstance(item, tuple):
|
||||||
|
if isinstance(item, dict):
|
||||||
|
frames, meta = self.enc.load_video(item["video"], item)
|
||||||
|
else:
|
||||||
|
frames, meta = self.enc.load_video(item, {})
|
||||||
|
self.enc.add_video(frames, outputs, uuid, meta=meta)
|
||||||
|
else:
|
||||||
|
self.enc.add_processed_video(item, outputs, uuid)
|
||||||
|
video_idx += 1
|
||||||
|
st = ed + VIDEO_PLACEHOLDER_LEN
|
||||||
|
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
def request2ids(self, request):
|
||||||
|
"""Convert chat request with multimodal messages into model inputs."""
|
||||||
|
images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self._extract_mm_items(request)
|
||||||
|
|
||||||
|
if self.tokenizer.chat_template is None:
|
||||||
|
raise ValueError("This model does not support chat template.")
|
||||||
|
|
||||||
|
chat_template_kwargs = request.get("chat_template_kwargs", {})
|
||||||
|
if self.cfg.chat_template_pass_request:
|
||||||
|
# ernie: pass full request to apply_chat_template
|
||||||
|
prompt = self.tokenizer.apply_chat_template(
|
||||||
|
request,
|
||||||
|
tokenize=False,
|
||||||
|
add_generation_prompt=request.get("add_generation_prompt", True),
|
||||||
|
**chat_template_kwargs,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
messages = parse_chat_messages(request.get("messages"))
|
||||||
|
prompt = self.tokenizer.apply_chat_template(
|
||||||
|
messages,
|
||||||
|
tokenize=False,
|
||||||
|
add_generation_prompt=request.get("add_generation_prompt", True),
|
||||||
|
**chat_template_kwargs,
|
||||||
|
)
|
||||||
|
request["prompt_tokens"] = prompt
|
||||||
|
|
||||||
|
outputs = self.text2ids(prompt, images, videos, image_uuid, video_uuid)
|
||||||
|
|
||||||
|
if self.enable_processor_cache:
|
||||||
|
self._update_mm_cache(dealer, missing_idx, mm_items, outputs)
|
||||||
|
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
def _process_prompt_token_ids(self, request):
|
||||||
|
"""Handle the prompt_token_ids tokenisation path.
|
||||||
|
|
||||||
|
Mirrors ``request2ids`` in structure: Processor owns extract/cache,
|
||||||
|
Encoding only does pure encoding.
|
||||||
|
"""
|
||||||
|
prompt_token_ids = request.get("prompt_token_ids", [])
|
||||||
|
|
||||||
|
if not request.get("messages"):
|
||||||
|
return self.enc.prompt_token_ids2outputs(prompt_token_ids)
|
||||||
|
|
||||||
|
images, videos, image_uuid, video_uuid, dealer, missing_idx, mm_items = self._extract_mm_items(request)
|
||||||
|
outputs = self.enc.prompt_token_ids2outputs(prompt_token_ids, mm_items)
|
||||||
|
|
||||||
|
if self.enable_processor_cache:
|
||||||
|
self._update_mm_cache(dealer, missing_idx, mm_items, outputs)
|
||||||
|
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
def _update_mm_cache(self, dealer, missing_idx, mm_items, outputs):
|
||||||
|
"""Write newly-processed multimodal items to the processor cache."""
|
||||||
|
missing_idx_set = set(missing_idx)
|
||||||
|
hashes_to_cache, items_to_cache = [], []
|
||||||
|
for idx in range(len(mm_items)):
|
||||||
|
if idx in missing_idx_set:
|
||||||
|
continue
|
||||||
|
meta = {}
|
||||||
|
grid_thw = np.asarray(outputs["grid_thw"][idx])
|
||||||
|
if grid_thw.ndim > 1:
|
||||||
|
t, h, w = grid_thw[0]
|
||||||
|
else:
|
||||||
|
t, h, w = grid_thw
|
||||||
|
meta["thw"] = (int(t), int(h), int(w))
|
||||||
|
if "fps" in outputs:
|
||||||
|
meta["fps"] = outputs["fps"][idx]
|
||||||
|
hashes_to_cache.append(outputs["mm_hashes"][idx])
|
||||||
|
items_to_cache.append((outputs["images"][idx], meta))
|
||||||
|
if hashes_to_cache:
|
||||||
|
self.update_processor_cache(dealer, hashes_to_cache, items_to_cache)
|
||||||
|
|
||||||
|
return outputs
|
||||||
|
|
||||||
|
def _add_text(self, tokens, outputs):
|
||||||
|
"""Add text tokens to outputs, delegating position logic to enc."""
|
||||||
|
if not tokens:
|
||||||
|
return
|
||||||
|
if isinstance(tokens, str):
|
||||||
|
tokens_str = self.tokenizer.tokenize(tokens)
|
||||||
|
tokens = self.tokenizer.convert_tokens_to_ids(tokens_str)
|
||||||
|
num_tokens = len(tokens)
|
||||||
|
outputs["input_ids"].extend(tokens)
|
||||||
|
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
|
||||||
|
self.enc.add_text_positions(outputs, num_tokens)
|
||||||
|
|
||||||
def process_request_dict(self, request, max_model_len=None):
|
def process_request_dict(self, request, max_model_len=None):
|
||||||
"""Process a request dictionary into model inputs.
|
"""Process a request dictionary into model inputs."""
|
||||||
|
cfg = self.cfg
|
||||||
Unified template-method flow for all VL model types. Per-model
|
|
||||||
differences are handled by small conditional branches rather than
|
|
||||||
duplicating the entire pipeline.
|
|
||||||
"""
|
|
||||||
request = self._apply_default_parameters(request)
|
request = self._apply_default_parameters(request)
|
||||||
|
|
||||||
if not request.get("eos_token_ids"):
|
if not request.get("eos_token_ids"):
|
||||||
request["eos_token_ids"] = self.eos_token_ids
|
request["eos_token_ids"] = self.eos_token_ids
|
||||||
|
|
||||||
self._process_stop_tokens(request)
|
# Stop tokens
|
||||||
|
if cfg.stop_tokens_variant == "qwen3":
|
||||||
if self.model_type != PADDLEOCR_VL:
|
|
||||||
self._process_bad_words(request)
|
|
||||||
|
|
||||||
if self.model_type == ERNIE4_5_VL:
|
|
||||||
logits_processors_args = self._prepare_think_stop_sentence(
|
|
||||||
request.get("logits_processors_args") or {}, max_model_len
|
|
||||||
)
|
|
||||||
request["logits_processors_args"] = logits_processors_args
|
|
||||||
|
|
||||||
outputs = self._tokenize_request(request)
|
|
||||||
|
|
||||||
self._process_post_tokens(request, outputs)
|
|
||||||
|
|
||||||
if self.model_type in (QWEN_VL, QWEN3_VL):
|
|
||||||
request["enable_thinking"] = False
|
|
||||||
|
|
||||||
outputs = self.pack_outputs(outputs)
|
|
||||||
|
|
||||||
if self.model_type in (QWEN3_VL, ERNIE4_5_VL) and request.get("prompt_token_ids"):
|
|
||||||
pass # preserve existing prompt_token_ids
|
|
||||||
else:
|
|
||||||
request["prompt_token_ids"] = outputs["input_ids"].tolist()
|
|
||||||
request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
|
|
||||||
request["multimodal_inputs"] = outputs
|
|
||||||
|
|
||||||
if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
|
|
||||||
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
|
|
||||||
|
|
||||||
if self.model_type == ERNIE4_5_VL:
|
|
||||||
logits_processors_args = self._update_thinking_prompt_state(
|
|
||||||
request["prompt_token_ids"], request.get("logits_processors_args") or {}
|
|
||||||
)
|
|
||||||
request["logits_processors_args"] = logits_processors_args
|
|
||||||
|
|
||||||
max_tokens = max_model_len - len(request["prompt_token_ids"])
|
|
||||||
if request.get("max_tokens") is None:
|
|
||||||
request["max_tokens"] = max(1, max_tokens)
|
|
||||||
else:
|
|
||||||
request["max_tokens"] = min(max_tokens, request["max_tokens"])
|
|
||||||
|
|
||||||
if self.model_type == ERNIE4_5_VL and request.get("reasoning_max_tokens") is None:
|
|
||||||
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
|
|
||||||
|
|
||||||
if self.model_type in (PADDLEOCR_VL, ERNIE4_5_VL):
|
|
||||||
if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
|
|
||||||
request["top_p"] = _SAMPLING_EPS
|
|
||||||
request["top_k"] = 1
|
|
||||||
|
|
||||||
if self.model_type != QWEN3_VL and self.reasoning_parser:
|
|
||||||
self._apply_reasoning_parser(request)
|
|
||||||
|
|
||||||
if self.model_type == ERNIE4_5_VL:
|
|
||||||
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
|
||||||
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
|
||||||
|
|
||||||
data_processor_logger.info(f"Processed request {request}")
|
|
||||||
return request
|
|
||||||
|
|
||||||
def _process_stop_tokens(self, request):
|
|
||||||
"""Handle stop token processing based on model type."""
|
|
||||||
if self.model_type == QWEN3_VL:
|
|
||||||
stop_sequences = request.get("stop", [])
|
stop_sequences = request.get("stop", [])
|
||||||
if stop_sequences:
|
if stop_sequences:
|
||||||
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
|
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
|
||||||
@@ -328,34 +374,102 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
else:
|
else:
|
||||||
process_stop_token_ids(request, self.update_stop_seq)
|
process_stop_token_ids(request, self.update_stop_seq)
|
||||||
|
|
||||||
def _process_bad_words(self, request):
|
# Bad words
|
||||||
"""Process bad_words into token ids."""
|
if cfg.has_bad_words:
|
||||||
bad_words = request.get("bad_words")
|
bad_words = request.get("bad_words")
|
||||||
bad_words_token_ids = request.get("bad_words_token_ids")
|
bad_words_token_ids = request.get("bad_words_token_ids")
|
||||||
if bad_words:
|
if bad_words:
|
||||||
bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
|
bad_words_token_ids = self.update_bad_words(bad_words, bad_words_token_ids)
|
||||||
request["bad_words_token_ids"] = bad_words_token_ids
|
request["bad_words_token_ids"] = bad_words_token_ids
|
||||||
|
|
||||||
|
# Logits processor (ernie think)
|
||||||
|
if cfg.has_logits_processor_think:
|
||||||
|
logits_processors_args = self._prepare_think_stop_sentence(
|
||||||
|
request.get("logits_processors_args") or {}, max_model_len
|
||||||
|
)
|
||||||
|
request["logits_processors_args"] = logits_processors_args
|
||||||
|
|
||||||
|
# Tokenize
|
||||||
|
outputs = self._tokenize_request(request)
|
||||||
|
|
||||||
|
# Post-token handling
|
||||||
|
self._process_post_tokens(request, outputs)
|
||||||
|
|
||||||
|
# Force disable thinking for qwen_vl / qwen3_vl
|
||||||
|
if cfg.force_disable_thinking:
|
||||||
|
request["enable_thinking"] = False
|
||||||
|
|
||||||
|
# Pack outputs
|
||||||
|
outputs = self.pack_outputs(outputs)
|
||||||
|
|
||||||
|
# Assign prompt_token_ids
|
||||||
|
if cfg.preserve_prompt_token_ids and request.get("prompt_token_ids"):
|
||||||
|
pass # preserve existing
|
||||||
|
else:
|
||||||
|
request["prompt_token_ids"] = outputs["input_ids"].tolist()
|
||||||
|
request["multimodal_inputs"] = outputs
|
||||||
|
|
||||||
|
# Truncation
|
||||||
|
if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
|
||||||
|
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
|
||||||
|
|
||||||
|
request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
|
||||||
|
|
||||||
|
# Ernie: update thinking prompt state
|
||||||
|
if cfg.has_logits_processor_think:
|
||||||
|
logits_processors_args = self._update_thinking_prompt_state(
|
||||||
|
request["prompt_token_ids"],
|
||||||
|
request.get("logits_processors_args") or {},
|
||||||
|
)
|
||||||
|
request["logits_processors_args"] = logits_processors_args
|
||||||
|
|
||||||
|
# max_tokens
|
||||||
|
max_tokens = max_model_len - len(request["prompt_token_ids"])
|
||||||
|
if request.get("max_tokens") is None:
|
||||||
|
request["max_tokens"] = max(1, max_tokens)
|
||||||
|
else:
|
||||||
|
request["max_tokens"] = min(max_tokens, request["max_tokens"])
|
||||||
|
|
||||||
|
# Ernie: default reasoning_max_tokens
|
||||||
|
if cfg.set_default_reasoning_max_tokens and request.get("reasoning_max_tokens") is None:
|
||||||
|
request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1)
|
||||||
|
|
||||||
|
# Clamp top_p
|
||||||
|
if request.get("top_p") is not None and request.get("top_p") < _SAMPLING_EPS:
|
||||||
|
request["top_p"] = _SAMPLING_EPS
|
||||||
|
request["top_k"] = 1
|
||||||
|
|
||||||
|
# Reasoning parser
|
||||||
|
if self.reasoning_parser:
|
||||||
|
self._apply_reasoning_parser(request)
|
||||||
|
|
||||||
|
# Ernie: cap response_max_tokens
|
||||||
|
if cfg.cap_response_max_tokens:
|
||||||
|
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
||||||
|
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
||||||
|
|
||||||
|
data_processor_logger.info(f"Processed request {request}")
|
||||||
|
return request
|
||||||
|
|
||||||
def _tokenize_request(self, request):
|
def _tokenize_request(self, request):
|
||||||
"""Core tokenization dispatch: prompt_token_ids > prompt > messages."""
|
cfg = self.cfg
|
||||||
default_thinking = True if self.model_type == ERNIE4_5_VL else False
|
default_thinking = cfg.default_thinking
|
||||||
|
|
||||||
if request.get("prompt_token_ids") and self.model_type in (QWEN3_VL, ERNIE4_5_VL):
|
if request.get("prompt_token_ids") and cfg.supports_prompt_token_ids:
|
||||||
messages = request.get("messages")
|
messages = request.get("messages")
|
||||||
if messages:
|
if messages:
|
||||||
self._check_mm_limits(messages)
|
self._check_mm_limits(messages)
|
||||||
request.setdefault("enable_thinking", default_thinking)
|
request.setdefault("enable_thinking", default_thinking)
|
||||||
return self.processor.prompt_token_ids2outputs(request)
|
return self._process_prompt_token_ids(request)
|
||||||
|
|
||||||
elif request.get("prompt"):
|
elif request.get("prompt"):
|
||||||
multimodal_data = request.get("multimodal_data") or {}
|
multimodal_data = request.get("multimodal_data") or {}
|
||||||
self._check_mm_limits(multimodal_data)
|
self._check_mm_limits(multimodal_data)
|
||||||
images = multimodal_data.get("image", None)
|
images = multimodal_data.get("image", None)
|
||||||
videos = multimodal_data.get("video", None)
|
videos = multimodal_data.get("video", None)
|
||||||
if self.model_type == ERNIE4_5_VL:
|
request["prompt_tokens"] = request.get("prompt")
|
||||||
request["prompt_tokens"] = request.get("prompt")
|
|
||||||
request.setdefault("enable_thinking", default_thinking)
|
request.setdefault("enable_thinking", default_thinking)
|
||||||
return self.processor.text2ids(request["prompt"], images, videos)
|
return self.text2ids(request["prompt"], images, videos)
|
||||||
|
|
||||||
elif request.get("messages"):
|
elif request.get("messages"):
|
||||||
messages = request["messages"]
|
messages = request["messages"]
|
||||||
@@ -369,65 +483,22 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
else:
|
else:
|
||||||
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
|
raise ValueError("Invalid input: chat_template_kwargs must be a dict")
|
||||||
request.setdefault("enable_thinking", default_thinking)
|
request.setdefault("enable_thinking", default_thinking)
|
||||||
return self.processor.request2ids(request)
|
return self.request2ids(request)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
|
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
|
||||||
|
|
||||||
def _process_post_tokens(self, request, outputs):
|
def _process_post_tokens(self, request, outputs):
|
||||||
"""Handle post-tokenization token appending."""
|
completion_token_ids = request.get("completion_token_ids") or request.get("generated_token_ids")
|
||||||
if self.model_type == PADDLEOCR_VL:
|
if completion_token_ids:
|
||||||
metadata = request.get("metadata")
|
self.enc.append_completion_tokens(outputs, completion_token_ids)
|
||||||
if metadata and metadata.get("generated_token_ids"):
|
|
||||||
self._append_completion_tokens_qwen(outputs, metadata["generated_token_ids"])
|
|
||||||
else:
|
|
||||||
if request.get("completion_token_ids"):
|
|
||||||
self.append_completion_tokens(outputs, request["completion_token_ids"])
|
|
||||||
|
|
||||||
def _apply_reasoning_parser(self, request):
|
|
||||||
"""Apply reasoning parser and update model status dict."""
|
|
||||||
model_status = self.reasoning_parser.get_model_status(request["prompt_token_ids"])
|
|
||||||
parts = request["request_id"].split("_")
|
|
||||||
if len(parts) > 1:
|
|
||||||
real_req_id = parts[0]
|
|
||||||
index = int(parts[1])
|
|
||||||
n = request.get("n", 1)
|
|
||||||
for idx in range(index * n, (index + 1) * n):
|
|
||||||
self.model_status_dict[f"{real_req_id}_{idx}"] = model_status
|
|
||||||
else:
|
|
||||||
self.model_status_dict[request["request_id"]] = model_status
|
|
||||||
request["enable_thinking"] = model_status == "think_start"
|
|
||||||
|
|
||||||
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
|
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
|
||||||
"""Append completion tokens to existing multimodal outputs."""
|
"""Append completion tokens — delegates to enc."""
|
||||||
if self.model_type == ERNIE4_5_VL:
|
self.enc.append_completion_tokens(multimodal_inputs, completion_token_ids)
|
||||||
self._append_completion_tokens_ernie(multimodal_inputs, completion_token_ids)
|
|
||||||
else:
|
|
||||||
self._append_completion_tokens_qwen(multimodal_inputs, completion_token_ids)
|
|
||||||
|
|
||||||
def _append_completion_tokens_qwen(self, multimodal_inputs, completion_token_ids):
|
|
||||||
"""Append completion tokens for qwen_vl / qwen3_vl / paddleocr_vl."""
|
|
||||||
num_tokens = len(completion_token_ids)
|
|
||||||
multimodal_inputs["input_ids"].extend(completion_token_ids)
|
|
||||||
multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
|
|
||||||
|
|
||||||
pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
|
|
||||||
multimodal_inputs["position_ids"].append(pos_ids)
|
|
||||||
multimodal_inputs["cur_position"] += num_tokens
|
|
||||||
|
|
||||||
def _append_completion_tokens_ernie(self, multimodal_inputs, completion_token_ids):
|
|
||||||
"""Append completion tokens for ernie4_5_vl."""
|
|
||||||
num_tokens = len(completion_token_ids)
|
|
||||||
multimodal_inputs["input_ids"].extend(completion_token_ids)
|
|
||||||
multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
|
|
||||||
|
|
||||||
start = multimodal_inputs["cur_position"]
|
|
||||||
for i in range(num_tokens):
|
|
||||||
multimodal_inputs["position_ids"].append([start + i] * 3)
|
|
||||||
multimodal_inputs["cur_position"] += num_tokens
|
|
||||||
|
|
||||||
def pack_outputs(self, outputs):
|
def pack_outputs(self, outputs):
|
||||||
"""Convert intermediate processing outputs to final format."""
|
"""Convert intermediate outputs to final packed format."""
|
||||||
if not outputs["images"]:
|
if not outputs["images"]:
|
||||||
outputs["images"] = None
|
outputs["images"] = None
|
||||||
outputs["grid_thw"] = None
|
outputs["grid_thw"] = None
|
||||||
@@ -439,15 +510,22 @@ class MultiModalProcessor(BaseTextProcessor):
|
|||||||
|
|
||||||
outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
|
outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)
|
||||||
outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
|
outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)
|
||||||
outputs["mm_num_token_func"] = self.processor.mm_num_tokens
|
outputs["mm_num_token_func"] = self.enc.mm_num_tokens
|
||||||
|
|
||||||
if self.model_type in (QWEN_VL, QWEN3_VL, PADDLEOCR_VL):
|
# Position IDs: delegate to encoding strategy
|
||||||
outputs["position_ids"] = np.concatenate(outputs["position_ids"], axis=1, dtype=np.int64)
|
self.enc.pack_position_ids(outputs)
|
||||||
outputs["image_patch_id"] = self.processor.image_token_id
|
|
||||||
outputs["video_patch_id"] = self.processor.video_token_id
|
|
||||||
outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
|
|
||||||
else:
|
|
||||||
outputs["position_ids"] = np.array(outputs["position_ids"], dtype=np.int64)
|
|
||||||
outputs["image_patch_id"] = self.image_patch_id
|
|
||||||
|
|
||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
|
def get_processor_cache(self, socket, mm_hashes):
|
||||||
|
req = pickle.dumps(mm_hashes)
|
||||||
|
socket.send_multipart([b"", req])
|
||||||
|
_, resp = socket.recv_multipart()
|
||||||
|
mm_items = pickle.loads(resp)
|
||||||
|
data_processor_logger.info(f"Get cache of mm_hashes: {mm_hashes}")
|
||||||
|
return mm_items
|
||||||
|
|
||||||
|
def update_processor_cache(self, socket, mm_hashes, mm_items):
|
||||||
|
req = pickle.dumps((mm_hashes, mm_items))
|
||||||
|
socket.send_multipart([b"", req])
|
||||||
|
data_processor_logger.info(f"Update cache of mm_hashes: {mm_hashes}")
|
||||||
|
|||||||
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
|
|||||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||||
from fastdeploy.input.video_utils import read_video_decord
|
from fastdeploy.input.utils.video import read_video_decord
|
||||||
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
|
from fastdeploy.input.utils.video import sample_frames_paddleocr as sample_frames
|
||||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
|||||||
@@ -94,13 +94,13 @@ class InputPreprocessor:
|
|||||||
tool_parser_obj=tool_parser_obj,
|
tool_parser_obj=tool_parser_obj,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
from fastdeploy.input.multimodal_processor import (
|
from fastdeploy.input.mm_model_config import (
|
||||||
ERNIE4_5_VL,
|
ERNIE4_5_VL,
|
||||||
PADDLEOCR_VL,
|
PADDLEOCR_VL,
|
||||||
QWEN3_VL,
|
QWEN3_VL,
|
||||||
QWEN_VL,
|
QWEN_VL,
|
||||||
MultiModalProcessor,
|
|
||||||
)
|
)
|
||||||
|
from fastdeploy.input.multimodal_processor import MultiModalProcessor
|
||||||
|
|
||||||
if ErnieArchitectures.contains_ernie_arch(architecture):
|
if ErnieArchitectures.contains_ernie_arch(architecture):
|
||||||
model_type = ERNIE4_5_VL
|
model_type = ERNIE4_5_VL
|
||||||
|
|||||||
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
|
|||||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||||
from fastdeploy.input.video_utils import read_video_decord
|
from fastdeploy.input.utils.video import read_video_decord
|
||||||
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
|
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
|
||||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
|||||||
@@ -28,8 +28,8 @@ from fastdeploy.engine.request import ImagePosition
|
|||||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||||
from fastdeploy.input.video_utils import read_video_decord
|
from fastdeploy.input.utils.video import read_video_decord
|
||||||
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
|
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
|
||||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
@@ -0,0 +1,41 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Utility package for fastdeploy.input — re-exports from sub-modules."""
|
||||||
|
|
||||||
|
from fastdeploy.input.utils.common import (
|
||||||
|
IDS_TYPE_FLAG,
|
||||||
|
MAX_IMAGE_DIMENSION,
|
||||||
|
process_stop_token_ids,
|
||||||
|
validate_model_path,
|
||||||
|
)
|
||||||
|
from fastdeploy.input.utils.video import (
|
||||||
|
VideoReaderWrapper,
|
||||||
|
read_video_decord,
|
||||||
|
sample_frames,
|
||||||
|
sample_frames_paddleocr,
|
||||||
|
sample_frames_qwen,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"IDS_TYPE_FLAG",
|
||||||
|
"MAX_IMAGE_DIMENSION",
|
||||||
|
"process_stop_token_ids",
|
||||||
|
"validate_model_path",
|
||||||
|
"VideoReaderWrapper",
|
||||||
|
"read_video_decord",
|
||||||
|
"sample_frames",
|
||||||
|
"sample_frames_paddleocr",
|
||||||
|
"sample_frames_qwen",
|
||||||
|
]
|
||||||
@@ -0,0 +1,94 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Render timestamps onto video frames."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
|
||||||
|
|
||||||
|
|
||||||
|
def render_single_image_with_timestamp(image: Image, number: str, rate: float, font_path: str = FONT_PATH):
|
||||||
|
"""Render a timestamp string onto a PIL Image.
|
||||||
|
|
||||||
|
The font size is ``min(width, height) * rate``.
|
||||||
|
Text is drawn in black with a white outline (10% of font size).
|
||||||
|
"""
|
||||||
|
draw = ImageDraw.Draw(image)
|
||||||
|
width, height = image.size
|
||||||
|
font_size = int(min(width, height) * rate)
|
||||||
|
outline_size = int(font_size * 0.1)
|
||||||
|
font = ImageFont.truetype(font_path, font_size)
|
||||||
|
x = 0
|
||||||
|
y = 0
|
||||||
|
|
||||||
|
draw.text(
|
||||||
|
(x, y),
|
||||||
|
number,
|
||||||
|
font=font,
|
||||||
|
fill=(0, 0, 0),
|
||||||
|
stroke_width=outline_size,
|
||||||
|
stroke_fill=(255, 255, 255),
|
||||||
|
)
|
||||||
|
|
||||||
|
return image
|
||||||
|
|
||||||
|
|
||||||
|
def timestamp_converting(time_stamp_in_seconds):
|
||||||
|
"""Convert timestamp from seconds to ``HH:MM:SS.ss`` format."""
|
||||||
|
hours = 0
|
||||||
|
while time_stamp_in_seconds >= 3600:
|
||||||
|
hours += 1
|
||||||
|
time_stamp_in_seconds -= 3600
|
||||||
|
mins = 0
|
||||||
|
while time_stamp_in_seconds >= 60:
|
||||||
|
mins += 1
|
||||||
|
time_stamp_in_seconds -= 60
|
||||||
|
time_hours = f"{int(hours):02d}"
|
||||||
|
time_mins = f"{int(mins):02d}"
|
||||||
|
time_secs = f"{time_stamp_in_seconds:05.02f}"
|
||||||
|
fi_time_stamp = time_hours + ":" + time_mins + ":" + time_secs
|
||||||
|
|
||||||
|
return fi_time_stamp
|
||||||
|
|
||||||
|
|
||||||
|
def get_timestamp_for_uniform_frame_extraction(num_frames, frame_id, duration):
|
||||||
|
"""Get the timestamp of a frame during uniform extraction.
|
||||||
|
|
||||||
|
Returns the timestamp in seconds.
|
||||||
|
"""
|
||||||
|
time_stamp = duration * 1.0 * frame_id / num_frames
|
||||||
|
|
||||||
|
return time_stamp
|
||||||
|
|
||||||
|
|
||||||
|
def render_frame_timestamp(frame, timestamp, font_rate=0.1):
|
||||||
|
"""Render a timestamp onto a video frame.
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
frame : PIL.Image
|
||||||
|
The video frame.
|
||||||
|
timestamp : float
|
||||||
|
Timestamp in seconds.
|
||||||
|
font_rate : float
|
||||||
|
Font size as a fraction of ``min(width, height)``.
|
||||||
|
"""
|
||||||
|
time_stamp = "time: " + timestamp_converting(timestamp)
|
||||||
|
new_frame = render_single_image_with_timestamp(frame, time_stamp, font_rate)
|
||||||
|
|
||||||
|
return new_frame
|
||||||
@@ -0,0 +1,470 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Shared video utilities: VideoReaderWrapper, read_video_decord, sample_frames, read_frames_decord."""
|
||||||
|
|
||||||
|
import datetime
|
||||||
|
import hashlib
|
||||||
|
import io
|
||||||
|
import math
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import threading
|
||||||
|
import uuid
|
||||||
|
from tempfile import NamedTemporaryFile as ntf
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
from fastdeploy.input.image_processors.common import ceil_by_factor, floor_by_factor
|
||||||
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"VideoReaderWrapper",
|
||||||
|
"read_video_decord",
|
||||||
|
"sample_frames",
|
||||||
|
"sample_frames_qwen",
|
||||||
|
"sample_frames_paddleocr",
|
||||||
|
"get_frame_indices",
|
||||||
|
"read_frames_decord",
|
||||||
|
"EXTRACTED_FRAME_DIR",
|
||||||
|
"get_filename",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# VideoReaderWrapper
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _is_gif(data: bytes) -> bool:
|
||||||
|
"""Check if bytes represent a GIF based on magic header."""
|
||||||
|
return data[:6] in (b"GIF87a", b"GIF89a")
|
||||||
|
|
||||||
|
|
||||||
|
class VideoReaderWrapper:
|
||||||
|
"""decord.VideoReader wrapper that fixes a memory leak and adds GIF support.
|
||||||
|
|
||||||
|
Reference: https://github.com/dmlc/decord/issues/208
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, video_path, *args, **kwargs):
|
||||||
|
import decord
|
||||||
|
|
||||||
|
try:
|
||||||
|
# moviepy 1.0
|
||||||
|
import moviepy.editor as mp
|
||||||
|
except Exception:
|
||||||
|
# moviepy 2.0
|
||||||
|
import moviepy as mp
|
||||||
|
|
||||||
|
with ntf(delete=True, suffix=".gif") as gif_file:
|
||||||
|
gif_input = None
|
||||||
|
self.original_file = None # only set when we create a temp file
|
||||||
|
|
||||||
|
if isinstance(video_path, str):
|
||||||
|
if video_path.lower().endswith(".gif"):
|
||||||
|
gif_input = video_path
|
||||||
|
elif isinstance(video_path, bytes):
|
||||||
|
if _is_gif(video_path):
|
||||||
|
gif_file.write(video_path)
|
||||||
|
gif_file.flush()
|
||||||
|
gif_input = gif_file.name
|
||||||
|
elif isinstance(video_path, io.BytesIO):
|
||||||
|
video_path.seek(0)
|
||||||
|
tmp_bytes = video_path.read()
|
||||||
|
video_path.seek(0)
|
||||||
|
if _is_gif(tmp_bytes):
|
||||||
|
gif_file.write(tmp_bytes)
|
||||||
|
gif_file.flush()
|
||||||
|
gif_input = gif_file.name
|
||||||
|
|
||||||
|
if gif_input is not None:
|
||||||
|
clip = mp.VideoFileClip(gif_input)
|
||||||
|
mp4_file = ntf(delete=False, suffix=".mp4")
|
||||||
|
mp4_path = mp4_file.name
|
||||||
|
mp4_file.close() # close before moviepy writes
|
||||||
|
clip.write_videofile(mp4_path, verbose=False, logger=None)
|
||||||
|
clip.close()
|
||||||
|
video_path = mp4_path
|
||||||
|
self.original_file = video_path # temp mp4, cleaned up in __del__
|
||||||
|
|
||||||
|
self._reader = decord.VideoReader(video_path, *args, **kwargs)
|
||||||
|
self._reader.seek(0)
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return len(self._reader)
|
||||||
|
|
||||||
|
def __getitem__(self, key):
|
||||||
|
frames = self._reader[key]
|
||||||
|
self._reader.seek(0)
|
||||||
|
return frames
|
||||||
|
|
||||||
|
def get_avg_fps(self):
|
||||||
|
return self._reader.get_avg_fps()
|
||||||
|
|
||||||
|
def seek(self, pos):
|
||||||
|
return self._reader.seek(pos)
|
||||||
|
|
||||||
|
def __del__(self):
|
||||||
|
original_file = getattr(self, "original_file", None)
|
||||||
|
if original_file:
|
||||||
|
try:
|
||||||
|
os.remove(original_file)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# read_video_decord
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def read_video_decord(video_path, save_to_disk: bool = False):
|
||||||
|
"""Load a video file and return (video_reader, video_meta, video_path).
|
||||||
|
|
||||||
|
video_meta contains keys: "fps", "duration", "num_of_frame".
|
||||||
|
"""
|
||||||
|
if isinstance(video_path, VideoReaderWrapper):
|
||||||
|
video_reader = video_path
|
||||||
|
else:
|
||||||
|
if isinstance(video_path, bytes):
|
||||||
|
video_path = io.BytesIO(video_path)
|
||||||
|
video_reader = VideoReaderWrapper(video_path, num_threads=1)
|
||||||
|
|
||||||
|
vlen = len(video_reader)
|
||||||
|
fps = video_reader.get_avg_fps()
|
||||||
|
duration = vlen / float(fps)
|
||||||
|
|
||||||
|
video_meta = {"fps": fps, "duration": duration, "num_of_frame": vlen}
|
||||||
|
return video_reader, video_meta, video_path
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# sample_frames — qwen_vl variant
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def sample_frames_qwen(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = -1,
    num_frames: Optional[int] = -1,
) -> np.ndarray:
    """Sample frame indices — qwen_vl variant.

    Sentinel defaults are -1. Applies ceil_by_factor on min_frames and ensures
    num_frames is divisible by 4.

    Args:
        frame_factor: the sampled count is snapped to a multiple of this.
        min_frames: lower bound on the sampled count (fps mode).
        max_frames: upper bound on the sampled count (fps mode).
        metadata: required dict with ``"num_of_frame"`` and ``"fps"`` keys.
        fps: desired sampling rate; mutually exclusive with ``num_frames``.
        num_frames: desired frame count; mutually exclusive with ``fps``.

    Returns:
        int32 numpy array of frame indices into the video.

    Raises:
        ValueError: if both ``fps`` and ``num_frames`` are given, metadata is
            missing, or the inferred count exceeds the total frame count.
    """
    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

    if metadata is None:
        raise ValueError("metadata is required for sample_frames_qwen")

    total_num_frames = metadata["num_of_frame"]

    if num_frames > 0:
        # Explicit count: snap to the nearest multiple of frame_factor.
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        # fps mode: derive the count from video duration, clamp to
        # [min_frames, max_frames] and align to frame_factor.
        min_frames = ceil_by_factor(min_frames, frame_factor)
        max_frames = floor_by_factor(min(max_frames, total_num_frames), frame_factor)

        num_frames = total_num_frames / metadata["fps"] * fps

        if num_frames > total_num_frames:
            data_processor_logger.warning(f"smart_nframes: nframes[{num_frames}] > total_frames[{total_num_frames}]")

        num_frames = min(max(num_frames, min_frames), max_frames, total_num_frames)
        num_frames = floor_by_factor(num_frames, frame_factor)

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    # num_frames must be divisible by 4
    if num_frames > 2 and num_frames % 4 != 0:
        num_frames = (num_frames // 4) * 4
        total_num_frames = (total_num_frames // 4) * 4
        num_frames = min(max(num_frames, min_frames), max_frames, total_num_frames)

    if num_frames > 0:
        return np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    return np.arange(0, total_num_frames).astype(np.int32)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# sample_frames — paddleocr_vl / ernie4_5_vl variant
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def sample_frames_paddleocr(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
) -> np.ndarray:
    """Sample frame indices — paddleocr_vl / ernie4_5_vl variant.

    Sentinel defaults are None. Uses plain math.floor/ceil; no %4 correction.

    Args:
        frame_factor: the sampled count is snapped to a multiple of this.
        min_frames: lower bound on the sampled count (fps mode).
        max_frames: upper bound on the sampled count (fps mode).
        metadata: required dict with ``"num_of_frame"`` and ``"fps"`` keys.
        fps: desired sampling rate; mutually exclusive with ``num_frames``.
        num_frames: desired frame count; mutually exclusive with ``fps``.

    Returns:
        int32 numpy array of frame indices into the video.

    Raises:
        ValueError: if both ``fps`` and ``num_frames`` are given, metadata is
            missing, or the inferred count exceeds the total frame count.
    """
    # Normalize None sentinels to 0 so the numeric comparisons below work.
    fps = fps or 0
    num_frames = num_frames or 0

    if fps > 0 and num_frames > 0:
        raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")

    if metadata is None:
        raise ValueError("metadata is required for sample_frames_paddleocr")

    total_num_frames = metadata["num_of_frame"]

    if num_frames > 0:
        # Explicit count: snap to the nearest multiple of frame_factor.
        num_frames = round(num_frames / frame_factor) * frame_factor
    elif fps > 0:
        # fps mode: derive the count from video duration, clamp, then align.
        max_frames = math.floor(min(max_frames, total_num_frames) / frame_factor) * frame_factor
        wanted = total_num_frames / metadata["fps"] * fps
        wanted = min(max(wanted, min_frames), max_frames, total_num_frames)
        num_frames = math.floor(wanted / frame_factor) * frame_factor

    if num_frames > total_num_frames:
        raise ValueError(
            f"Video can't be sampled. The inferred `num_frames={num_frames}` exceeds "
            f"`total_num_frames={total_num_frames}`. "
            "Decrease `num_frames` or `fps` for sampling."
        )

    if num_frames > 0:
        return np.arange(0, total_num_frames, total_num_frames / num_frames).astype(np.int32)
    return np.arange(0, total_num_frames).astype(np.int32)
|
||||||
|
|
||||||
|
|
||||||
|
def sample_frames(
    frame_factor: int,
    min_frames: int,
    max_frames: int,
    metadata: Optional[dict] = None,
    fps: Optional[Union[int, float]] = None,
    num_frames: Optional[int] = None,
    variant: str = "paddleocr",
) -> np.ndarray:
    """Dispatch to sample_frames_qwen or sample_frames_paddleocr based on variant.

    The qwen variant uses -1 sentinels, so None fps/num_frames are converted
    before forwarding; the paddleocr variant accepts None directly.

    Raises:
        ValueError: if ``variant`` is neither ``"paddleocr"`` nor ``"qwen"``.
    """
    if variant == "paddleocr":
        return sample_frames_paddleocr(frame_factor, min_frames, max_frames, metadata, fps, num_frames)
    if variant == "qwen":
        sentinel_fps = -1 if fps is None else fps
        sentinel_num_frames = -1 if num_frames is None else num_frames
        return sample_frames_qwen(frame_factor, min_frames, max_frames, metadata, sentinel_fps, sentinel_num_frames)
    raise ValueError(f"Unknown variant {variant!r}. Expected 'paddleocr' or 'qwen'.")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# IO helpers (migrated from ernie4_5_vl_processor/utils/io_utils.py)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Default cache directory used by read_frames_decord when save_to_disk is set.
EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
|
||||||
|
|
||||||
|
|
||||||
|
def get_filename(url=None):
    """Generate a unique filename, optionally based on a URL hash.

    Without a URL, a random 32-char UUID hex string is returned. With a URL,
    the name embeds date, process id, thread id and the URL's MD5 digest so
    concurrent workers hashing the same URL still get distinct names.
    """
    if url is None:
        return uuid.uuid4().hex

    now = datetime.datetime.now()
    raw = url if isinstance(url, bytes) else url.encode("utf-8")
    # MD5 here is a cache key, not a security boundary.
    digest = hashlib.md5(raw).hexdigest()
    return f"{now.year}-{now.month:02d}-{now.day:02d}-{os.getpid()}-{threading.get_ident()}-{digest}"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# get_frame_indices / read_frames_decord
|
||||||
|
# (migrated from ernie4_5_vl_processor/process_video.py)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def get_frame_indices(
    vlen,
    target_frames=-1,
    target_fps=-1,
    frames_sample="middle",
    fix_start=None,
    input_fps=-1,
):
    """Get frame indices for sampling from a video.

    Exactly one of target_frames / target_fps must be positive.

    Args:
        vlen: total number of frames in the video.
        target_frames: exact number of frames to sample.
        target_fps: sampling rate in frames per second (needs input_fps > 0).
        frames_sample: one of "rand", "middle", "leading" — where inside each
            interval the frame is taken.
        fix_start: fixed offset from each interval start; overrides
            frames_sample unless frames_sample is "rand".
        input_fps: native fps of the video; required when target_fps is used.

    Returns:
        A list of frame indices into the video.

    Raises:
        ValueError: if neither target_frames nor target_fps is positive.
    """
    assert frames_sample in ["rand", "middle", "leading"]
    if target_frames > 0:
        assert target_fps <= 0, "target_fps must be negative if target_frames is given."
        if target_frames > vlen:
            # Cannot sample more frames than exist; fall back to all frames.
            acc_samples = vlen
            data_processor_logger.info(
                f"target_frames={target_frames} is larger than video length {vlen}, "
                f"will sample {acc_samples} frames."
            )
        else:
            acc_samples = target_frames
        data_processor_logger.debug(f"sampling at target_frames={target_frames}, frames_sample={frames_sample}")

        # Split [0, vlen) into acc_samples equal intervals; each interval
        # contributes one frame.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = []
        for idx, interv in enumerate(intervals[:-1]):
            ranges.append((interv, intervals[idx + 1] - 1))
        if frames_sample == "rand":
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except Exception:
                # A degenerate (empty) interval makes random.choice raise;
                # fall back to a sorted random permutation of all frames.
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif frames_sample == "leading":
            frame_indices = [x[0] for x in ranges]
        elif frames_sample == "middle":
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

    elif target_fps > 0:
        assert target_frames <= 0, "target_frames must be negative if target_fps is given."
        assert input_fps > 0, "input_fps must be provided if target_fps is given."
        data_processor_logger.info(f"sampling at fps={target_fps}, frames_sample={frames_sample}")
        duration = float(vlen) / input_fps
        # One frame every `delta` seconds of video time.
        delta = 1 / target_fps
        if frames_sample == "middle":
            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        elif frames_sample == "leading":
            frame_seconds = np.arange(0, duration, delta)
        if frames_sample == "rand":
            frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
            # Jitter each sample point by up to ±delta/2.
            rand_offset = np.random.rand(*(frame_seconds.shape)) - 0.5
            frame_seconds += rand_offset * delta
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        # Rounding can push an index past the last frame; drop those.
        frame_indices = [e for e in frame_indices if e < vlen]

    else:
        raise ValueError("Must provide either positive target_fps or positive target_frames.")

    return frame_indices
|
||||||
|
|
||||||
|
|
||||||
|
def read_frames_decord(
    video_path,
    video_reader,
    video_meta,
    target_frames=-1,
    target_fps=-1,
    frames_sample="middle",
    fix_start=None,
    save_to_disk=False,
    cache_dir=None,
    frame_indices=None,
    tol=10,
):
    """Read frames from a video using decord, with retry logic for corrupt frames.

    Args:
        video_path: path of the video (not read directly here; kept for API
            symmetry with read_video_decord).
        video_reader: an indexable reader whose frames expose ``.asnumpy()``.
        video_meta: dict with "num_of_frame", "fps" and "duration" keys.
        target_frames / target_fps / frames_sample / fix_start: forwarded to
            get_frame_indices when frame_indices is not supplied.
        save_to_disk: when True, each frame is written as a PNG under
            cache_dir and its path is returned instead of a PIL image.
        cache_dir: output directory for saved frames; defaults to
            EXTRACTED_FRAME_DIR.
        frame_indices: precomputed indices; skips get_frame_indices.
        tol: how many neighbors to try on each side when a frame fails to
            decode (doubled for the first/last frame).

    Returns:
        Tuple of (frames_or_paths, frame_indices, time_stamps); frame_indices
        is updated in place when a corrupt frame was replaced by a neighbor.
    """
    if cache_dir is None:
        cache_dir = EXTRACTED_FRAME_DIR

    if frame_indices is None:
        frame_indices = get_frame_indices(
            video_meta["num_of_frame"],
            target_frames=target_frames,
            target_fps=target_fps,
            frames_sample=frames_sample,
            fix_start=fix_start,
            input_fps=video_meta["fps"],
        )

    frames = []
    for frame_indice_index in range(0, len(frame_indices)):
        frame_indice = frame_indices[frame_indice_index]
        try:
            frames.append(video_reader[frame_indice].asnumpy())
        except Exception as e:
            # Decoding failed (corrupt frame). Probe neighbors, alternating
            # before/after the bad index, until one decodes or tolerance runs out.
            data_processor_logger.debug(f"encounter error when get frame: {frame_indice}, error: {e}")
            previous_counter = 1
            later_counter = 1
            # True -> try an earlier neighbor next; False -> a later one.
            previous_after_flag = True
            if frame_indice == 0 or frame_indice == len(video_reader) - 1:
                # Edge frames can only be replaced from one side; allow twice
                # the tolerance there.
                cur_tol = tol * 2
            else:
                cur_tol = tol
            while previous_counter < cur_tol or later_counter < cur_tol:
                if previous_after_flag:
                    if frame_indice - previous_counter < 0:
                        # Ran off the start; switch direction.
                        previous_counter += 1
                        previous_after_flag = not previous_after_flag
                        continue
                    try:
                        frames.append(video_reader[frame_indice - previous_counter].asnumpy())
                        data_processor_logger.info(
                            f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame"
                        )
                        # Record the substitute index so timestamps stay consistent.
                        frame_indices[frame_indice_index] = frame_indice - previous_counter
                        break
                    except Exception as e:
                        previous_counter += 1
                        data_processor_logger.info(f"error: {e}")
                else:
                    if frame_indice + later_counter >= len(video_reader):
                        # Ran off the end; switch direction.
                        later_counter += 1
                        previous_after_flag = not previous_after_flag
                        continue
                    try:
                        frames.append(video_reader[frame_indice + later_counter].asnumpy())
                        data_processor_logger.info(
                            f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame"
                        )
                        frame_indices[frame_indice_index] = frame_indice + later_counter
                        break
                    except Exception:
                        later_counter += 1
                # Alternate probing direction on every loop iteration.
                previous_after_flag = not previous_after_flag

    frames = np.stack(frames, axis=0)
    assert len(frames) == len(frame_indices), f"len(frames): {len(frames)} != len(frame_indices): {len(frame_indices)}"

    ret = []

    url_sha1 = get_filename()
    for idx, frame in enumerate(frames):
        tmp = Image.fromarray(frame, "RGB")
        if save_to_disk:
            save_path = os.path.join(cache_dir, f"{url_sha1}", f"{idx}.png")
            if not os.path.exists(os.path.dirname(save_path)):
                os.makedirs(os.path.dirname(save_path))
            tmp.save(save_path)
            # Return the on-disk path instead of the in-memory image.
            tmp = save_path
        ret.append(tmp)

    # Timestamp of each (possibly substituted) frame in seconds.
    time_stamps = [frame_idx * video_meta["duration"] / video_meta["num_of_frame"] for frame_idx in frame_indices]

    return ret, frame_indices, time_stamps
|
||||||
@@ -85,7 +85,7 @@ import zmq
|
|||||||
|
|
||||||
from fastdeploy import envs
|
from fastdeploy import envs
|
||||||
from fastdeploy.engine.tasks import PoolingTask
|
from fastdeploy.engine.tasks import PoolingTask
|
||||||
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
|
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
|
||||||
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
||||||
from fastdeploy.logger.deterministic_logger import DeterministicLogger
|
from fastdeploy.logger.deterministic_logger import DeterministicLogger
|
||||||
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
@@ -2867,12 +2867,7 @@ class GPUModelRunner(ModelRunnerBase):
|
|||||||
return
|
return
|
||||||
|
|
||||||
def _init_image_preprocess(self) -> None:
|
def _init_image_preprocess(self) -> None:
|
||||||
processor = DataProcessor(
|
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
|
||||||
tokenizer_name=self.model_config.model,
|
|
||||||
image_preprocessor_name=str(self.model_config.model),
|
|
||||||
)
|
|
||||||
processor.eval()
|
|
||||||
image_preprocess = processor.image_preprocessor
|
|
||||||
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
||||||
[1, 3, 1, 1]
|
[1, 3, 1, 1]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ from fastdeploy.config import FDConfig
|
|||||||
from fastdeploy.engine.pooling_params import PoolingParams
|
from fastdeploy.engine.pooling_params import PoolingParams
|
||||||
from fastdeploy.engine.request import ImagePosition, Request, RequestType
|
from fastdeploy.engine.request import ImagePosition, Request, RequestType
|
||||||
from fastdeploy.engine.tasks import PoolingTask
|
from fastdeploy.engine.tasks import PoolingTask
|
||||||
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
|
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
|
||||||
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
||||||
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
from fastdeploy.model_executor.graph_optimization.utils import (
|
from fastdeploy.model_executor.graph_optimization.utils import (
|
||||||
@@ -2566,12 +2566,7 @@ class MetaxModelRunner(ModelRunnerBase):
|
|||||||
return
|
return
|
||||||
|
|
||||||
def _init_image_preprocess(self) -> None:
|
def _init_image_preprocess(self) -> None:
|
||||||
processor = DataProcessor(
|
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
|
||||||
tokenizer_name=self.model_config.model,
|
|
||||||
image_preprocessor_name=str(self.model_config.model),
|
|
||||||
)
|
|
||||||
processor.eval()
|
|
||||||
image_preprocess = processor.image_preprocessor
|
|
||||||
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
||||||
[1, 3, 1, 1]
|
[1, 3, 1, 1]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ from paddle import nn
|
|||||||
from fastdeploy import envs
|
from fastdeploy import envs
|
||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
from fastdeploy.engine.request import ImagePosition, Request, RequestType
|
from fastdeploy.engine.request import ImagePosition, Request, RequestType
|
||||||
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
|
from fastdeploy.input.image_processors.adaptive_processor import AdaptiveImageProcessor
|
||||||
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
||||||
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
from fastdeploy.model_executor.graph_optimization.utils import (
|
from fastdeploy.model_executor.graph_optimization.utils import (
|
||||||
@@ -1842,12 +1842,7 @@ class XPUModelRunner(ModelRunnerBase):
|
|||||||
self.forward_meta.clear_caches()
|
self.forward_meta.clear_caches()
|
||||||
|
|
||||||
def _init_image_preprocess(self) -> None:
|
def _init_image_preprocess(self) -> None:
|
||||||
processor = DataProcessor(
|
image_preprocess = AdaptiveImageProcessor.from_pretrained(str(self.model_config.model))
|
||||||
tokenizer_name=self.model_config.model,
|
|
||||||
image_preprocessor_name=str(self.model_config.model),
|
|
||||||
)
|
|
||||||
processor.eval()
|
|
||||||
image_preprocess = processor.image_preprocessor
|
|
||||||
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape(
|
||||||
[1, 3, 1, 1]
|
[1, 3, 1, 1]
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -303,6 +303,7 @@ setup(
|
|||||||
"model_executor/models/*",
|
"model_executor/models/*",
|
||||||
"model_executor/layers/*",
|
"model_executor/layers/*",
|
||||||
"input/ernie4_5_vl_processor/utils/*",
|
"input/ernie4_5_vl_processor/utils/*",
|
||||||
|
"input/utils/Roboto-Regular.ttf",
|
||||||
"model_executor/ops/gcu/*",
|
"model_executor/ops/gcu/*",
|
||||||
"model_executor/ops/gcu/fastdeploy_ops/*",
|
"model_executor/ops/gcu/fastdeploy_ops/*",
|
||||||
"cache_manager/transfer_factory/get_rdma_nics.sh",
|
"cache_manager/transfer_factory/get_rdma_nics.sh",
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -30,7 +30,7 @@ from fastdeploy.input.paddleocr_vl_processor.paddleocr_vl_processor import (
|
|||||||
PaddleOCRVLProcessor,
|
PaddleOCRVLProcessor,
|
||||||
)
|
)
|
||||||
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
|
from fastdeploy.input.paddleocr_vl_processor.process import DataProcessor
|
||||||
from fastdeploy.input.video_utils import sample_frames_paddleocr as sample_frames
|
from fastdeploy.input.utils.video import sample_frames_paddleocr as sample_frames
|
||||||
|
|
||||||
MODULE_PATH = "fastdeploy.input.paddleocr_vl_processor.process"
|
MODULE_PATH = "fastdeploy.input.paddleocr_vl_processor.process"
|
||||||
|
|
||||||
|
|||||||
@@ -24,8 +24,8 @@ from unittest.mock import patch
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
import fastdeploy.input.ernie4_5_vl_processor.process_video as process_video_module
|
import fastdeploy.input.utils.video as process_video_module
|
||||||
from fastdeploy.input.ernie4_5_vl_processor.process_video import (
|
from fastdeploy.input.utils.video import (
|
||||||
get_frame_indices,
|
get_frame_indices,
|
||||||
read_frames_decord,
|
read_frames_decord,
|
||||||
read_video_decord,
|
read_video_decord,
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ import numpy as np
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
|
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
|
||||||
from fastdeploy.input.video_utils import sample_frames_qwen as sample_frames
|
from fastdeploy.input.utils.video import sample_frames_qwen as sample_frames
|
||||||
|
|
||||||
|
|
||||||
def mock_pil_image(height, width):
|
def mock_pil_image(height, width):
|
||||||
|
|||||||
@@ -41,16 +41,16 @@ class TestValidateModelPath(unittest.TestCase):
|
|||||||
|
|
||||||
def _patch_console_logger(self):
|
def _patch_console_logger(self):
|
||||||
"""Patch console_logger.warning to capture warnings."""
|
"""Patch console_logger.warning to capture warnings."""
|
||||||
import fastdeploy.input.utils as utils_mod
|
import fastdeploy.input.utils.common as common_mod
|
||||||
|
|
||||||
self._orig_warning = utils_mod.console_logger.warning
|
self._orig_warning = common_mod.console_logger.warning
|
||||||
utils_mod.console_logger.warning = self._capture_warning
|
common_mod.console_logger.warning = self._capture_warning
|
||||||
|
|
||||||
def _unpatch_console_logger(self):
|
def _unpatch_console_logger(self):
|
||||||
import fastdeploy.input.utils as utils_mod
|
import fastdeploy.input.utils.common as common_mod
|
||||||
|
|
||||||
if self._orig_warning is not None:
|
if self._orig_warning is not None:
|
||||||
utils_mod.console_logger.warning = self._orig_warning
|
common_mod.console_logger.warning = self._orig_warning
|
||||||
|
|
||||||
def tearDown(self):
|
def tearDown(self):
|
||||||
self._unpatch_console_logger()
|
self._unpatch_console_logger()
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ from unittest.mock import MagicMock, patch
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from fastdeploy.input.video_utils import (
|
from fastdeploy.input.utils.video import (
|
||||||
_is_gif,
|
_is_gif,
|
||||||
read_video_decord,
|
read_video_decord,
|
||||||
sample_frames,
|
sample_frames,
|
||||||
@@ -74,7 +74,7 @@ class TestIsGif(unittest.TestCase):
|
|||||||
class TestVideoReaderWrapper(unittest.TestCase):
|
class TestVideoReaderWrapper(unittest.TestCase):
|
||||||
def _make_wrapper(self, video_path, mock_reader=None):
|
def _make_wrapper(self, video_path, mock_reader=None):
|
||||||
"""Construct a VideoReaderWrapper with decord mocked out."""
|
"""Construct a VideoReaderWrapper with decord mocked out."""
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
if mock_reader is None:
|
if mock_reader is None:
|
||||||
mock_reader = _make_mock_reader()
|
mock_reader = _make_mock_reader()
|
||||||
@@ -112,7 +112,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
|
|||||||
|
|
||||||
def test_del_no_original_file(self):
|
def test_del_no_original_file(self):
|
||||||
"""__del__ should be a no-op when original_file is None."""
|
"""__del__ should be a no-op when original_file is None."""
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
wrapper = object.__new__(VideoReaderWrapper)
|
wrapper = object.__new__(VideoReaderWrapper)
|
||||||
wrapper.original_file = None
|
wrapper.original_file = None
|
||||||
@@ -125,7 +125,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
|
|||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
with tempfile.NamedTemporaryFile(delete=False) as f:
|
with tempfile.NamedTemporaryFile(delete=False) as f:
|
||||||
tmp_path = f.name
|
tmp_path = f.name
|
||||||
@@ -138,7 +138,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
|
|||||||
|
|
||||||
def test_non_gif_string_path_does_not_set_original_file(self):
|
def test_non_gif_string_path_does_not_set_original_file(self):
|
||||||
"""Passing a non-GIF string path must NOT set original_file (bug fix)."""
|
"""Passing a non-GIF string path must NOT set original_file (bug fix)."""
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
mock_reader = _make_mock_reader()
|
mock_reader = _make_mock_reader()
|
||||||
mock_decord = MagicMock()
|
mock_decord = MagicMock()
|
||||||
@@ -151,7 +151,7 @@ class TestVideoReaderWrapper(unittest.TestCase):
|
|||||||
|
|
||||||
def test_bytesio_non_gif_path_does_not_set_original_file(self):
|
def test_bytesio_non_gif_path_does_not_set_original_file(self):
|
||||||
"""Passing a BytesIO that is NOT a GIF must not set original_file."""
|
"""Passing a BytesIO that is NOT a GIF must not set original_file."""
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
mock_reader = _make_mock_reader()
|
mock_reader = _make_mock_reader()
|
||||||
mock_decord = MagicMock()
|
mock_decord = MagicMock()
|
||||||
@@ -172,16 +172,16 @@ class TestVideoReaderWrapper(unittest.TestCase):
|
|||||||
class TestReadVideoDecord(unittest.TestCase):
|
class TestReadVideoDecord(unittest.TestCase):
|
||||||
def _patch_wrapper(self, num_frames=100, fps=25.0):
|
def _patch_wrapper(self, num_frames=100, fps=25.0):
|
||||||
"""Return a context manager that replaces VideoReaderWrapper with a mock."""
|
"""Return a context manager that replaces VideoReaderWrapper with a mock."""
|
||||||
from fastdeploy.input import video_utils
|
from fastdeploy.input.utils import video
|
||||||
|
|
||||||
mock_wrapper = MagicMock()
|
mock_wrapper = MagicMock()
|
||||||
mock_wrapper.__len__ = MagicMock(return_value=num_frames)
|
mock_wrapper.__len__ = MagicMock(return_value=num_frames)
|
||||||
mock_wrapper.get_avg_fps = MagicMock(return_value=fps)
|
mock_wrapper.get_avg_fps = MagicMock(return_value=fps)
|
||||||
return patch.object(video_utils, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper
|
return patch.object(video, "VideoReaderWrapper", return_value=mock_wrapper), mock_wrapper
|
||||||
|
|
||||||
def test_existing_wrapper_passthrough(self):
|
def test_existing_wrapper_passthrough(self):
|
||||||
"""Already-wrapped reader is returned as-is."""
|
"""Already-wrapped reader is returned as-is."""
|
||||||
from fastdeploy.input.video_utils import VideoReaderWrapper
|
from fastdeploy.input.utils.video import VideoReaderWrapper
|
||||||
|
|
||||||
mock_wrapper = MagicMock(spec=VideoReaderWrapper)
|
mock_wrapper = MagicMock(spec=VideoReaderWrapper)
|
||||||
mock_wrapper.__len__ = MagicMock(return_value=50)
|
mock_wrapper.__len__ = MagicMock(return_value=50)
|
||||||
@@ -196,7 +196,7 @@ class TestReadVideoDecord(unittest.TestCase):
|
|||||||
|
|
||||||
def test_bytes_input_converted_to_bytesio(self):
|
def test_bytes_input_converted_to_bytesio(self):
|
||||||
"""bytes input is converted to BytesIO before creating VideoReaderWrapper."""
|
"""bytes input is converted to BytesIO before creating VideoReaderWrapper."""
|
||||||
from fastdeploy.input import video_utils
|
from fastdeploy.input.utils import video
|
||||||
|
|
||||||
captured = []
|
captured = []
|
||||||
|
|
||||||
@@ -210,14 +210,14 @@ class TestReadVideoDecord(unittest.TestCase):
|
|||||||
def get_avg_fps(self):
|
def get_avg_fps(self):
|
||||||
return 10.0
|
return 10.0
|
||||||
|
|
||||||
with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
|
with patch.object(video, "VideoReaderWrapper", FakeWrapper):
|
||||||
reader, meta, path = read_video_decord(b"fake_video_bytes")
|
reader, meta, path = read_video_decord(b"fake_video_bytes")
|
||||||
|
|
||||||
self.assertIsInstance(captured[0], io.BytesIO)
|
self.assertIsInstance(captured[0], io.BytesIO)
|
||||||
|
|
||||||
def test_string_path_input(self):
|
def test_string_path_input(self):
|
||||||
"""String path is passed through to VideoReaderWrapper."""
|
"""String path is passed through to VideoReaderWrapper."""
|
||||||
from fastdeploy.input import video_utils
|
from fastdeploy.input.utils import video
|
||||||
|
|
||||||
class FakeWrapper:
|
class FakeWrapper:
|
||||||
def __init__(self, path, *args, **kwargs):
|
def __init__(self, path, *args, **kwargs):
|
||||||
@@ -229,7 +229,7 @@ class TestReadVideoDecord(unittest.TestCase):
|
|||||||
def get_avg_fps(self):
|
def get_avg_fps(self):
|
||||||
return 30.0
|
return 30.0
|
||||||
|
|
||||||
with patch.object(video_utils, "VideoReaderWrapper", FakeWrapper):
|
with patch.object(video, "VideoReaderWrapper", FakeWrapper):
|
||||||
reader, meta, path = read_video_decord("/fake/path.mp4")
|
reader, meta, path = read_video_decord("/fake/path.mp4")
|
||||||
|
|
||||||
self.assertEqual(meta["num_of_frame"], 60)
|
self.assertEqual(meta["num_of_frame"], 60)
|
||||||
@@ -333,18 +333,18 @@ class TestSampleFramesDispatcher(unittest.TestCase):
|
|||||||
META = {"num_of_frame": 100, "fps": 25.0}
|
META = {"num_of_frame": 100, "fps": 25.0}
|
||||||
|
|
||||||
def test_default_variant_is_paddleocr(self):
|
def test_default_variant_is_paddleocr(self):
|
||||||
with patch("fastdeploy.input.video_utils.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
|
with patch("fastdeploy.input.utils.video.sample_frames_paddleocr", wraps=sample_frames_paddleocr) as mock_fn:
|
||||||
sample_frames(1, 4, 100, self.META, num_frames=8)
|
sample_frames(1, 4, 100, self.META, num_frames=8)
|
||||||
mock_fn.assert_called_once()
|
mock_fn.assert_called_once()
|
||||||
|
|
||||||
def test_qwen_variant_dispatched(self):
|
def test_qwen_variant_dispatched(self):
|
||||||
with patch("fastdeploy.input.video_utils.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
|
with patch("fastdeploy.input.utils.video.sample_frames_qwen", wraps=sample_frames_qwen) as mock_fn:
|
||||||
sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
|
sample_frames(2, 4, 100, self.META, num_frames=8, variant="qwen")
|
||||||
mock_fn.assert_called_once()
|
mock_fn.assert_called_once()
|
||||||
|
|
||||||
def test_qwen_none_fps_converted_to_sentinel(self):
|
def test_qwen_none_fps_converted_to_sentinel(self):
|
||||||
"""None fps/num_frames → converted to -1 before calling sample_frames_qwen."""
|
"""None fps/num_frames → converted to -1 before calling sample_frames_qwen."""
|
||||||
with patch("fastdeploy.input.video_utils.sample_frames_qwen", return_value=np.array([])) as mock_fn:
|
with patch("fastdeploy.input.utils.video.sample_frames_qwen", return_value=np.array([])) as mock_fn:
|
||||||
sample_frames(2, 4, 100, self.META, fps=None, num_frames=None, variant="qwen")
|
sample_frames(2, 4, 100, self.META, fps=None, num_frames=None, variant="qwen")
|
||||||
args = mock_fn.call_args[0]
|
args = mock_fn.call_args[0]
|
||||||
self.assertEqual(args[4], -1) # fps sentinel
|
self.assertEqual(args[4], -1) # fps sentinel
|
||||||
|
|||||||
Reference in New Issue
Block a user