mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 17:11:21 +08:00
[DataProcessor] Refactor multimodal processor: extract encoding strategies and unify MM processing pipeline (#7298)
* merge mm processor
This commit is contained in:
@@ -0,0 +1,143 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Per-model-type configuration for the unified MultiModalProcessor."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Optional
|
||||
|
||||
# Registry keys for MODEL_CONFIGS: one string id per supported model family.
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MMModelConfig:
    """Per-model-family knobs consumed by the unified MultiModalProcessor.

    Instances are immutable (frozen) and registered in ``MODEL_CONFIGS``,
    keyed by the model-type constants above.  Field order is part of the
    positional ``__init__`` signature — append new fields at the end only.
    """

    # ---- Prompt placeholders (required; no defaults) ----
    image_placeholder: str
    video_placeholder: str

    # ---- Tokenizer selection ----
    tokenizer_type: str = "auto"  # "auto" | "ernie4_5"

    # ---- Video frame-sampling defaults ----
    default_min_frames: int = 4
    default_max_frames: int = 768
    default_target_frames: int = -1  # -1 presumably means "no fixed target" — confirm against the processor
    default_fps: float = 2.0
    default_frames_sample: str = "leading"

    # ---- Request/sampling behavior flags ----
    has_bad_words: bool = True
    has_tool_role: bool = False  # ernie: role_prefixes includes "tool"
    default_thinking: bool = False  # ernie: default enable_thinking=True
    force_disable_thinking: bool = False  # qwen_vl, qwen3_vl: force enable_thinking=False
    set_default_reasoning_max_tokens: bool = False  # ernie: auto-set reasoning_max_tokens
    cap_response_max_tokens: bool = False  # ernie: cap max_tokens by response_max_tokens
    has_logits_processor_think: bool = False  # ernie: _prepare_think_stop_sentence

    # ---- Chat-template invocation ----
    chat_template_pass_request: bool = False  # ernie: pass full request obj

    # ---- prompt_token_ids handling ----
    supports_prompt_token_ids: bool = False  # qwen3, ernie
    preserve_prompt_token_ids: bool = False  # qwen3, ernie: don't overwrite existing

    # ---- Stop tokens ----
    stop_tokens_variant: str = "default"  # "default" | "qwen3"

    # ---- Special-token strings (empty = not used for this family) ----
    image_token_str: str = ""
    video_token_str: str = ""

    # ---- Extra per-request kwargs accepted for this family: name -> type ----
    expected_kwargs: Dict[str, type] = field(default_factory=dict)

    # ---- Video pixel budget (None = fall back to processor defaults) ----
    video_min_pixels: Optional[int] = None
    video_max_pixels: Optional[int] = None

    # ---- Conv params source ----
    conv_params_from_kwargs: bool = False  # ernie: from processor_kwargs; else: from image_processor

    # ---- tokens_per_second ----
    has_tokens_per_second: bool = True  # qwen-family: read from config; ernie: False
|
||||
|
||||
|
||||
# Extra request kwargs accepted by the qwen-family processors: name -> expected type.
_QWEN_KWARGS = dict(
    video_max_frames=int,
    video_min_frames=int,
)
|
||||
|
||||
# Extra request kwargs accepted by the ernie4_5_vl processor: name -> expected type.
# NOTE(review): "video_fps" is declared int while MMModelConfig.default_fps is a
# float — confirm whether fractional fps should be accepted here.
_ERNIE_KWARGS = dict(
    spatial_conv_size=int,
    temporal_conv_size=int,
    image_min_pixels=int,
    image_max_pixels=int,
    video_min_pixels=int,
    video_max_pixels=int,
    video_target_frames=int,
    video_frames_sample=str,
    video_max_frames=int,
    video_min_frames=int,
    video_fps=int,
)
|
||||
|
||||
|
||||
# Registry of per-family configs, keyed by the model-type constants above.
# Lookups elsewhere select the MMModelConfig that drives the unified pipeline.
MODEL_CONFIGS: Dict[str, MMModelConfig] = {
    QWEN_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        expected_kwargs=_QWEN_KWARGS,
    ),
    QWEN3_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        stop_tokens_variant="qwen3",
        # Pixel budget expressed in multiples of a 28x28 patch.
        video_min_pixels=128 * 28 * 28,
        video_max_pixels=768 * 28 * 28,
        expected_kwargs=_QWEN_KWARGS,
    ),
    PADDLEOCR_VL: MMModelConfig(
        image_placeholder="<|IMAGE_PLACEHOLDER|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        video_token_str="<|video_pad|>",
        has_bad_words=False,
        default_fps=-1.0,  # negative presumably disables fps-based sampling — confirm in processor
        expected_kwargs=_QWEN_KWARGS,
    ),
    ERNIE4_5_VL: MMModelConfig(
        image_placeholder="<|image@placeholder|>",
        video_placeholder="<|video@placeholder|>",
        tokenizer_type="ernie4_5",
        default_min_frames=16,
        default_max_frames=180,
        default_fps=2.0,
        default_frames_sample="leading",
        has_tool_role=True,
        default_thinking=True,
        set_default_reasoning_max_tokens=True,
        cap_response_max_tokens=True,
        has_logits_processor_think=True,
        chat_template_pass_request=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        # NOTE(review): video_token_str reuses the IMAGE placeholder, while
        # video_placeholder is "<|video@placeholder|>" — confirm intentional.
        video_token_str="<|IMAGE_PLACEHOLDER|>",
        conv_params_from_kwargs=True,
        has_tokens_per_second=False,
        expected_kwargs=_ERNIE_KWARGS,
    ),
}
|
||||
Reference in New Issue
Block a user