mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
3f84d8d893
* merge mm processor
144 lines
4.7 KiB
Python
144 lines
4.7 KiB
Python
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
|
|
|
"""Per-model-type configuration for the unified MultiModalProcessor."""
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Dict, Optional
|
|
|
|
# Model-type identifiers. Each one is used as a key of MODEL_CONFIGS.
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"
|
|
|
|
|
|
@dataclass(frozen=True)
class MMModelConfig:
    """Immutable bundle of per-model-type settings for the unified MultiModalProcessor.

    One instance describes a single model family (see ``MODEL_CONFIGS``).
    Only ``image_placeholder`` and ``video_placeholder`` are required; every
    other field has a default. Field order is part of the positional
    constructor interface and must not be changed.

    The inline comments name the model families that override a default.
    How each flag is interpreted is decided by the processor code, which is
    not part of this module.
    """

    # ---- Placeholders (required) ----
    image_placeholder: str
    video_placeholder: str

    # ---- Tokenizer ----
    tokenizer_type: str = "auto"  # "auto" | "ernie4_5"

    # ---- Video frame sampling defaults ----
    default_min_frames: int = 4
    default_max_frames: int = 768
    default_target_frames: int = -1
    default_fps: float = 2.0
    default_frames_sample: str = "leading"

    # ---- Request-processing behaviour flags ----
    has_bad_words: bool = True
    has_tool_role: bool = False  # ernie: role_prefixes includes "tool"
    default_thinking: bool = False  # ernie: default enable_thinking=True
    force_disable_thinking: bool = False  # qwen_vl, qwen3_vl: force enable_thinking=False
    set_default_reasoning_max_tokens: bool = False  # ernie: auto-set reasoning_max_tokens
    cap_response_max_tokens: bool = False  # ernie: cap max_tokens by response_max_tokens
    has_logits_processor_think: bool = False  # ernie: _prepare_think_stop_sentence

    # ---- Chat template ----
    chat_template_pass_request: bool = False  # ernie: pass full request obj

    # ---- prompt_token_ids handling ----
    supports_prompt_token_ids: bool = False  # qwen3, ernie
    preserve_prompt_token_ids: bool = False  # qwen3, ernie: don't overwrite existing

    # ---- Stop tokens ----
    stop_tokens_variant: str = "default"  # "default" | "qwen3"

    # ---- Image / video token strings ----
    image_token_str: str = ""
    video_token_str: str = ""

    # ---- Expected kwargs: kwarg name -> expected Python type ----
    # Excluded from the generated __hash__ (hash=False): a dict is unhashable,
    # so including it would make hash() raise TypeError for every instance.
    # The field still takes part in == comparisons.
    expected_kwargs: Dict[str, type] = field(default_factory=dict, hash=False)

    # ---- Video pixel bounds (None means no model-level value is set here) ----
    video_min_pixels: Optional[int] = None
    video_max_pixels: Optional[int] = None

    # ---- Conv params source ----
    conv_params_from_kwargs: bool = False  # ernie: from processor_kwargs; else: from image_processor

    # ---- tokens_per_second ----
    has_tokens_per_second: bool = True  # qwen-family: read from config; ernie: False
|
|
|
|
|
|
# Kwarg name -> expected Python type, used as ``expected_kwargs`` by the
# qwen_vl, qwen3_vl and paddleocr_vl entries of MODEL_CONFIGS.
_QWEN_KWARGS = {
    "video_max_frames": int,
    "video_min_frames": int,
}
|
|
|
|
# Kwarg name -> expected Python type, used as ``expected_kwargs`` by the
# ernie4_5_vl entry of MODEL_CONFIGS.
_ERNIE_KWARGS = {
    "spatial_conv_size": int,
    "temporal_conv_size": int,
    "image_min_pixels": int,
    "image_max_pixels": int,
    "video_min_pixels": int,
    "video_max_pixels": int,
    "video_target_frames": int,
    "video_frames_sample": str,
    "video_max_frames": int,
    "video_min_frames": int,
    # NOTE(review): declared as int although MMModelConfig.default_fps is a
    # float; confirm whether fractional fps overrides should be accepted.
    "video_fps": int,
}
|
|
|
|
|
|
# Registry mapping each model-type identifier to its MMModelConfig.
# Each entry receives its own copy of the kwargs template: MMModelConfig is
# frozen, but the dict it holds is still mutable, and sharing one dict object
# would let a mutation through one config leak into the others.
MODEL_CONFIGS: Dict[str, MMModelConfig] = {
    QWEN_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        expected_kwargs=dict(_QWEN_KWARGS),
    ),
    QWEN3_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        stop_tokens_variant="qwen3",
        video_min_pixels=128 * 28 * 28,
        video_max_pixels=768 * 28 * 28,
        expected_kwargs=dict(_QWEN_KWARGS),
    ),
    PADDLEOCR_VL: MMModelConfig(
        image_placeholder="<|IMAGE_PLACEHOLDER|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        video_token_str="<|video_pad|>",
        has_bad_words=False,
        default_fps=-1.0,
        expected_kwargs=dict(_QWEN_KWARGS),
    ),
    ERNIE4_5_VL: MMModelConfig(
        image_placeholder="<|image@placeholder|>",
        video_placeholder="<|video@placeholder|>",
        tokenizer_type="ernie4_5",
        default_min_frames=16,
        default_max_frames=180,
        default_fps=2.0,
        default_frames_sample="leading",
        has_tool_role=True,
        default_thinking=True,
        set_default_reasoning_max_tokens=True,
        cap_response_max_tokens=True,
        has_logits_processor_think=True,
        chat_template_pass_request=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        video_token_str="<|IMAGE_PLACEHOLDER|>",
        conv_params_from_kwargs=True,
        has_tokens_per_second=False,
        expected_kwargs=dict(_ERNIE_KWARGS),
    ),
}
|