# Source file: FastDeploy/fastdeploy/input/mm_model_config.py (Python, 144 lines, 4.7 KiB)
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Per-model-type configuration for the unified MultiModalProcessor."""
from dataclasses import dataclass, field
from typing import Dict, Optional
# Model-type keys; each is a lookup key into MODEL_CONFIGS below.
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"
@dataclass(frozen=True)
class MMModelConfig:
    """Immutable per-model-type configuration for the unified MultiModalProcessor.

    Bundles the media placeholder strings, video frame-sampling defaults, and
    the feature flags that encode model-family-specific processing behavior
    (qwen-family vs. ernie vs. paddleocr). Instances are frozen so a config
    registered in MODEL_CONFIGS cannot be mutated at runtime.
    """

    # ---- Media placeholders substituted into the prompt text ----
    image_placeholder: str
    video_placeholder: str
    tokenizer_type: str = "auto"  # "auto" | "ernie4_5"
    # ---- Video frame-sampling defaults ----
    default_min_frames: int = 4
    default_max_frames: int = 768
    # -1 presumably means "no fixed target frame count" — TODO confirm against consumer
    default_target_frames: int = -1
    default_fps: float = 2.0
    default_frames_sample: str = "leading"
    # ---- Per-model-family feature flags ----
    has_bad_words: bool = True
    has_tool_role: bool = False  # ernie: role_prefixes includes "tool"
    default_thinking: bool = False  # ernie: default enable_thinking=True
    force_disable_thinking: bool = False  # qwen_vl, qwen3_vl: force enable_thinking=False
    set_default_reasoning_max_tokens: bool = False  # ernie: auto-set reasoning_max_tokens
    cap_response_max_tokens: bool = False  # ernie: cap max_tokens by response_max_tokens
    has_logits_processor_think: bool = False  # ernie: _prepare_think_stop_sentence
    chat_template_pass_request: bool = False  # ernie: pass full request obj
    supports_prompt_token_ids: bool = False  # qwen3, ernie
    preserve_prompt_token_ids: bool = False  # qwen3, ernie: don't overwrite existing
    stop_tokens_variant: str = "default"  # "default" | "qwen3"
    # ---- Token strings used when counting/replacing media tokens ----
    image_token_str: str = ""
    video_token_str: str = ""
    # Mapping of accepted extra processor kwargs -> expected value type.
    expected_kwargs: Dict[str, type] = field(default_factory=dict)
    # Optional per-model video pixel bounds; None means "no override".
    video_min_pixels: Optional[int] = None
    video_max_pixels: Optional[int] = None
    # ---- Conv params source ----
    conv_params_from_kwargs: bool = False  # ernie: from processor_kwargs; else: from image_processor
    # ---- tokens_per_second ----
    has_tokens_per_second: bool = True  # qwen-family: read from config; ernie: False
# Extra processor kwargs accepted by the qwen-family processors, mapped to the
# type each value is expected to have.
_QWEN_KWARGS = dict(
    video_max_frames=int,
    video_min_frames=int,
)
# Extra processor kwargs accepted by the ernie4_5_vl processor, mapped to the
# type each value is expected to have.
# NOTE(review): "video_fps" is declared int here while MMModelConfig.default_fps
# is a float — confirm whether fractional fps should be accepted.
_ERNIE_KWARGS = dict(
    spatial_conv_size=int,
    temporal_conv_size=int,
    image_min_pixels=int,
    image_max_pixels=int,
    video_min_pixels=int,
    video_max_pixels=int,
    video_target_frames=int,
    video_frames_sample=str,
    video_max_frames=int,
    video_min_frames=int,
    video_fps=int,
)
# Registry of per-model-type configs, keyed by the module-level model-type
# constants. Consumers look up their MMModelConfig here by model type.
MODEL_CONFIGS: Dict[str, MMModelConfig] = {
    QWEN_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        expected_kwargs=_QWEN_KWARGS,
    ),
    QWEN3_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        stop_tokens_variant="qwen3",
        # Pixel bounds expressed in multiples of 28x28 patches.
        video_min_pixels=128 * 28 * 28,
        video_max_pixels=768 * 28 * 28,
        expected_kwargs=_QWEN_KWARGS,
    ),
    PADDLEOCR_VL: MMModelConfig(
        image_placeholder="<|IMAGE_PLACEHOLDER|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        video_token_str="<|video_pad|>",
        has_bad_words=False,
        default_fps=-1.0,  # negative fps presumably disables fps-based sampling — confirm
        expected_kwargs=_QWEN_KWARGS,
    ),
    ERNIE4_5_VL: MMModelConfig(
        image_placeholder="<|image@placeholder|>",
        video_placeholder="<|video@placeholder|>",
        tokenizer_type="ernie4_5",
        default_min_frames=16,
        default_max_frames=180,
        default_fps=2.0,
        default_frames_sample="leading",
        has_tool_role=True,
        default_thinking=True,
        set_default_reasoning_max_tokens=True,
        cap_response_max_tokens=True,
        has_logits_processor_think=True,
        chat_template_pass_request=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        # NOTE(review): video_token_str reuses the image token here — looks
        # intentional if ernie expands video frames into image placeholders,
        # but verify it is not a copy-paste of the line above.
        video_token_str="<|IMAGE_PLACEHOLDER|>",
        conv_params_from_kwargs=True,
        has_tokens_per_second=False,
        expected_kwargs=_ERNIE_KWARGS,
    ),
}