# Source file: FastDeploy/fastdeploy/input/mm_model_config.py (Python, 144 lines, 4.7 KiB)
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Per-model-type configuration for the unified MultiModalProcessor."""
from dataclasses import dataclass, field
from typing import Dict, Optional
# Model-type keys; each is a lookup key into MODEL_CONFIGS below.
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"
@dataclass(frozen=True)
class MMModelConfig:
    """Immutable per-model-type configuration for the unified MultiModalProcessor.

    Bundles the media placeholder strings, video frame-sampling defaults, and
    the feature flags that encode model-family-specific processing behavior
    (qwen-family vs. ernie vs. paddleocr). Instances are frozen so a config
    registered in MODEL_CONFIGS cannot be mutated at runtime.
    """

    # ---- Media placeholders substituted into the prompt text ----
    image_placeholder: str
    video_placeholder: str
    tokenizer_type: str = "auto"  # "auto" | "ernie4_5"
    # ---- Video frame-sampling defaults ----
    default_min_frames: int = 4
    default_max_frames: int = 768
    # -1 presumably means "no fixed target frame count" — TODO confirm against consumer
    default_target_frames: int = -1
    default_fps: float = 2.0
    default_frames_sample: str = "leading"
    # ---- Per-model-family feature flags ----
    has_bad_words: bool = True
    has_tool_role: bool = False  # ernie: role_prefixes includes "tool"
    default_thinking: bool = False  # ernie: default enable_thinking=True
    force_disable_thinking: bool = False  # qwen_vl, qwen3_vl: force enable_thinking=False
    set_default_reasoning_max_tokens: bool = False  # ernie: auto-set reasoning_max_tokens
    cap_response_max_tokens: bool = False  # ernie: cap max_tokens by response_max_tokens
    has_logits_processor_think: bool = False  # ernie: _prepare_think_stop_sentence
    chat_template_pass_request: bool = False  # ernie: pass full request obj
    supports_prompt_token_ids: bool = False  # qwen3, ernie
    preserve_prompt_token_ids: bool = False  # qwen3, ernie: don't overwrite existing
    stop_tokens_variant: str = "default"  # "default" | "qwen3"
    # ---- Token strings used when counting/replacing media tokens ----
    image_token_str: str = ""
    video_token_str: str = ""
    # Mapping of accepted extra processor kwargs -> expected value type.
    expected_kwargs: Dict[str, type] = field(default_factory=dict)
    # Optional per-model video pixel bounds; None means "no override".
    video_min_pixels: Optional[int] = None
    video_max_pixels: Optional[int] = None
    # ---- Conv params source ----
    conv_params_from_kwargs: bool = False  # ernie: from processor_kwargs; else: from image_processor
    # ---- tokens_per_second ----
    has_tokens_per_second: bool = True  # qwen-family: read from config; ernie: False
# Extra processor kwargs accepted by the qwen-family processors, mapped to the
# type each value is expected to have.
_QWEN_KWARGS = dict(
    video_max_frames=int,
    video_min_frames=int,
)
# Extra processor kwargs accepted by the ernie4_5_vl processor, mapped to the
# type each value is expected to have.
# NOTE(review): "video_fps" is declared int here while MMModelConfig.default_fps
# is a float — confirm whether fractional fps should be accepted.
_ERNIE_KWARGS = dict(
    spatial_conv_size=int,
    temporal_conv_size=int,
    image_min_pixels=int,
    image_max_pixels=int,
    video_min_pixels=int,
    video_max_pixels=int,
    video_target_frames=int,
    video_frames_sample=str,
    video_max_frames=int,
    video_min_frames=int,
    video_fps=int,
)
# Registry of per-model-type configs, keyed by the module-level model-type
# constants. Consumers look up their MMModelConfig here by model type.
MODEL_CONFIGS: Dict[str, MMModelConfig] = {
    QWEN_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        expected_kwargs=_QWEN_KWARGS,
    ),
    QWEN3_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        stop_tokens_variant="qwen3",
        # Pixel bounds expressed in multiples of 28x28 patches.
        video_min_pixels=128 * 28 * 28,
        video_max_pixels=768 * 28 * 28,
        expected_kwargs=_QWEN_KWARGS,
    ),
    PADDLEOCR_VL: MMModelConfig(
        image_placeholder="<|IMAGE_PLACEHOLDER|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        video_token_str="<|video_pad|>",
        has_bad_words=False,
        default_fps=-1.0,  # negative fps presumably disables fps-based sampling — confirm
        expected_kwargs=_QWEN_KWARGS,
    ),
    ERNIE4_5_VL: MMModelConfig(
        image_placeholder="<|image@placeholder|>",
        video_placeholder="<|video@placeholder|>",
        tokenizer_type="ernie4_5",
        default_min_frames=16,
        default_max_frames=180,
        default_fps=2.0,
        default_frames_sample="leading",
        has_tool_role=True,
        default_thinking=True,
        set_default_reasoning_max_tokens=True,
        cap_response_max_tokens=True,
        has_logits_processor_think=True,
        chat_template_pass_request=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        # NOTE(review): video_token_str reuses the image token here — looks
        # intentional if ernie expands video frames into image placeholders,
        # but verify it is not a copy-paste of the line above.
        video_token_str="<|IMAGE_PLACEHOLDER|>",
        conv_params_from_kwargs=True,
        has_tokens_per_second=False,
        expected_kwargs=_ERNIE_KWARGS,
    ),
}