mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 17:11:21 +08:00
[DataProcessor] Refactor multimodal processor: extract encoding strategies and unify MM processing pipeline (#7298)
* merge mm processor
This commit is contained in:
@@ -0,0 +1,143 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""Per-model-type configuration for the unified MultiModalProcessor."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, Optional
|
||||
|
||||
# Registry keys for MODEL_CONFIGS: one string id per supported model family.
QWEN_VL = "qwen_vl"
QWEN3_VL = "qwen3_vl"
PADDLEOCR_VL = "paddleocr_vl"
ERNIE4_5_VL = "ernie4_5_vl"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class MMModelConfig:
    """Per-model-family knobs consumed by the unified MultiModalProcessor.

    Instances are immutable (frozen) and registered in ``MODEL_CONFIGS``,
    keyed by the model-type constants above.  Field order is part of the
    positional ``__init__`` signature — append new fields at the end only.
    """

    # ---- Prompt placeholders (required; no defaults) ----
    image_placeholder: str
    video_placeholder: str

    # ---- Tokenizer selection ----
    tokenizer_type: str = "auto"  # "auto" | "ernie4_5"

    # ---- Video frame-sampling defaults ----
    default_min_frames: int = 4
    default_max_frames: int = 768
    default_target_frames: int = -1  # -1 presumably means "no fixed target" — confirm against the processor
    default_fps: float = 2.0
    default_frames_sample: str = "leading"

    # ---- Request/sampling behavior flags ----
    has_bad_words: bool = True
    has_tool_role: bool = False  # ernie: role_prefixes includes "tool"
    default_thinking: bool = False  # ernie: default enable_thinking=True
    force_disable_thinking: bool = False  # qwen_vl, qwen3_vl: force enable_thinking=False
    set_default_reasoning_max_tokens: bool = False  # ernie: auto-set reasoning_max_tokens
    cap_response_max_tokens: bool = False  # ernie: cap max_tokens by response_max_tokens
    has_logits_processor_think: bool = False  # ernie: _prepare_think_stop_sentence

    # ---- Chat-template invocation ----
    chat_template_pass_request: bool = False  # ernie: pass full request obj

    # ---- prompt_token_ids handling ----
    supports_prompt_token_ids: bool = False  # qwen3, ernie
    preserve_prompt_token_ids: bool = False  # qwen3, ernie: don't overwrite existing

    # ---- Stop tokens ----
    stop_tokens_variant: str = "default"  # "default" | "qwen3"

    # ---- Special-token strings (empty = not used for this family) ----
    image_token_str: str = ""
    video_token_str: str = ""

    # ---- Extra per-request kwargs accepted for this family: name -> type ----
    expected_kwargs: Dict[str, type] = field(default_factory=dict)

    # ---- Video pixel budget (None = fall back to processor defaults) ----
    video_min_pixels: Optional[int] = None
    video_max_pixels: Optional[int] = None

    # ---- Conv params source ----
    conv_params_from_kwargs: bool = False  # ernie: from processor_kwargs; else: from image_processor

    # ---- tokens_per_second ----
    has_tokens_per_second: bool = True  # qwen-family: read from config; ernie: False
|
||||
|
||||
|
||||
# Extra request kwargs accepted by the qwen-family processors: name -> expected type.
_QWEN_KWARGS = dict(
    video_max_frames=int,
    video_min_frames=int,
)
|
||||
|
||||
# Extra request kwargs accepted by the ernie4_5_vl processor: name -> expected type.
# NOTE(review): "video_fps" is declared int while MMModelConfig.default_fps is a
# float — confirm whether fractional fps should be accepted here.
_ERNIE_KWARGS = dict(
    spatial_conv_size=int,
    temporal_conv_size=int,
    image_min_pixels=int,
    image_max_pixels=int,
    video_min_pixels=int,
    video_max_pixels=int,
    video_target_frames=int,
    video_frames_sample=str,
    video_max_frames=int,
    video_min_frames=int,
    video_fps=int,
)
|
||||
|
||||
|
||||
# Registry of per-family configs, keyed by the model-type constants above.
# Lookups elsewhere select the MMModelConfig that drives the unified pipeline.
MODEL_CONFIGS: Dict[str, MMModelConfig] = {
    QWEN_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        expected_kwargs=_QWEN_KWARGS,
    ),
    QWEN3_VL: MMModelConfig(
        image_placeholder="<|image_pad|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|image_pad|>",
        video_token_str="<|video_pad|>",
        force_disable_thinking=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        stop_tokens_variant="qwen3",
        # Pixel budget expressed in multiples of a 28x28 patch.
        video_min_pixels=128 * 28 * 28,
        video_max_pixels=768 * 28 * 28,
        expected_kwargs=_QWEN_KWARGS,
    ),
    PADDLEOCR_VL: MMModelConfig(
        image_placeholder="<|IMAGE_PLACEHOLDER|>",
        video_placeholder="<|video_pad|>",
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        video_token_str="<|video_pad|>",
        has_bad_words=False,
        default_fps=-1.0,  # negative presumably disables fps-based sampling — confirm in processor
        expected_kwargs=_QWEN_KWARGS,
    ),
    ERNIE4_5_VL: MMModelConfig(
        image_placeholder="<|image@placeholder|>",
        video_placeholder="<|video@placeholder|>",
        tokenizer_type="ernie4_5",
        default_min_frames=16,
        default_max_frames=180,
        default_fps=2.0,
        default_frames_sample="leading",
        has_tool_role=True,
        default_thinking=True,
        set_default_reasoning_max_tokens=True,
        cap_response_max_tokens=True,
        has_logits_processor_think=True,
        chat_template_pass_request=True,
        supports_prompt_token_ids=True,
        preserve_prompt_token_ids=True,
        image_token_str="<|IMAGE_PLACEHOLDER|>",
        # NOTE(review): video_token_str reuses the IMAGE placeholder, while
        # video_placeholder is "<|video@placeholder|>" — confirm intentional.
        video_token_str="<|IMAGE_PLACEHOLDER|>",
        conv_params_from_kwargs=True,
        has_tokens_per_second=False,
        expected_kwargs=_ERNIE_KWARGS,
    ),
}
|
||||
Reference in New Issue
Block a user