[Models] Add Qwen3-VL Model Support (#5763)

* support v1 loader * remove useless code * remove useless * [Model] support Qwen3VL images success * [Model] support Qwen3VL rope_3d * [Model] support Qwen3VL remove log * [Model] support Qwen3VL RL * [Model] support Qwen3VL tp * [Model] support Qwen3VL video * [Model] support Qwen3VL fix ernievl * [Model] support Qwen3VL fix get_image_boundaries.cc array out of bounds * [Model] support Qwen3VL fix multi card * [Model] support Qwen3VL file close * [Model] support Qwen3VL fix ce * [Model] support Qwen3VL fix unittest * [Model] support Qwen3VL add unittest --------- Co-authored-by: Ayakouji <yuhongh@qq.com>
2026-04-24 01:29:57 +08:00 · 2025-12-29 17:39:33 +08:00
parent a3f0696e35
commit 9286403570
36 changed files with 5917 additions and 91 deletions
@@ -19,6 +19,7 @@ import pickle
 from typing import Any, Dict, List, Optional, Tuple, Union

 import numpy as np
+import paddle
 import zmq
 from paddleformers.transformers import AutoTokenizer
 from PIL import Image
@@ -26,6 +27,7 @@ from PIL import Image
 from fastdeploy.engine.request import ImagePosition
 from fastdeploy.entrypoints.chat_utils import parse_chat_messages
 from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
+from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
 from fastdeploy.input.utils import IDS_TYPE_FLAG
 from fastdeploy.multimodal.hasher import MultimodalHasher
 from fastdeploy.utils import data_processor_logger
@@ -34,7 +36,7 @@ from .image_processor import ImageProcessor
 from .process_video import sample_frames


-class DataProcessor:
+class DataProcessor(MMBaseDataProcessor):
    """
    Processes multimodal inputs (text, images, videos) into model-ready formats.

@@ -74,6 +76,7 @@ class DataProcessor:
            tokens_per_second: Temporal resolution for positional embeddings
            **kwargs: Additional configuration
        """
+        super().__init__()
        self.min_frames = video_min_frames
        self.max_frames = video_max_frames
        self.target_frames = video_target_frames
@@ -112,6 +115,26 @@ class DataProcessor:
            "assistant": "Assistant: ",
        }

+    @staticmethod
+    def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]:
+        """
+        Count the tokens described by one grid or by a batch of grids.
+
+        Args:
+            grid_thw: Either a single ``(t, h, w)`` triple or a sequence of such
+                triples. A ``paddle.Tensor`` is converted to a NumPy array first.
+
+        Returns:
+            ``0`` when the input is empty, a single int for one triple, or a list
+            with one int per triple for a batch. Each count is ``t * h * w // 4``.
+        """
+        grids = grid_thw.numpy() if isinstance(grid_thw, paddle.Tensor) else grid_thw
+
+        # len() rather than truthiness: a NumPy array has no unambiguous bool value.
+        if len(grids) == 0:
+            return 0
+
+        def _tokens(triple):
+            frames, height, width = (int(v) for v in triple)
+            # NOTE(review): the divisor 4 presumably reflects a 2x2 spatial merge -- confirm.
+            return (frames * height * width) // 4
+
+        # A nested first element means a batch of grids, not a lone (t, h, w) triple.
+        batched = isinstance(grids[0], (list, tuple, np.ndarray))
+        if not batched:
+            return _tokens(grids)
+
+        return [_tokens(triple) for triple in grids]
+
+
    def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
        """
        Convert text with image/video placeholders into model inputs.