mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-24 17:49:42 +08:00
[Models] Add Qwen3-VL Model Support (#5763)
* Support v1 loader; remove unused code
* [Model] Qwen3VL: image inputs working
* [Model] Qwen3VL: rope_3d support
* [Model] Qwen3VL: remove debug logging
* [Model] Qwen3VL: RL support
* [Model] Qwen3VL: tensor-parallel (tp) support
* [Model] Qwen3VL: video support
* [Model] Qwen3VL: fix ernievl regression
* [Model] Qwen3VL: fix get_image_boundaries.cc array out-of-bounds
* [Model] Qwen3VL: fix multi-card execution
* [Model] Qwen3VL: fix unclosed file handle
* [Model] Qwen3VL: fix CE
* [Model] Qwen3VL: fix unit tests; add unit tests

---------

Co-authored-by: Ayakouji <yuhongh@qq.com>
This commit is contained in:
@@ -23,6 +23,7 @@ from collections import defaultdict
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import zmq
|
||||
from paddleformers.transformers.image_utils import ChannelDimension
|
||||
from PIL import Image
|
||||
@@ -30,6 +31,7 @@ from PIL import Image
|
||||
from fastdeploy.engine.request import ImagePosition
|
||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
|
||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
@@ -70,7 +72,7 @@ def fancy_print(input_ids, tokenizer, image_patch_id=None):
|
||||
return res
|
||||
|
||||
|
||||
class DataProcessor:
|
||||
class DataProcessor(MMBaseDataProcessor):
|
||||
"""
|
||||
Processes multimodal chat messages into model-ready inputs,
|
||||
handling text, images, and videos with 3D positional embeddings.
|
||||
@@ -102,6 +104,7 @@ class DataProcessor:
|
||||
video_fps: int = 2,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
# Tokenizer and image preprocessor
|
||||
self.model_name_or_path = tokenizer_name
|
||||
self._load_tokenizer()
|
||||
@@ -152,6 +155,29 @@ class DataProcessor:
|
||||
"tool": "Tool: ",
|
||||
}
|
||||
|
||||
@staticmethod
def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]:
    """
    Calculate the number of tokens in the multimodal input.

    ``grid_thw`` is either a single ``(t, h, w)`` patch-grid triple or a
    sequence of such triples (one per image/video). Returns a single count
    for a lone triple, or a list of counts for a batch; an empty input
    yields 0.
    """
    # Normalize a paddle tensor to numpy so plain indexing/len work below.
    if isinstance(grid_thw, paddle.Tensor):
        grid_thw = grid_thw.numpy()

    if len(grid_thw) == 0:
        return 0

    def token_count(entry):
        # entry is one (t, h, w) triple of patch-grid dimensions.
        frames, height, width = (int(v) for v in entry)
        patches = frames * height * width // 4
        # Multi-frame (video) inputs are merged further along the time axis,
        # halving the token count; single images keep all spatial tokens.
        return patches if frames == 1 else patches // 2

    # A sequence of triples -> per-item counts; a bare triple -> one count.
    if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
        return [token_count(entry) for entry in grid_thw]

    return token_count(grid_thw)
|
||||
|
||||
def _build_token_type_mapping(self) -> Dict[Any, int]:
|
||||
mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"])
|
||||
for token in (
|
||||
|
||||
Reference in New Issue
Block a user