mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 08:21:53 +08:00
[Models] Add Qwen3-VL Model Support (#5763)
* support v1 loader * remove useless code * remove useless * [Model] support Qwen3VL images success * [Model] support Qwen3VL rope_3d * [Model] support Qwen3VL remove log * [Model] support Qwen3VL RL * [Model] support Qwen3VL tp * [Model] support Qwen3VL video * [Model] support Qwen3VL fix ernievl * [Model] support Qwen3VL fix get_image_boundaries.cc array out of bounds * [Model] support Qwen3VL fix multi card * [Model] support Qwen3VL file close * [Model] support Qwen3VL fix ce * [Model] support Qwen3VL fix unittest * [Model] support Qwen3VL add unittest --------- Co-authored-by: Ayakouji <yuhongh@qq.com>
This commit is contained in:
@@ -19,6 +19,7 @@ import pickle
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import zmq
|
||||
from paddleformers.transformers import AutoTokenizer
|
||||
from PIL import Image
|
||||
@@ -26,6 +27,7 @@ from PIL import Image
|
||||
from fastdeploy.engine.request import ImagePosition
|
||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||
from fastdeploy.input.ernie4_5_vl_processor import read_video_decord
|
||||
from fastdeploy.input.mm_data_processor import MMBaseDataProcessor
|
||||
from fastdeploy.input.utils import IDS_TYPE_FLAG
|
||||
from fastdeploy.multimodal.hasher import MultimodalHasher
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
@@ -33,8 +35,13 @@ from fastdeploy.utils import data_processor_logger
|
||||
from .image_processor import ImageProcessor
|
||||
from .process_video import sample_frames
|
||||
|
||||
FRAME_FACTOR = 2
|
||||
FPS = 2.0
|
||||
FPS_MIN_FRAMES = 4
|
||||
FPS_MAX_FRAMES = 768
|
||||
|
||||
class DataProcessor:
|
||||
|
||||
class DataProcessor(MMBaseDataProcessor):
|
||||
"""
|
||||
Processes multimodal inputs (text, images, videos) into model-ready formats.
|
||||
|
||||
@@ -56,10 +63,10 @@ class DataProcessor:
|
||||
self,
|
||||
model_path: str,
|
||||
enable_processor_cache: bool = False,
|
||||
video_min_frames: int = 4,
|
||||
video_max_frames: int = 768,
|
||||
video_min_frames: int = FPS_MIN_FRAMES,
|
||||
video_max_frames: int = FPS_MAX_FRAMES,
|
||||
video_target_frames: int = -1,
|
||||
video_fps: int = -1,
|
||||
video_fps: int = FPS,
|
||||
tokens_per_second: int = 2,
|
||||
tokenizer=None,
|
||||
**kwargs,
|
||||
@@ -74,10 +81,12 @@ class DataProcessor:
|
||||
tokens_per_second: Temporal resolution for positional embeddings
|
||||
**kwargs: Additional configuration
|
||||
"""
|
||||
super().__init__()
|
||||
self.min_frames = video_min_frames
|
||||
self.max_frames = video_max_frames
|
||||
self.target_frames = video_target_frames
|
||||
self.fps = video_fps
|
||||
self.frame_factor = FRAME_FACTOR
|
||||
|
||||
# Initialize tokenizer with left padding and fast tokenizer
|
||||
if tokenizer is None:
|
||||
@@ -111,6 +120,26 @@ class DataProcessor:
|
||||
"assistant": "Assistant: ",
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def mm_num_tokens(grid_thw: list | list[list[int]] | np.ndarray | paddle.Tensor) -> int | list[int]:
|
||||
"""
|
||||
Calculate the number of tokens in the multimodal input.
|
||||
"""
|
||||
if isinstance(grid_thw, paddle.Tensor):
|
||||
grid_thw = grid_thw.numpy()
|
||||
|
||||
if len(grid_thw) == 0:
|
||||
return 0
|
||||
|
||||
def calc_one(thw):
|
||||
t, h, w = map(int, thw)
|
||||
return t * h * w // 4
|
||||
|
||||
if isinstance(grid_thw[0], (list, tuple, np.ndarray)):
|
||||
return [calc_one(x) for x in grid_thw]
|
||||
|
||||
return calc_one(grid_thw)
|
||||
|
||||
def text2ids(self, text, images=None, videos=None, image_uuid=None, video_uuid=None):
|
||||
"""
|
||||
Convert text with image/video placeholders into model inputs.
|
||||
@@ -410,7 +439,9 @@ class DataProcessor:
|
||||
grid_thw = ret["grid_thw"].tolist()
|
||||
|
||||
outputs["mm_positions"].append(ImagePosition(len(outputs["input_ids"]), num_tokens))
|
||||
outputs["input_ids"].extend([self.video_token_id] * num_tokens)
|
||||
# Hack code. In order to adapt to the framework, only image_token can be passed
|
||||
# The correct way should be to use [self.video_token_id] * num_tokens
|
||||
outputs["input_ids"].extend([self.image_token_id] * num_tokens)
|
||||
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["video"]] * num_tokens)
|
||||
outputs["num_input_video_tokens"] += int(num_tokens)
|
||||
|
||||
@@ -514,7 +545,7 @@ class DataProcessor:
|
||||
|
||||
# Sample frames according to specifications
|
||||
frame_indices = sample_frames(
|
||||
frame_factor=self.temporal_conv_size, # Ensure divisible by temporal patch size
|
||||
frame_factor=self.frame_factor, # Ensure divisible by temporal patch size
|
||||
min_frames=min_frames,
|
||||
max_frames=max_frames,
|
||||
metadata=meta,
|
||||
|
||||
Reference in New Issue
Block a user