mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 17:11:21 +08:00
9286403570
* support v1 loader * remove useless code * remove useless * [Model] support Qwen3VL images success * [Model] support Qwen3VL rope_3d * [Model] support Qwen3VL remove log * [Model] support Qwen3VL RL * [Model] support Qwen3VL tp * [Model] support Qwen3VL video * [Model] support Qwen3VL fix ernievl * [Model] support Qwen3VL fix get_image_boundaries.cc array out of bounds * [Model] support Qwen3VL fix multi card * [Model] support Qwen3VL file close * [Model] support Qwen3VL fix ce * [Model] support Qwen3VL fix unittest * [Model] support Qwen3VL add unittest --------- Co-authored-by: Ayakouji <yuhongh@qq.com>
65 lines
2.2 KiB
Python
65 lines
2.2 KiB
Python
"""
|
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from paddleformers.transformers.configuration_utils import PretrainedConfig
|
|
|
|
__all__ = [
|
|
"Qwen3VisionTransformerConfig",
|
|
]
|
|
|
|
|
|
class Qwen3VisionTransformerConfig(PretrainedConfig):
    r"""Configuration for the Qwen3 vision encoder used in Qwen3-VL.

    A plain value holder: every constructor argument is stored verbatim as an
    instance attribute; unrecognized keyword arguments are forwarded to
    ``PretrainedConfig``.

    Args:
        depth: Number of transformer layers in the vision encoder.
        hidden_size: Width of the encoder's hidden states.
        hidden_act: Name of the activation function (default ``"gelu_tanh"``).
        intermediate_size: Width of the feed-forward (MLP) layer.
        num_heads: Number of attention heads.
        in_channels: Input image channels (3 for RGB).
        patch_size: Spatial patch edge length, in pixels.
        spatial_merge_size: Factor by which adjacent patches are merged spatially.
        temporal_patch_size: Number of video frames grouped into one patch.
        out_hidden_size: Width of the projected output fed to the language model.
        num_position_embeddings: Number of learned position embeddings.
        deepstack_visual_indexes: Encoder layer indexes whose features are tapped
            for deep-stack fusion; ``None`` is normalized to an empty list.
        initializer_range: Std-dev used for weight initialization.
        tokens_per_second: Temporal token rate used for video position encoding
            (presumably; confirm against the rope-3d caller).
        **kwargs: Passed through to ``PretrainedConfig``.
    """

    model_type = "qwen3_vision_transformer"

    def __init__(
        self,
        depth: int = 27,
        hidden_size: int = 1152,
        hidden_act: str = "gelu_tanh",
        intermediate_size: int = 4304,
        num_heads: int = 16,
        in_channels: int = 3,
        patch_size: int = 16,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 2,
        out_hidden_size: int = 3584,
        num_position_embeddings: int = 2304,
        deepstack_visual_indexes: list[int] | None = None,
        initializer_range: float = 0.02,
        tokens_per_second: int = 2,
        **kwargs,
    ) -> None:
        # Let the base class consume any extra serialization/config kwargs first.
        super().__init__(**kwargs)

        # Store every argument verbatim, in signature order.
        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.out_hidden_size = out_hidden_size
        self.num_position_embeddings = num_position_embeddings
        # Normalize to an owned list so None (and any falsy input) becomes [],
        # and a caller-supplied sequence is copied rather than aliased.
        self.deepstack_visual_indexes = (
            list(deepstack_visual_indexes) if deepstack_visual_indexes else []
        )
        self.initializer_range = initializer_range
        self.tokens_per_second = tokens_per_second