""" # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ from __future__ import annotations from paddleformers.transformers.configuration_utils import PretrainedConfig __all__ = [ "Qwen3VisionTransformerConfig", ] class Qwen3VisionTransformerConfig(PretrainedConfig): r"""Configuration for the Qwen3 vision encoder used in Qwen3-VL.""" model_type = "qwen3_vision_transformer" def __init__( self, depth: int = 27, hidden_size: int = 1152, hidden_act: str = "gelu_tanh", intermediate_size: int = 4304, num_heads: int = 16, in_channels: int = 3, patch_size: int = 16, spatial_merge_size: int = 2, temporal_patch_size: int = 2, out_hidden_size: int = 3584, num_position_embeddings: int = 2304, deepstack_visual_indexes: list[int] | None = None, initializer_range: float = 0.02, tokens_per_second: int = 2, **kwargs, ) -> None: super().__init__(**kwargs) self.depth = depth self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.hidden_act = hidden_act self.num_heads = num_heads self.in_channels = in_channels self.patch_size = patch_size self.spatial_merge_size = spatial_merge_size self.temporal_patch_size = temporal_patch_size self.out_hidden_size = out_hidden_size self.num_position_embeddings = num_position_embeddings self.initializer_range = initializer_range self.deepstack_visual_indexes = list(deepstack_visual_indexes or []) self.tokens_per_second = tokens_per_second