[Feature] Add Qwen25-VL Processor (#3501)

* add qwen-2.5-vl processor * add qwen25-vl processor * add qwen25-vl processor * add qwen25-vl processor * add qwen25-vl processor position_ids * add qwen25-vl processor * add qwen25-vl processor * position_ids * add test for qwen25-vl * organize comments * formatted * qwen_vl_processor * add qwen_vl_processor unittest * update model path * update model path * update qwen_vl_processor unittest * add unittest and bug fix * add unittest and bug fix * Update fastdeploy/input/qwen_mm_processor/image_processor.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update fastdeploy/input/qwen_vl_processor.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-04-23 00:17:25 +08:00 · 2025-08-22 16:49:42 +08:00
parent 5b66462f0e
commit 27666ee586
8 changed files with 1657 additions and 4 deletions
@@ -0,0 +1,248 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import unittest
+from unittest.mock import MagicMock, patch
+
+import numpy as np
+from PIL import Image
+
+from fastdeploy.engine.request import Request
+from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
+
+
+def mock_pil_image(height, width):
+    """
+    Generate mock random RGB image
+
+    Args:
+        height: Image height in pixels
+        width: Image width in pixels
+
+    Returns:
+        PIL.Image object with random RGB data
+    """
+    rgb_image = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
+    return Image.fromarray(rgb_image)
+
+
+def mock_read_frames(height: int, width: int, nums_frame: int, fps: int):
+    """
+    Generate mock video frames with metadata for testing purposes
+
+    Creates synthetic video data by generating random RGB frames and constructing
+    corresponding metadata to simulate real video processing.
+
+    Args:
+        height (int): Height of video frames in pixels
+        width (int): Width of video frames in pixels
+        nums_frame (int): Number of frames to generate
+        fps (int): Frames per second for the mock video
+
+    Returns:
+        tuple: A tuple containing:
+            frames (numpy.ndarray): Array of shape (nums_frame, height, width, 3)
+                containing randomly generated RGB frames
+            meta (dict): Dictionary with video metadata:
+                - fps (int): Frames per second (same as input)
+                - duration (float): Calculated duration in seconds (nums_frame/fps)
+                - num_of_frame (int): Number of frames (same as nums_frame input)
+    """
+    frames = []
+    for _ in range(nums_frame):
+        frame = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
+        frames.append(frame)
+    frames = np.stack(frames, axis=0)
+
+    meta = {
+        "fps": fps,
+        "duration": nums_frame / fps,
+        "num_of_frame": nums_frame,
+    }
+    return frames, meta
+
+
+class TestQwenVLProcessor(unittest.TestCase):
+    """
+    Unit tests for Qwen Vision-Language Processor functionality
+    """
+
+    def setUp(self):
+        """
+        Initialize test case with:
+        - Mock configuration
+        - Patched message parsing and video processing methods
+        - QwenVLProcessor instance with test parameters
+        """
+        config = MagicMock()
+        config.vision_config.tokens_per_second = 2
+
+        self.patcher_parse_image = patch(
+            "fastdeploy.entrypoints.chat_utils.MultiModalPartParser.parse_image", return_value=mock_pil_image(480, 640)
+        )
+        self.patcher_parse_image.start()
+
+        self.patcher_parse_video = patch(
+            "fastdeploy.entrypoints.chat_utils.MultiModalPartParser.parse_video", return_value=b"123"
+        )
+        self.patcher_parse_video.start()
+
+        self.patcher_read_frames = patch(
+            "fastdeploy.input.qwen_mm_processor.process.read_frames", return_value=mock_read_frames(480, 640, 5, 2)
+        )
+        self.patcher_read_frames.start()
+
+        mm_processor_kwargs = {
+            "video_max_frames": 10,
+            "video_min_frames": 1,
+        }
+        limit_mm_per_prompt = {"image": 1, "video": 1, "audio": 1}
+
+        model_name_or_path = "/ModelData/Qwen2.5-VL-7B-Instruct"
+        self.processor = QwenVLProcessor(
+            config=config,
+            model_name_or_path=model_name_or_path,
+            limit_mm_per_prompt=limit_mm_per_prompt,
+            mm_processor_kwargs=mm_processor_kwargs,
+            reasoning_parser_obj=None,
+            tool_parser_obj=None,
+        )
+
+    def tearDown(self) -> None:
+        """Clean up test case by stopping all mock patches"""
+        self.patcher_read_frames.stop()
+        self.patcher_parse_image.stop()
+        self.patcher_parse_video.stop()
+
+    def test_process_request(self):
+        """
+        Test processing of Request object with multimodal input
+
+        Validates:
+        1. Token ID lengths match position_ids and token_type_ids shapes
+        2. Image processing produces expected output dimensions
+        3. Video processing produces expected output dimensions
+        4. Correct counts for images (1) and videos (1)
+        """
+        prompt = {
+            "request_id": "12345",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
+                        {"type": "text", "text": "Describe image and video."},
+                    ],
+                }
+            ],
+        }
+
+        request = Request.from_dict(prompt)
+        result = self.processor.process_request(request, 1024 * 100)
+
+        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
+        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
+        self.assertEqual(
+            result.multimodal_inputs["images"].shape[0],
+            sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
+        )
+        self.assertEqual(
+            result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
+        )
+        self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
+        self.assertEqual(result.multimodal_inputs["video_cnt"], 1)
+
+    def test_process_request_dict(self):
+        """
+        Test processing of dictionary-format request with multimodal input
+
+        Validates:
+        1. Token ID lengths match position_ids and token_type_ids shapes
+        2. Image processing produces expected output dimensions
+        3. Video processing produces expected output dimensions
+        4. Correct counts for images (1) and videos (1)
+        """
+        num_generated_token_ids = 10
+        request = {
+            "request_id": "12345",
+            "metadata": {
+                "generated_token_ids": [1] * num_generated_token_ids,
+            },
+            "stop": ["stop", "eof"],
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
+                        {"type": "video_url", "video_url": {"url": "file://3_frame_video.mp4"}},
+                        {"type": "text", "text": "Describe image and video."},
+                    ],
+                }
+            ],
+        }
+
+        result = self.processor.process_request_dict(request, 1024 * 100)
+
+        self.assertEqual(result["prompt_token_ids_len"], result["multimodal_inputs"]["position_ids"].shape[0])
+        self.assertEqual(result["prompt_token_ids_len"], result["multimodal_inputs"]["token_type_ids"].shape[0])
+        self.assertEqual(
+            result["multimodal_inputs"]["images"].shape[0],
+            sum(map(lambda x: x.prod(), result["multimodal_inputs"]["grid_thw"])),
+        )
+        self.assertEqual(
+            result["multimodal_inputs"]["image_type_ids"].shape[0], result["multimodal_inputs"]["grid_thw"][:, 0].sum()
+        )
+        self.assertEqual(result["multimodal_inputs"]["pic_cnt"], 1)
+        self.assertEqual(result["multimodal_inputs"]["video_cnt"], 1)
+
+    def test_prompt(self):
+        """
+        Test processing of prompt with image and video placeholders
+
+        Validates:
+        1. Token ID lengths match position_ids and token_type_ids shapes
+        2. Image processing produces expected output dimensions
+        3. Video processing produces expected output dimensions
+        4. Correct counts for images (1) and videos (1)
+        """
+        prompt = {
+            "request_id": "12345",
+            "prompt": "<|image@placeholder|><|video@placeholder|>Describe image and video.",
+            "multimodal_data": {
+                "image": [mock_pil_image(10, 2100)],
+                "video": [{"video": b"123", "fps": 5}],
+            },
+        }
+
+        request = Request.from_dict(prompt)
+        result = self.processor.process_request(request, 1024 * 100)
+
+        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["position_ids"].shape[0])
+        self.assertEqual(result.prompt_token_ids_len, result.multimodal_inputs["token_type_ids"].shape[0])
+        self.assertEqual(
+            result.multimodal_inputs["images"].shape[0],
+            sum(map(lambda x: x.prod(), result.multimodal_inputs["grid_thw"])),
+        )
+        self.assertEqual(
+            result.multimodal_inputs["image_type_ids"].shape[0], result.multimodal_inputs["grid_thw"][:, 0].sum()
+        )
+        self.assertEqual(result.multimodal_inputs["pic_cnt"], 1)
+        self.assertEqual(result.multimodal_inputs["video_cnt"], 1)
+
+
+if __name__ == "__main__":
+    unittest.main()