[Models] Add Qwen3-VL Model Support (#5763)

* support v1 loader

* remove useless code

* remove useless

* [Model] support Qwen3VL images success

* [Model] support Qwen3VL rope_3d

* [Model] support Qwen3VL remove log

* [Model] support Qwen3VL RL

* [Model] support Qwen3VL tp

* [Model] support Qwen3VL video

* [Model] support Qwen3VL fix ernievl

* [Model] support Qwen3VL fix array out-of-bounds access in get_image_boundaries.cc

* [Model] support Qwen3VL fix multi card

* [Model] support Qwen3VL file close

* [Model] support Qwen3VL fix ce

* [Model] support Qwen3VL fix unittest

* [Model] support Qwen3VL add unittest

---------

Co-authored-by: Ayakouji <yuhongh@qq.com>
This commit was authored by CSWYF3634076 on 2025-12-29 17:39:33 +08:00 and committed by GitHub.
parent a3f0696e35
commit 9286403570
36 changed files with 5917 additions and 91 deletions
+371
View File
@@ -22,6 +22,7 @@ from PIL import Image
from fastdeploy.engine.request import Request
from fastdeploy.input.qwen_vl_processor import QwenVLProcessor
from fastdeploy.input.qwen_vl_processor.process_video import sample_frames
def mock_pil_image(height, width):
@@ -390,6 +391,376 @@ class TestQwenVLProcessor(unittest.TestCase):
self.processor.process_request_dict(request, max_model_len=512)
self.assertEqual(request["enable_thinking"], True)
def test_parse_processor_kwargs_valid(self):
    """A well-typed processor-kwargs dict is returned unchanged."""
    kwargs = {"video_max_frames": 10, "video_min_frames": 1}
    self.assertEqual(self.processor._parse_processor_kwargs(kwargs), kwargs)
def test_parse_processor_kwargs_empty(self):
    """None input yields an empty kwargs dict."""
    self.assertEqual(self.processor._parse_processor_kwargs(None), {})
def test_parse_processor_kwargs_invalid_type(self):
    """A non-dict input is rejected and replaced with an empty dict."""
    self.assertEqual(self.processor._parse_processor_kwargs("invalid"), {})
def test_parse_processor_kwargs_invalid_value_type(self):
    """A wrongly-typed value (str where int is expected) discards the kwargs."""
    bad_kwargs = {"video_max_frames": "10"}  # value must be an int
    self.assertEqual(self.processor._parse_processor_kwargs(bad_kwargs), {})
def test_parse_processor_kwargs_mixed_valid_invalid(self):
    """One invalid value causes the whole kwargs dict to be discarded."""
    mixed = {"video_max_frames": 10, "video_min_frames": "invalid"}
    self.assertEqual(self.processor._parse_processor_kwargs(mixed), {})
def test_parse_limits_valid(self):
    """Explicit limits are kept; unspecified modalities default to 1."""
    parsed = self.processor._parse_limits({"image": 2, "video": 3})
    self.assertEqual(parsed, {"image": 2, "video": 3, "audio": 1})
def test_parse_limits_empty(self):
    """None produces the default limit of 1 for every modality."""
    parsed = self.processor._parse_limits(None)
    self.assertEqual(parsed, {"image": 1, "video": 1, "audio": 1})
def test_parse_limits_invalid_type(self):
    """A non-dict input falls back to the default limits."""
    parsed = self.processor._parse_limits("invalid")
    self.assertEqual(parsed, {"image": 1, "video": 1, "audio": 1})
def test_parse_limits_partial(self):
    """Only the provided modality is overridden; the rest keep the default."""
    parsed = self.processor._parse_limits({"image": 5})
    self.assertEqual(parsed, {"image": 5, "video": 1, "audio": 1})
def test_check_mm_limits_dict_valid(self):
    """A dict payload within the per-modality limits passes without raising."""
    payload = {"image": [mock_pil_image(10, 10)], "video": [{"video": b"123"}]}
    self.processor._check_mm_limits(payload)  # must not raise
def test_check_mm_limits_dict_exceed_limit(self):
    """Two images against a limit of one raise ValueError."""
    payload = {"image": [mock_pil_image(10, 10), mock_pil_image(10, 10)]}
    with self.assertRaisesRegex(ValueError, "Too many image items"):
        self.processor._check_mm_limits(payload)
def test_check_mm_limits_messages_valid(self):
    """Chat messages carrying a single image pass the limit check."""
    content = [
        {"type": "image_url", "image_url": {"url": "file://demo.jpeg"}},
        {"type": "text", "text": "Describe this image."},
    ]
    # Must not raise.
    self.processor._check_mm_limits([{"role": "user", "content": content}])
def test_check_mm_limits_messages_exceed_limit(self):
    """Two image_url items in one message exceed the image limit."""
    content = [
        {"type": "image_url", "image_url": {"url": "file://demo1.jpeg"}},
        {"type": "image_url", "image_url": {"url": "file://demo2.jpeg"}},
    ]
    with self.assertRaisesRegex(ValueError, "Too many image items"):
        self.processor._check_mm_limits([{"role": "user", "content": content}])
def test_check_mm_limits_video_exceed(self):
    """Two videos against a limit of one raise ValueError."""
    payload = {"video": [{"video": b"123"}, {"video": b"456"}]}
    with self.assertRaisesRegex(ValueError, "Too many video items"):
        self.processor._check_mm_limits(payload)
def test_process_request_dict_with_prompt(self):
    """A prompt-style request produces token ids and multimodal inputs."""
    req = {
        "request_id": "12345",
        "prompt": "Test prompt",
        "multimodal_data": {"image": [mock_pil_image(10, 10)]},
    }
    out = self.processor.process_request_dict(req, 1024)
    self.assertIn("prompt_token_ids", out)
    self.assertIn("multimodal_inputs", out)
def test_process_request_dict_with_messages(self):
    """A messages-style request produces token ids and multimodal inputs."""
    messages = [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}]
    out = self.processor.process_request_dict(
        {"request_id": "12345", "messages": messages}, 1024
    )
    self.assertIn("prompt_token_ids", out)
    self.assertIn("multimodal_inputs", out)
def test_process_request_dict_invalid_format(self):
    """A request with neither prompt nor messages is rejected."""
    with self.assertRaisesRegex(ValueError, "must contain 'prompt', or 'messages'"):
        self.processor.process_request_dict({"request_id": "12345"}, 1024)
def test_process_request_dict_with_bad_words(self):
    """bad_words input results in a populated bad_words_token_ids field."""
    req = {
        "request_id": "12345",
        "prompt": "Test prompt",
        "bad_words": ["bad", "word"],
        "bad_words_token_ids": [100, 200],
    }
    out = self.processor.process_request_dict(req, 1024)
    # The key must exist and carry a non-None value.
    self.assertIn("bad_words_token_ids", out)
    self.assertIsNotNone(out["bad_words_token_ids"])
def test_process_request_dict_invalid_chat_template_kwargs(self):
    """A non-dict chat_template_kwargs raises ValueError."""
    req = {
        "request_id": "12345",
        "messages": [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}],
        "chat_template_kwargs": "invalid",
    }
    with self.assertRaisesRegex(ValueError, "must be a dict"):
        self.processor.process_request_dict(req, 1024)
def test_process_request_dict_with_completion_token_ids(self):
    """completion_token_ids are appended after the prompt tokens."""
    req = {"request_id": "12345", "prompt": "Test", "completion_token_ids": [1, 2, 3]}
    out = self.processor.process_request_dict(req, 1024)
    # Prompt tokens plus the three completion tokens.
    self.assertGreater(len(out["prompt_token_ids"]), 3)
def test_process_request_dict_prompt_truncation(self):
    """A prompt longer than max_model_len is truncated."""
    # Build a prompt that tokenizes to far more than 100 tokens.
    req = {"request_id": "12345", "prompt": "Test " * 1000}
    out = self.processor.process_request_dict(req, 100)
    # At most max_model_len - 1 tokens remain after truncation.
    self.assertLessEqual(len(out["prompt_token_ids"]), 99)
def test_process_request_dict_default_max_tokens(self):
    """When max_tokens is absent, a positive default is filled in."""
    out = self.processor.process_request_dict(
        {"request_id": "12345", "prompt": "Test"}, 1024
    )
    self.assertIn("max_tokens", out)
    self.assertGreater(out["max_tokens"], 0)
def test_process_request_dict_enable_thinking_false(self):
    """enable_thinking is forced to False even when the caller set True."""
    req = {"request_id": "12345", "prompt": "Test", "enable_thinking": True}
    out = self.processor.process_request_dict(req, 1024)
    self.assertFalse(out["enable_thinking"])
def test_append_completion_tokens(self):
    """Appending completion tokens extends ids/types and advances cur_position."""
    inputs = {
        "input_ids": [1, 2, 3],
        "token_type_ids": [0, 0, 0],
        "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
        "cur_position": 3,
    }
    self.processor.append_completion_tokens(inputs, [4, 5])
    # Two tokens appended in place; cur_position moves 3 -> 5.
    self.assertEqual(inputs["input_ids"], [1, 2, 3, 4, 5])
    self.assertEqual(inputs["token_type_ids"], [0, 0, 0, 0, 0])
    self.assertEqual(inputs["cur_position"], 5)
def test_pack_outputs_with_images(self):
    """pack_outputs stacks image patches and grids into single arrays."""
    raw = {
        "images": [np.array([[1, 2], [3, 4]]), np.array([[5, 6], [7, 8]])],
        "grid_thw": [np.array([2, 2, 1]), np.array([2, 2, 1])],
        "image_type_ids": [0, 1],
        "input_ids": [1, 2, 3],
        "token_type_ids": [0, 0, 0],
        "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
    }
    packed = self.processor.pack_outputs(raw)
    for key in ("images", "grid_thw", "image_type_ids"):
        self.assertIsNotNone(packed[key])
    self.assertEqual(packed["images"].shape[0], 4)  # 2 images x 2 rows each
    self.assertEqual(packed["grid_thw"].shape[0], 2)  # one grid per image
def test_pack_outputs_without_images(self):
    """With no images, image fields are None and token arrays use int64."""
    raw = {
        "images": [],
        "grid_thw": [],
        "image_type_ids": [],
        "input_ids": [1, 2, 3],
        "token_type_ids": [0, 0, 0],
        "position_ids": [np.array([[0, 1, 2], [0, 1, 2], [0, 1, 2]])],
    }
    packed = self.processor.pack_outputs(raw)
    # Image-related fields collapse to None when no images were provided.
    for key in ("images", "grid_thw", "image_type_ids"):
        self.assertIsNone(packed[key])
    # All token arrays are converted to int64.
    for key in ("input_ids", "token_type_ids", "position_ids"):
        self.assertEqual(packed[key].dtype, np.int64)
    # Patch ids and the token-count callback are always present.
    for key in ("image_patch_id", "video_patch_id", "mm_num_token_func"):
        self.assertIn(key, packed)
class TestSampleFrames(unittest.TestCase):
    """Unit tests for the sample_frames frame-index sampler.

    Fix: test_indices_evenly_spaced previously only asserted strict
    monotonicity (diffs > 0), which any increasing index set satisfies;
    it now also checks spacing uniformity.
    """

    def setUp(self):
        # A 100-frame clip recorded at 25 fps.
        self.metadata = {
            "num_of_frame": 100,
            "fps": 25,
        }

    def test_fps_and_num_frames_mutually_exclusive(self):
        """Passing both fps and num_frames must raise ValueError."""
        with self.assertRaises(ValueError):
            sample_frames(
                frame_factor=4,
                min_frames=8,
                max_frames=32,
                metadata=self.metadata,
                fps=2,
                num_frames=16,
            )

    def test_num_frames_round_to_factor(self):
        """num_frames is rounded to a multiple of frame_factor."""
        indices = sample_frames(
            frame_factor=4,
            min_frames=8,
            max_frames=64,
            metadata=self.metadata,
            num_frames=18,  # round(18 / 4) * 4 = 16
        )
        self.assertEqual(len(indices), 16)
        self.assertEqual(indices[0], 0)
        # Indices must stay within the clip.
        self.assertLess(indices[-1], self.metadata["num_of_frame"])

    def test_fps_sampling_basic(self):
        """total = 100 frames, source fps=25, target fps=5 -> 20 frames."""
        indices = sample_frames(
            frame_factor=4,
            min_frames=8,
            max_frames=64,
            metadata=self.metadata,
            fps=5,
        )
        self.assertEqual(len(indices), 20)
        self.assertEqual(indices.dtype, np.int32)
        self.assertEqual(indices[0], 0)

    def test_fps_respects_min_frames(self):
        """A very small target fps is clamped up to min_frames."""
        indices = sample_frames(
            frame_factor=4,
            min_frames=24,
            max_frames=64,
            metadata=self.metadata,
            fps=1,  # very small fps
        )
        self.assertEqual(len(indices), 24)

    def test_num_frames_exceeds_total_raises(self):
        """Requesting more frames than the clip contains must raise."""
        with self.assertRaises(ValueError):
            sample_frames(
                frame_factor=4,
                min_frames=8,
                max_frames=200,
                metadata=self.metadata,
                num_frames=200,
            )

    def test_force_multiple_of_4_hack(self):
        """A num_frames not divisible by 4 is forced down to a multiple of 4."""
        indices = sample_frames(
            frame_factor=2,
            min_frames=2,
            max_frames=100,
            metadata=self.metadata,
            num_frames=10,  # 10 % 4 != 0 -> hack -> 8
        )
        self.assertEqual(len(indices), 8)
        self.assertEqual(len(indices) % 4, 0)

    def test_keep_all_frames_when_num_frames_zero(self):
        """num_frames=0 keeps every frame of the clip."""
        indices = sample_frames(
            frame_factor=4,
            min_frames=0,
            max_frames=100,
            metadata=self.metadata,
            num_frames=0,
        )
        self.assertEqual(len(indices), self.metadata["num_of_frame"])
        np.testing.assert_array_equal(indices, np.arange(0, 100, dtype=np.int32))

    def test_indices_evenly_spaced(self):
        """Sampled indices are strictly increasing and near-evenly spaced."""
        indices = sample_frames(
            frame_factor=4,
            min_frames=8,
            max_frames=32,
            metadata=self.metadata,
            num_frames=16,
        )
        diffs = np.diff(indices)
        self.assertTrue(np.all(diffs > 0))
        # Fix: actually verify even spacing, not just monotonicity — sampling
        # 16 of 100 frames evenly means consecutive integer gaps can differ
        # by at most 1 (100/16 ≈ 6.25, so gaps are 6 or 7).
        self.assertLessEqual(int(diffs.max()) - int(diffs.min()), 1)
if __name__ == "__main__":
unittest.main()