feat: add support for API usage with multimodal models (#4548)

* feat: add support for API usage with multimodal models

* completion_tokens now includes num_image_tokens

* remove test_request.py

* fix: paddle.device.is_compiled_with_cuda()

* fix: test_unstream_without_logprobs
This commit is contained in:
SunLei
2025-10-28 20:23:46 +08:00
committed by GitHub
parent e1ac90d787
commit 2a9ed72533
10 changed files with 256 additions and 21 deletions
@@ -388,6 +388,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
"reasoning_content": "Normal reasoning",
"tool_call": None,
"num_cached_tokens": 3,
"num_image_tokens": 2,
"raw_prediction": "raw_answer_0",
},
"finished": True,
@@ -403,6 +404,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
"tool_calls": None,
"raw_prediction": "raw_answer_0",
"num_cached_tokens": 3,
"num_image_tokens": 2,
"finish_reason": "stop",
},
},
@@ -415,6 +417,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
"reasoning_content": None,
"tool_call": None,
"num_cached_tokens": 0,
"num_image_tokens": 0,
"raw_prediction": None,
},
"finished": True,
@@ -430,6 +433,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
"tool_calls": None,
"raw_prediction": None,
"num_cached_tokens": 0,
"num_image_tokens": 0,
"finish_reason": "stop",
},
},
@@ -442,6 +446,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
mock_response_processor.enable_multimodal_content.return_value = False
completion_token_ids = [[], []]
num_cached_tokens = [0, 0]
num_image_tokens = [0, 0]
for idx, case in enumerate(test_cases):
actual_choice = await self.chat_serving._create_chat_completion_choice(
@@ -453,6 +458,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
prompt_tokens=prompt_tokens,
completion_token_ids=completion_token_ids[idx],
num_cached_tokens=num_cached_tokens,
num_image_tokens=num_image_tokens,
logprob_contents=logprob_contents,
response_processor=mock_response_processor,
)
@@ -468,6 +474,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase):
self.assertEqual(actual_choice.message.completion_token_ids, completion_token_ids[idx])
self.assertEqual(num_cached_tokens[expected["index"]], expected["num_cached_tokens"])
self.assertEqual(num_image_tokens[expected["index"]], expected["num_image_tokens"])
self.assertEqual(actual_choice.finish_reason, expected["finish_reason"])
assert actual_choice.logprobs is not None