mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
feat: add support for API usage with multimodal models (#4548)
* feat: add support for API usage with multimodal models
* completion_tokens contains num_image_tokens
* remove test_request.py
* fix: paddle.device.is_compiled_with_cuda()
* fix test_unstream_without_logprobs
This commit is contained in:
@@ -66,6 +66,7 @@ class CompletionTokenUsageInfo(BaseModel):
|
||||
"""
|
||||
|
||||
reasoning_tokens: Optional[int] = None
|
||||
image_tokens: Optional[int] = None
|
||||
|
||||
|
||||
class PromptTokenUsageInfo(BaseModel):
|
||||
@@ -74,6 +75,8 @@ class PromptTokenUsageInfo(BaseModel):
|
||||
"""
|
||||
|
||||
cached_tokens: Optional[int] = None
|
||||
image_tokens: Optional[int] = None
|
||||
video_tokens: Optional[int] = None
|
||||
|
||||
|
||||
class UsageInfo(BaseModel):
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
from typing import Any, List, Optional
|
||||
|
||||
from fastdeploy.entrypoints.openai.usage_calculator import count_tokens
|
||||
from fastdeploy.input.tokenzier_client import AsyncTokenizerClient, ImageDecodeRequest
|
||||
from fastdeploy.utils import api_server_logger
|
||||
|
||||
@@ -104,6 +105,7 @@ class ChatResponseProcessor:
|
||||
image_output = self._end_image_code_request_output
|
||||
image_output["outputs"]["multipart"] = [image]
|
||||
image_output["outputs"]["token_ids"] = all_tokens
|
||||
image_output["outputs"]["num_image_tokens"] = count_tokens(all_tokens)
|
||||
yield image_output
|
||||
|
||||
self.data_processor.process_response_dict(
|
||||
@@ -124,6 +126,7 @@ class ChatResponseProcessor:
|
||||
token_ids = request_output["outputs"]["token_ids"]
|
||||
if token_ids[-1] == self.eos_token_id:
|
||||
multipart = []
|
||||
num_image_tokens = 0
|
||||
for part in self._multipart_buffer:
|
||||
if part["decode_type"] == 0:
|
||||
self.data_processor.process_response_dict(
|
||||
@@ -139,6 +142,7 @@ class ChatResponseProcessor:
|
||||
if self.decoder_client:
|
||||
req_id = part["request_output"]["request_id"]
|
||||
all_tokens = part["request_output"]["outputs"]["token_ids"]
|
||||
num_image_tokens += count_tokens(all_tokens)
|
||||
|
||||
image_ret = await self.decoder_client.decode_image(
|
||||
request=ImageDecodeRequest(req_id=req_id, data=all_tokens)
|
||||
@@ -150,4 +154,5 @@ class ChatResponseProcessor:
|
||||
|
||||
lasrt_request_output = self._multipart_buffer[-1]["request_output"]
|
||||
lasrt_request_output["outputs"]["multipart"] = multipart
|
||||
lasrt_request_output["outputs"]["num_image_tokens"] = num_image_tokens
|
||||
yield lasrt_request_output
|
||||
|
||||
@@ -189,6 +189,8 @@ class OpenAIServingChat:
|
||||
previous_num_tokens = [0] * num_choices
|
||||
reasoning_num_tokens = [0] * num_choices
|
||||
num_prompt_tokens = 0
|
||||
num_cached_tokens = 0
|
||||
num_image_tokens = [0] * num_choices
|
||||
tool_called = [False] * num_choices
|
||||
max_streaming_response_tokens = (
|
||||
request.max_streaming_response_tokens
|
||||
@@ -321,6 +323,9 @@ class OpenAIServingChat:
|
||||
output_top_logprobs = output["top_logprobs"]
|
||||
output_draft_top_logprobs = output["draft_top_logprobs"]
|
||||
previous_num_tokens[idx] += len(output["token_ids"])
|
||||
if output.get("num_image_tokens"):
|
||||
previous_num_tokens[idx] += output.get("num_image_tokens")
|
||||
num_image_tokens[idx] += output.get("num_image_tokens")
|
||||
reasoning_num_tokens[idx] += output.get("reasoning_token_num", 0)
|
||||
logprobs_res: Optional[LogProbs] = None
|
||||
draft_logprobs_res: Optional[LogProbs] = None
|
||||
@@ -389,8 +394,10 @@ class OpenAIServingChat:
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=previous_num_tokens[idx],
|
||||
total_tokens=num_prompt_tokens + previous_num_tokens[idx],
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(
|
||||
reasoning_tokens=reasoning_num_tokens[idx]
|
||||
reasoning_tokens=reasoning_num_tokens[idx],
|
||||
image_tokens=num_image_tokens[idx],
|
||||
),
|
||||
)
|
||||
choices.append(choice)
|
||||
@@ -409,7 +416,10 @@ class OpenAIServingChat:
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=num_prompt_tokens + completion_tokens,
|
||||
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=reasoning_tokens),
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(
|
||||
image_tokens=sum(num_image_tokens), reasoning_tokens=reasoning_tokens
|
||||
),
|
||||
)
|
||||
chunk = ChatCompletionStreamResponse(
|
||||
id=request_id,
|
||||
@@ -466,6 +476,7 @@ class OpenAIServingChat:
|
||||
draft_logprob_contents = [[] for _ in range(num_choices)]
|
||||
completion_token_ids = [[] for _ in range(num_choices)]
|
||||
num_cached_tokens = [0] * num_choices
|
||||
num_image_tokens = [0] * num_choices
|
||||
response_processor = ChatResponseProcessor(
|
||||
data_processor=self.engine_client.data_processor,
|
||||
enable_mm_output=self.enable_mm_output,
|
||||
@@ -531,6 +542,9 @@ class OpenAIServingChat:
|
||||
if data["finished"]:
|
||||
num_choices -= 1
|
||||
reasoning_num_tokens[idx] = data["outputs"].get("reasoning_token_num", 0)
|
||||
if data["outputs"].get("image_token_num"):
|
||||
previous_num_tokens[idx] += data["outputs"].get("image_token_num")
|
||||
num_image_tokens[idx] = data["outputs"].get("image_token_num")
|
||||
choice = await self._create_chat_completion_choice(
|
||||
output=output,
|
||||
index=idx,
|
||||
@@ -540,6 +554,7 @@ class OpenAIServingChat:
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_token_ids=completion_token_ids[idx],
|
||||
num_cached_tokens=num_cached_tokens,
|
||||
num_image_tokens=num_image_tokens,
|
||||
logprob_contents=logprob_contents,
|
||||
response_processor=response_processor,
|
||||
)
|
||||
@@ -557,7 +572,9 @@ class OpenAIServingChat:
|
||||
completion_tokens=num_generated_tokens,
|
||||
total_tokens=num_prompt_tokens + num_generated_tokens,
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=sum(num_cached_tokens)),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=num_reasoning_tokens),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(
|
||||
reasoning_tokens=num_reasoning_tokens, image_tokens=sum(num_image_tokens)
|
||||
),
|
||||
)
|
||||
choices = sorted(choices, key=lambda x: x.index)
|
||||
res = ChatCompletionResponse(
|
||||
@@ -580,6 +597,7 @@ class OpenAIServingChat:
|
||||
prompt_tokens: str,
|
||||
completion_token_ids: list,
|
||||
num_cached_tokens: list,
|
||||
num_image_tokens: list,
|
||||
logprob_contents: list,
|
||||
response_processor: ChatResponseProcessor,
|
||||
) -> ChatCompletionResponseChoice:
|
||||
@@ -609,6 +627,7 @@ class OpenAIServingChat:
|
||||
has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
|
||||
max_tokens = request.max_completion_tokens or request.max_tokens
|
||||
num_cached_tokens[index] = output.get("num_cached_tokens", 0)
|
||||
num_image_tokens[index] = output.get("num_image_tokens", 0)
|
||||
|
||||
finish_reason = "stop"
|
||||
if has_no_token_limit or previous_num_tokens != max_tokens:
|
||||
|
||||
@@ -33,6 +33,7 @@ from fastdeploy.entrypoints.openai.protocol import (
|
||||
CompletionTokenUsageInfo,
|
||||
ErrorInfo,
|
||||
ErrorResponse,
|
||||
PromptTokenUsageInfo,
|
||||
UsageInfo,
|
||||
)
|
||||
from fastdeploy.utils import (
|
||||
@@ -370,6 +371,8 @@ class OpenAIServingCompletion:
|
||||
req_id = f"{request_id}_{i}"
|
||||
dealer.write([b"", req_id.encode("utf-8")]) # 发送多路请求
|
||||
output_tokens = [0] * num_choices
|
||||
num_cache_tokens = [0] * num_choices
|
||||
num_image_tokens = [0] * num_choices
|
||||
inference_start_time = [0] * num_choices
|
||||
reasoning_tokens = [0] * num_choices
|
||||
first_iteration = [True] * num_choices
|
||||
@@ -459,7 +462,11 @@ class OpenAIServingCompletion:
|
||||
draft_logprobs_res = self._create_completion_logprobs(
|
||||
output_draft_top_logprobs, request.logprobs, 0
|
||||
)
|
||||
output_tokens[idx] += 1
|
||||
output_tokens[idx] += len(output.get("token_ids", [])) or 0
|
||||
num_cache_tokens[idx] += output.get("num_cache_tokens") or 0
|
||||
if output.get("num_image_tokens"):
|
||||
output_tokens[idx] += output.get("num_image_tokens")
|
||||
num_image_tokens[idx] += output.get("num_image_tokens")
|
||||
reasoning_tokens[idx] += output.get("reasoning_token_num", 0)
|
||||
delta_message = CompletionResponseStreamChoice(
|
||||
index=idx,
|
||||
@@ -527,8 +534,9 @@ class OpenAIServingCompletion:
|
||||
prompt_batched_token_ids[idx // (1 if request.n is None else request.n)]
|
||||
)
|
||||
+ output_tokens[idx],
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cache_tokens[idx]),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(
|
||||
reasoning_tokens=reasoning_tokens[idx]
|
||||
image_tokens=num_image_tokens[idx], reasoning_tokens=reasoning_tokens[idx]
|
||||
),
|
||||
),
|
||||
)
|
||||
@@ -559,6 +567,8 @@ class OpenAIServingCompletion:
|
||||
choices: List[CompletionResponseChoice] = []
|
||||
num_prompt_tokens = 0
|
||||
num_generated_tokens = 0
|
||||
num_cache_tokens = 0
|
||||
num_image_tokens = 0
|
||||
num_reasoning_tokens = 0
|
||||
|
||||
for idx in range(len(final_res_batch)):
|
||||
@@ -614,6 +624,10 @@ class OpenAIServingCompletion:
|
||||
num_generated_tokens += final_res["output_token_ids"]
|
||||
|
||||
num_prompt_tokens += len(prompt_token_ids)
|
||||
num_cache_tokens += output.get("num_cache_tokens") or 0
|
||||
if output.get("num_image_tokens"):
|
||||
num_generated_tokens += output.get("num_image_tokens")
|
||||
num_image_tokens += output.get("num_image_tokens")
|
||||
|
||||
num_reasoning_tokens += output.get("reasoning_token_num", 0)
|
||||
|
||||
@@ -622,7 +636,10 @@ class OpenAIServingCompletion:
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=num_generated_tokens,
|
||||
total_tokens=num_prompt_tokens + num_generated_tokens,
|
||||
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=num_reasoning_tokens),
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cache_tokens),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(
|
||||
reasoning_tokens=num_reasoning_tokens, image_tokens=num_image_tokens
|
||||
),
|
||||
)
|
||||
del request
|
||||
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def count_tokens(tokens):
    """Count the number of leaf tokens in an arbitrarily nested structure.

    Walks ``tokens`` iteratively (no recursion-depth limit) and counts every
    element that is not itself a list, tuple, or numpy array. A bare scalar
    counts as one token.

    Args:
        tokens: A token id, or a list/tuple/``np.ndarray`` of token ids,
            nested to any depth.

    Returns:
        int: Total number of scalar tokens found.
    """
    count = 0
    stack = [tokens]
    while stack:
        current = stack.pop()
        if isinstance(current, np.ndarray):
            # A 0-d array is a single scalar: iterating it would raise
            # TypeError, so count it directly instead of expanding it.
            if current.ndim == 0:
                count += 1
            else:
                stack.extend(current)
        elif isinstance(current, (list, tuple)):
            # Order is irrelevant for counting, so push elements as-is
            # (the original reversed() each container for nothing).
            stack.extend(current)
        else:
            count += 1
    return count
|
||||
Reference in New Issue
Block a user