mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
feat: add support for API usage with multimodal models (#4548)
* feat: add support for API usage with multimodal models
* completion_tokens contains num_image_tokens
* remove test_request.py
* fix: paddle.device.is_compiled_with_cuda()
* fix test_unstream_without_logprobs
This commit is contained in:
@@ -66,6 +66,7 @@ class CompletionTokenUsageInfo(BaseModel):
|
||||
"""
|
||||
|
||||
reasoning_tokens: Optional[int] = None
|
||||
image_tokens: Optional[int] = None
|
||||
|
||||
|
||||
class PromptTokenUsageInfo(BaseModel):
|
||||
@@ -74,6 +75,8 @@ class PromptTokenUsageInfo(BaseModel):
|
||||
"""
|
||||
|
||||
cached_tokens: Optional[int] = None
|
||||
image_tokens: Optional[int] = None
|
||||
video_tokens: Optional[int] = None
|
||||
|
||||
|
||||
class UsageInfo(BaseModel):
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
from typing import Any, List, Optional
|
||||
|
||||
from fastdeploy.entrypoints.openai.usage_calculator import count_tokens
|
||||
from fastdeploy.input.tokenzier_client import AsyncTokenizerClient, ImageDecodeRequest
|
||||
from fastdeploy.utils import api_server_logger
|
||||
|
||||
@@ -104,6 +105,7 @@ class ChatResponseProcessor:
|
||||
image_output = self._end_image_code_request_output
|
||||
image_output["outputs"]["multipart"] = [image]
|
||||
image_output["outputs"]["token_ids"] = all_tokens
|
||||
image_output["outputs"]["num_image_tokens"] = count_tokens(all_tokens)
|
||||
yield image_output
|
||||
|
||||
self.data_processor.process_response_dict(
|
||||
@@ -124,6 +126,7 @@ class ChatResponseProcessor:
|
||||
token_ids = request_output["outputs"]["token_ids"]
|
||||
if token_ids[-1] == self.eos_token_id:
|
||||
multipart = []
|
||||
num_image_tokens = 0
|
||||
for part in self._multipart_buffer:
|
||||
if part["decode_type"] == 0:
|
||||
self.data_processor.process_response_dict(
|
||||
@@ -139,6 +142,7 @@ class ChatResponseProcessor:
|
||||
if self.decoder_client:
|
||||
req_id = part["request_output"]["request_id"]
|
||||
all_tokens = part["request_output"]["outputs"]["token_ids"]
|
||||
num_image_tokens += count_tokens(all_tokens)
|
||||
|
||||
image_ret = await self.decoder_client.decode_image(
|
||||
request=ImageDecodeRequest(req_id=req_id, data=all_tokens)
|
||||
@@ -150,4 +154,5 @@ class ChatResponseProcessor:
|
||||
|
||||
lasrt_request_output = self._multipart_buffer[-1]["request_output"]
|
||||
lasrt_request_output["outputs"]["multipart"] = multipart
|
||||
lasrt_request_output["outputs"]["num_image_tokens"] = num_image_tokens
|
||||
yield lasrt_request_output
|
||||
|
||||
@@ -189,6 +189,8 @@ class OpenAIServingChat:
|
||||
previous_num_tokens = [0] * num_choices
|
||||
reasoning_num_tokens = [0] * num_choices
|
||||
num_prompt_tokens = 0
|
||||
num_cached_tokens = 0
|
||||
num_image_tokens = [0] * num_choices
|
||||
tool_called = [False] * num_choices
|
||||
max_streaming_response_tokens = (
|
||||
request.max_streaming_response_tokens
|
||||
@@ -321,6 +323,9 @@ class OpenAIServingChat:
|
||||
output_top_logprobs = output["top_logprobs"]
|
||||
output_draft_top_logprobs = output["draft_top_logprobs"]
|
||||
previous_num_tokens[idx] += len(output["token_ids"])
|
||||
if output.get("num_image_tokens"):
|
||||
previous_num_tokens[idx] += output.get("num_image_tokens")
|
||||
num_image_tokens[idx] += output.get("num_image_tokens")
|
||||
reasoning_num_tokens[idx] += output.get("reasoning_token_num", 0)
|
||||
logprobs_res: Optional[LogProbs] = None
|
||||
draft_logprobs_res: Optional[LogProbs] = None
|
||||
@@ -389,8 +394,10 @@ class OpenAIServingChat:
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=previous_num_tokens[idx],
|
||||
total_tokens=num_prompt_tokens + previous_num_tokens[idx],
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(
|
||||
reasoning_tokens=reasoning_num_tokens[idx]
|
||||
reasoning_tokens=reasoning_num_tokens[idx],
|
||||
image_tokens=num_image_tokens[idx],
|
||||
),
|
||||
)
|
||||
choices.append(choice)
|
||||
@@ -409,7 +416,10 @@ class OpenAIServingChat:
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=completion_tokens,
|
||||
total_tokens=num_prompt_tokens + completion_tokens,
|
||||
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=reasoning_tokens),
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(
|
||||
image_tokens=sum(num_image_tokens), reasoning_tokens=reasoning_tokens
|
||||
),
|
||||
)
|
||||
chunk = ChatCompletionStreamResponse(
|
||||
id=request_id,
|
||||
@@ -466,6 +476,7 @@ class OpenAIServingChat:
|
||||
draft_logprob_contents = [[] for _ in range(num_choices)]
|
||||
completion_token_ids = [[] for _ in range(num_choices)]
|
||||
num_cached_tokens = [0] * num_choices
|
||||
num_image_tokens = [0] * num_choices
|
||||
response_processor = ChatResponseProcessor(
|
||||
data_processor=self.engine_client.data_processor,
|
||||
enable_mm_output=self.enable_mm_output,
|
||||
@@ -531,6 +542,9 @@ class OpenAIServingChat:
|
||||
if data["finished"]:
|
||||
num_choices -= 1
|
||||
reasoning_num_tokens[idx] = data["outputs"].get("reasoning_token_num", 0)
|
||||
if data["outputs"].get("image_token_num"):
|
||||
previous_num_tokens[idx] += data["outputs"].get("image_token_num")
|
||||
num_image_tokens[idx] = data["outputs"].get("image_token_num")
|
||||
choice = await self._create_chat_completion_choice(
|
||||
output=output,
|
||||
index=idx,
|
||||
@@ -540,6 +554,7 @@ class OpenAIServingChat:
|
||||
prompt_tokens=prompt_tokens,
|
||||
completion_token_ids=completion_token_ids[idx],
|
||||
num_cached_tokens=num_cached_tokens,
|
||||
num_image_tokens=num_image_tokens,
|
||||
logprob_contents=logprob_contents,
|
||||
response_processor=response_processor,
|
||||
)
|
||||
@@ -557,7 +572,9 @@ class OpenAIServingChat:
|
||||
completion_tokens=num_generated_tokens,
|
||||
total_tokens=num_prompt_tokens + num_generated_tokens,
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=sum(num_cached_tokens)),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=num_reasoning_tokens),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(
|
||||
reasoning_tokens=num_reasoning_tokens, image_tokens=sum(num_image_tokens)
|
||||
),
|
||||
)
|
||||
choices = sorted(choices, key=lambda x: x.index)
|
||||
res = ChatCompletionResponse(
|
||||
@@ -580,6 +597,7 @@ class OpenAIServingChat:
|
||||
prompt_tokens: str,
|
||||
completion_token_ids: list,
|
||||
num_cached_tokens: list,
|
||||
num_image_tokens: list,
|
||||
logprob_contents: list,
|
||||
response_processor: ChatResponseProcessor,
|
||||
) -> ChatCompletionResponseChoice:
|
||||
@@ -609,6 +627,7 @@ class OpenAIServingChat:
|
||||
has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None
|
||||
max_tokens = request.max_completion_tokens or request.max_tokens
|
||||
num_cached_tokens[index] = output.get("num_cached_tokens", 0)
|
||||
num_image_tokens[index] = output.get("num_image_tokens", 0)
|
||||
|
||||
finish_reason = "stop"
|
||||
if has_no_token_limit or previous_num_tokens != max_tokens:
|
||||
|
||||
@@ -33,6 +33,7 @@ from fastdeploy.entrypoints.openai.protocol import (
|
||||
CompletionTokenUsageInfo,
|
||||
ErrorInfo,
|
||||
ErrorResponse,
|
||||
PromptTokenUsageInfo,
|
||||
UsageInfo,
|
||||
)
|
||||
from fastdeploy.utils import (
|
||||
@@ -370,6 +371,8 @@ class OpenAIServingCompletion:
|
||||
req_id = f"{request_id}_{i}"
|
||||
dealer.write([b"", req_id.encode("utf-8")]) # 发送多路请求
|
||||
output_tokens = [0] * num_choices
|
||||
num_cache_tokens = [0] * num_choices
|
||||
num_image_tokens = [0] * num_choices
|
||||
inference_start_time = [0] * num_choices
|
||||
reasoning_tokens = [0] * num_choices
|
||||
first_iteration = [True] * num_choices
|
||||
@@ -459,7 +462,11 @@ class OpenAIServingCompletion:
|
||||
draft_logprobs_res = self._create_completion_logprobs(
|
||||
output_draft_top_logprobs, request.logprobs, 0
|
||||
)
|
||||
output_tokens[idx] += 1
|
||||
output_tokens[idx] += len(output.get("token_ids", [])) or 0
|
||||
num_cache_tokens[idx] += output.get("num_cache_tokens") or 0
|
||||
if output.get("num_image_tokens"):
|
||||
output_tokens[idx] += output.get("num_image_tokens")
|
||||
num_image_tokens[idx] += output.get("num_image_tokens")
|
||||
reasoning_tokens[idx] += output.get("reasoning_token_num", 0)
|
||||
delta_message = CompletionResponseStreamChoice(
|
||||
index=idx,
|
||||
@@ -527,8 +534,9 @@ class OpenAIServingCompletion:
|
||||
prompt_batched_token_ids[idx // (1 if request.n is None else request.n)]
|
||||
)
|
||||
+ output_tokens[idx],
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cache_tokens[idx]),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(
|
||||
reasoning_tokens=reasoning_tokens[idx]
|
||||
image_tokens=num_image_tokens[idx], reasoning_tokens=reasoning_tokens[idx]
|
||||
),
|
||||
),
|
||||
)
|
||||
@@ -559,6 +567,8 @@ class OpenAIServingCompletion:
|
||||
choices: List[CompletionResponseChoice] = []
|
||||
num_prompt_tokens = 0
|
||||
num_generated_tokens = 0
|
||||
num_cache_tokens = 0
|
||||
num_image_tokens = 0
|
||||
num_reasoning_tokens = 0
|
||||
|
||||
for idx in range(len(final_res_batch)):
|
||||
@@ -614,6 +624,10 @@ class OpenAIServingCompletion:
|
||||
num_generated_tokens += final_res["output_token_ids"]
|
||||
|
||||
num_prompt_tokens += len(prompt_token_ids)
|
||||
num_cache_tokens += output.get("num_cache_tokens") or 0
|
||||
if output.get("num_image_tokens"):
|
||||
num_generated_tokens += output.get("num_image_tokens")
|
||||
num_image_tokens += output.get("num_image_tokens")
|
||||
|
||||
num_reasoning_tokens += output.get("reasoning_token_num", 0)
|
||||
|
||||
@@ -622,7 +636,10 @@ class OpenAIServingCompletion:
|
||||
prompt_tokens=num_prompt_tokens,
|
||||
completion_tokens=num_generated_tokens,
|
||||
total_tokens=num_prompt_tokens + num_generated_tokens,
|
||||
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=num_reasoning_tokens),
|
||||
prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cache_tokens),
|
||||
completion_tokens_details=CompletionTokenUsageInfo(
|
||||
reasoning_tokens=num_reasoning_tokens, image_tokens=num_image_tokens
|
||||
),
|
||||
)
|
||||
del request
|
||||
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def count_tokens(tokens):
    """Count the number of leaf tokens in an arbitrarily nested structure.

    Walks ``tokens`` iteratively (no recursion-depth limit) and counts every
    element that is not itself a list, tuple, or numpy array. A bare scalar
    counts as one token.

    Args:
        tokens: A token id, or a list/tuple/``np.ndarray`` of token ids,
            nested to any depth.

    Returns:
        int: Total number of scalar tokens found.
    """
    count = 0
    stack = [tokens]
    while stack:
        current = stack.pop()
        if isinstance(current, np.ndarray):
            # A 0-d array is a single scalar: iterating it would raise
            # TypeError, so count it directly instead of expanding it.
            if current.ndim == 0:
                count += 1
            else:
                stack.extend(current)
        elif isinstance(current, (list, tuple)):
            # Order is irrelevant for counting, so push elements as-is
            # (the original reversed() each container for nothing).
            stack.extend(current)
        else:
            count += 1
    return count
|
||||
Reference in New Issue
Block a user