feat: add support for API usage with multimodal models (#4548)

* feat: add support for API usage with multimodal models

* completion_tokens now includes num_image_tokens in the reported usage

* remove test_request.py

* fix: paddle.device.is_compiled_with_cuda()

* fix test_unstream_without_logprobs
This commit is contained in:
SunLei
2025-10-28 20:23:46 +08:00
committed by GitHub
parent e1ac90d787
commit 2a9ed72533
10 changed files with 256 additions and 21 deletions
@@ -16,6 +16,7 @@
from typing import Any, List, Optional
from fastdeploy.entrypoints.openai.usage_calculator import count_tokens
from fastdeploy.input.tokenzier_client import AsyncTokenizerClient, ImageDecodeRequest
from fastdeploy.utils import api_server_logger
@@ -104,6 +105,7 @@ class ChatResponseProcessor:
image_output = self._end_image_code_request_output
image_output["outputs"]["multipart"] = [image]
image_output["outputs"]["token_ids"] = all_tokens
image_output["outputs"]["num_image_tokens"] = count_tokens(all_tokens)
yield image_output
self.data_processor.process_response_dict(
@@ -124,6 +126,7 @@ class ChatResponseProcessor:
token_ids = request_output["outputs"]["token_ids"]
if token_ids[-1] == self.eos_token_id:
multipart = []
num_image_tokens = 0
for part in self._multipart_buffer:
if part["decode_type"] == 0:
self.data_processor.process_response_dict(
@@ -139,6 +142,7 @@ class ChatResponseProcessor:
if self.decoder_client:
req_id = part["request_output"]["request_id"]
all_tokens = part["request_output"]["outputs"]["token_ids"]
num_image_tokens += count_tokens(all_tokens)
image_ret = await self.decoder_client.decode_image(
request=ImageDecodeRequest(req_id=req_id, data=all_tokens)
@@ -150,4 +154,5 @@ class ChatResponseProcessor:
lasrt_request_output = self._multipart_buffer[-1]["request_output"]
lasrt_request_output["outputs"]["multipart"] = multipart
lasrt_request_output["outputs"]["num_image_tokens"] = num_image_tokens
yield lasrt_request_output