""" # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ import inspect import json import unittest from unittest import IsolatedAsyncioTestCase from unittest.mock import AsyncMock, Mock, patch from fastdeploy.entrypoints.openai.protocol import ( ChatCompletionRequest, ChatCompletionResponseChoice, CompletionRequest, ) from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase): async def asyncSetUp(self): self.engine_client = Mock() self.engine_client.connection_initialized = False self.engine_client.connection_manager = AsyncMock() self.engine_client.connection_manager.initialize = AsyncMock() self.engine_client.connection_manager.get_connection = AsyncMock() self.engine_client.connection_manager.cleanup_request = AsyncMock() self.engine_client.semaphore = Mock() self.engine_client.semaphore.acquire = AsyncMock() self.engine_client.semaphore.release = Mock() self.engine_client.data_processor = Mock() self.engine_client.is_master = True self.engine_client.check_model_weight_status = Mock(return_value=False) self.chat_serving = OpenAIServingChat( engine_client=self.engine_client, models=None, pid=123, ips=None, max_waiting_time=30, chat_template="default", enable_mm_output=False, tokenizer_base_url=None, ) self.completion_serving = OpenAIServingCompletion( engine_client=self.engine_client, models=None, pid=123, ips=None, max_waiting_time=30 ) def test_metadata_parameter_setting(self): request = ChatCompletionRequest( model="test-model", messages=[{"role": "user", "content": "Hello"}], stream=True, metadata={"max_streaming_response_tokens": 100}, ) max_tokens = ( request.max_streaming_response_tokens if request.max_streaming_response_tokens is not None else (request.metadata or {}).get("max_streaming_response_tokens", 1) ) self.assertEqual(max_tokens, 100) def test_default_value(self): request = ChatCompletionRequest( model="test-model", messages=[{"role": "user", "content": "Hello"}], stream=True ) max_tokens = ( request.max_streaming_response_tokens if request.max_streaming_response_tokens is not None else (request.metadata or {}).get("max_streaming_response_tokens", 1) ) self.assertEqual(max_tokens, 1) def test_edge_case_zero_value(self): request = ChatCompletionRequest( model="test-model", messages=[{"role": "user", "content": "Hello"}], stream=True, max_streaming_response_tokens=0, ) max_streaming_response_tokens = ( request.max_streaming_response_tokens if request.max_streaming_response_tokens is not None else (request.metadata or {}).get("max_streaming_response_tokens", 1) ) max_streaming_response_tokens = max(1, max_streaming_response_tokens) self.assertEqual(max_streaming_response_tokens, 1) @patch("fastdeploy.entrypoints.openai.serving_chat.api_server_logger") @patch("fastdeploy.entrypoints.openai.serving_chat.ChatResponseProcessor") async def 
test_integration_with_chat_stream_generator(self, mock_processor_class, mock_logger):
        response_data = [
            {
                "request_id": "test_request_id_0",
                "outputs": {
                    "token_ids": [1],
                    "text": "a",
                    "top_logprobs": None,
                    "draft_top_logprobs": None,
                    "reasoning_content": "",
                    "tool_calls": None,
                    "skipped": False,
                },
                "metrics": {"first_token_time": 0.1, "inference_start_time": 0.1},
                "finished": False,
            },
            {
                "request_id": "test_request_id_0",
                "outputs": {
                    "token_ids": [2],
                    "text": "b",
                    "top_logprobs": None,
                    "draft_top_logprobs": None,
                    "reasoning_content": "",
                    "tool_calls": None,
                    "skipped": False,
                },
                "metrics": {"engine_recv_latest_token_time": 0.2, "first_token_time": None},
                "finished": False,
            },
            {
                "request_id": "test_request_id_0",
                "outputs": {
                    "token_ids": [3],
                    "text": "c",
                    "top_logprobs": None,
                    "draft_top_logprobs": None,
                    "reasoning_content": "",
                    "tool_calls": None,
                    "skipped": False,
                },
                "metrics": {"engine_recv_latest_token_time": 0.3, "first_token_time": None},
                "finished": False,
            },
            {
                "request_id": "test_request_id_0",
                "outputs": {
                    "token_ids": [4],
                    "text": "d",
                    "top_logprobs": None,
                    "draft_top_logprobs": None,
                    "reasoning_content": "",
                    "tool_calls": None,
                    "skipped": False,
                },
                "metrics": {"engine_recv_latest_token_time": 0.4, "first_token_time": None},
                "finished": False,
            },
            {
                "request_id": "test_request_id_0",
                "outputs": {
                    "token_ids": [5],
                    "text": "e",
                    "top_logprobs": None,
                    "draft_top_logprobs": None,
                    "reasoning_content": "",
                    "tool_calls": None,
                    "skipped": False,
                },
                "metrics": {"engine_recv_latest_token_time": 0.5, "first_token_time": None},
                "finished": False,
            },
            {
                "request_id": "test_request_id_0",
                "outputs": {
                    "token_ids": [6],
                    "text": "f",
                    "top_logprobs": None,
                    "draft_top_logprobs": None,
                    "reasoning_content": "",
                    "tool_calls": None,
                    "skipped": False,
                },
                "metrics": {"engine_recv_latest_token_time": 0.6, "first_token_time": None},
                "finished": False,
            },
            {
                "request_id": "test_request_id_0",
                "outputs": {
                    "token_ids": [7],
                    "text": "g",
                    "top_logprobs": None,
                    "draft_top_logprobs": None,
                    "reasoning_content": "",
                    "tool_calls": None,
                    "skipped": False,
                },
                "metrics": {"engine_recv_latest_token_time": 0.7, "first_token_time": None, "request_start_time": 0.1},
                "finished": True,
            },
        ]
        mock_response_queue = AsyncMock()
        mock_response_queue.get.side_effect = response_data
        mock_dealer = Mock()
        mock_dealer.write = Mock()
        # Mock the connection manager call
        self.engine_client.connection_manager.get_connection = AsyncMock(
            return_value=(mock_dealer, mock_response_queue)
        )
        mock_processor_instance = Mock()

        async def mock_process_response_chat_single(response, stream, include_stop_str_in_output, request=None):
            yield response

        mock_processor_instance.process_response_chat = mock_process_response_chat_single
        mock_processor_instance.enable_multimodal_content = Mock(return_value=False)
        mock_processor_class.return_value = mock_processor_instance

        request = ChatCompletionRequest(
            model="test-model",
            messages=[{"role": "user", "content": "Hello"}],
            stream=True,
            max_streaming_response_tokens=3,
        )

        generator = self.chat_serving.chat_completion_stream_generator(
            request=request,
            request_id="test_request_id",
            model_name="test-model",
            prompt_token_ids=[1, 2, 3],
            prompt_tokens="Hello",
            max_tokens=10,
        )

        chunks = []
        async for chunk in generator:
            chunks.append(chunk)

        self.assertGreater(len(chunks), 0, "No chunks received!")

        parsed_chunks = []
        for i, chunk_str in enumerate(chunks):
            if i == 0:
                continue
            if chunk_str.startswith("data: ") and chunk_str.endswith("\n\n"):
                json_part = chunk_str[6:-2]
                if json_part == "[DONE]":
                    parsed_chunks.append({"type": "done", "raw": chunk_str})
                    break
                try:
                    chunk_dict = json.loads(json_part)
                    parsed_chunks.append(chunk_dict)
                except json.JSONDecodeError as e:
                    self.fail(f"Cannot parse chunk {i + 1}, JSON error: {e}\n original string: {repr(chunk_str)}")
            else:
                self.fail(f"Chunk {i + 1} is not in the expected 'data: JSON\\n\\n' format: {repr(chunk_str)}")

        for chunk_dict in parsed_chunks:
            choices_list = chunk_dict["choices"]
            if choices_list[-1].get("finish_reason") is not None:
                break
            else:
                self.assertEqual(len(choices_list), 3, f"Chunk {chunk_dict} should have three choices")

        found_done = any("[DONE]" in chunk for chunk in chunks)
        self.assertTrue(found_done, "Did not receive '[DONE]'")

    @patch("fastdeploy.entrypoints.openai.serving_completion.api_server_logger")
    async def test_integration_with_completion_stream_generator(self, mock_logger):
        response_data = [
            [
                {
                    "request_id": "test-request-id_0",
                    "outputs": {
                        "token_ids": [1],
                        "text": "a",
                        "top_logprobs": None,
                        "draft_top_logprobs": None,
                        "reasoning_content": "",
                        "tool_calls": None,
                        "skipped": False,
                    },
                    "metrics": {"first_token_time": 0.1, "inference_start_time": 0.1},
                    "finished": False,
                },
                {
                    "request_id": "test-request-id_0",
                    "outputs": {
                        "token_ids": [2],
                        "text": "b",
                        "top_logprobs": None,
                        "draft_top_logprobs": None,
                        "reasoning_content": "",
                        "tool_calls": None,
                        "skipped": False,
                    },
                    "metrics": {"engine_recv_latest_token_time": 0.2, "first_token_time": None},
                    "finished": False,
                },
            ],
            [
                {
                    "request_id": "test-request-id_0",
                    "outputs": {
                        "token_ids": [7],
                        "text": "g",
                        "top_logprobs": None,
                        "draft_top_logprobs": None,
                        "reasoning_content": "",
                        "tool_calls": None,
                        "skipped": False,
                    },
                    "metrics": {
                        "engine_recv_latest_token_time": 0.7,
                        "first_token_time": None,
                        "request_start_time": 0.1,
                    },
                    "finished": True,
                }
            ],
        ]
        mock_response_queue = AsyncMock()
        mock_response_queue.get.side_effect = response_data
        mock_dealer = Mock()
        mock_dealer.write = Mock()
        # Mock the connection manager call
        self.engine_client.connection_manager.get_connection = AsyncMock(
            return_value=(mock_dealer, mock_response_queue)
        )

        request = CompletionRequest(model="test-model", prompt="Hello", stream=True, max_streaming_response_tokens=3)

        generator = self.completion_serving.completion_stream_generator(
            request=request,
            num_choices=1,
            request_id="test-request-id",
            model_name="test-model",
            created_time=11,
            prompt_batched_token_ids=[[1, 2, 3]],
            prompt_tokens_list=["Hello"],
            max_tokens_list=[100],
        )

        chunks = []
        async for chunk in generator:
            chunks.append(chunk)

        self.assertGreater(len(chunks), 0, "No chunks received!")

        parsed_chunks = []
        for i, chunk_str in enumerate(chunks):
            if chunk_str.startswith("data: ") and chunk_str.endswith("\n\n"):
                json_part = chunk_str[6:-2]
                if json_part == "[DONE]":
                    break
                try:
                    chunk_dict = json.loads(json_part)
                    parsed_chunks.append(chunk_dict)
                except json.JSONDecodeError as e:
                    self.fail(f"Cannot parse chunk {i + 1}, JSON error: {e}\n original string: {repr(chunk_str)}")
            else:
                self.fail(f"Chunk {i + 1} is not in the expected 'data: JSON\\n\\n' format: {repr(chunk_str)}")

        self.assertEqual(len(parsed_chunks), 1)
        for chunk_dict in parsed_chunks:
            choices_list = chunk_dict["choices"]
            self.assertEqual(len(choices_list), 3, f"Chunk {chunk_dict} should have three choices")
            self.assertEqual(
                choices_list[-1].get("finish_reason"), "stop", f"Chunk {chunk_dict} should have finish_reason 'stop'"
            )

        found_done = any("[DONE]" in chunk for chunk in chunks)
        self.assertTrue(found_done, "Did not receive '[DONE]'")

    @patch("fastdeploy.entrypoints.openai.serving_completion.envs.ZMQ_SEND_BATCH_DATA", 0)
    @patch("fastdeploy.entrypoints.openai.serving_completion.api_server_logger")
    async def
test_completion_full_generator(self, mock_logger): final_response_data = [ { "request_id": "test_request_id_0", "outputs": { "token_ids": [7, 8, 9], "text": " world!", "top_logprobs": [ {"a": 0.1, "b": 0.2}, {"c": 0.3, "d": 0.4}, {"e": 0.5, "f": 0.6}, ], }, "finished": True, "metrics": {}, }, { "request_id": "test_request_id_1", "outputs": { "token_ids": [10, 11, 12], "text": " there!", "top_logprobs": [ {"g": 0.7, "h": 0.8}, {"i": 0.9, "j": 1.0}, {"k": 1.1, "l": 1.2}, ], }, "finished": True, "metrics": {}, }, ] mock_response_queue = AsyncMock() mock_response_queue.get.side_effect = [ [final_response_data[0]], [final_response_data[1]], ] mock_dealer = Mock() mock_dealer.write = Mock() self.engine_client.connection_manager.get_connection.return_value = (mock_dealer, mock_response_queue) expected_completion_response = Mock() self.completion_serving.request_output_to_completion_response = Mock(return_value=expected_completion_response) request = CompletionRequest( model="test_model", prompt="Hello", max_tokens=10, stream=False, n=2, echo=False, ) num_choices = 2 request_id = "test_request_id" created_time = 1655136000 model_name = "test_model" prompt_batched_token_ids = [[1, 2, 3], [4, 5, 6]] prompt_tokens_list = ["Hello", "Hello"] actual_response = await self.completion_serving.completion_full_generator( request=request, num_choices=num_choices, request_id=request_id, created_time=created_time, model_name=model_name, prompt_batched_token_ids=prompt_batched_token_ids, prompt_tokens_list=prompt_tokens_list, max_tokens_list=[100, 100], ) self.assertEqual(actual_response, expected_completion_response) self.engine_client.connection_manager.get_connection.assert_called_once_with(request_id, num_choices) self.assertEqual(mock_dealer.write.call_count, num_choices) mock_dealer.write.assert_any_call([b"", b"test_request_id_0"]) mock_dealer.write.assert_any_call([b"", b"test_request_id_1"]) mock_response_queue.get.assert_awaited() self.assertEqual(mock_response_queue.get.call_count, 2) self.assertEqual(self.engine_client.data_processor.process_response_dict.call_count, len(final_response_data)) self.completion_serving.request_output_to_completion_response.assert_called_once() self.engine_client.semaphore.release.assert_called_once() self.engine_client.connection_manager.cleanup_request.assert_awaited_once_with(request_id) async def test_create_chat_completion_choice(self): """ Test core & edge scenarios for _create_chat_completion_choice: """ test_cases = [ { "test_data": { "request_id": "test_0", "outputs": { "token_ids": [123, 456], "text": "Normal AI response", "reasoning_content": "Normal reasoning", "tool_call": None, "num_image_tokens": 2, "raw_prediction": "raw_answer_0", }, "num_cached_tokens": 3, "finished": True, "previous_num_tokens": 2, }, "mock_request": ChatCompletionRequest( model="test", messages=[], return_token_ids=True, max_tokens=10, n=2 ), "expected": { "index": 0, "content": "Normal AI response", "reasoning_content": "Normal reasoning", "tool_calls": None, "raw_prediction": "raw_answer_0", "num_cached_tokens": 3, "num_image_tokens": 2, "finish_reason": "stop", }, }, { "test_data": { "request_id": "test_1", "outputs": { "token_ids": [123, 456, 789], "text": "Edge case response", "reasoning_content": None, "tool_call": None, "num_image_tokens": 0, "raw_prediction": None, }, "num_cached_tokens": 0, "finished": True, "previous_num_tokens": 1, }, "mock_request": ChatCompletionRequest( model="test", messages=[], return_token_ids=True, max_tokens=1, n=2 ), "expected": { "index": 1, 
"content": "Edge case response", "reasoning_content": None, "tool_calls": None, "raw_prediction": None, "num_cached_tokens": 0, "num_image_tokens": 0, "finish_reason": "length", }, }, ] prompt_token_ids = [1, 2] prompt_tokens = "test_prompt" logprob_contents = [[{"token": "hello", "logprob": 0.1}], [{"token": "hello", "logprob": 0.1}]] draft_logprob_contents = [[{"token": "hello", "logprob": 0.1}], [{"token": "hello", "logprob": 0.1}]] mock_response_processor = Mock() mock_response_processor.enable_multimodal_content.return_value = False completion_token_ids = [[], []] num_cached_tokens = [0, 0] num_input_image_tokens = [0, 0] num_input_video_tokens = [0, 0] num_image_tokens = [0, 0] prompt_logprobs_res_list = [[], []] max_tokens_list = [10, 1] for idx, case in enumerate(test_cases): actual_choice = await self.chat_serving._create_chat_completion_choice( data=case["test_data"], request=case["mock_request"], prompt_token_ids=prompt_token_ids, prompt_tokens=prompt_tokens, completion_token_ids=completion_token_ids[idx], previous_num_tokens=case["test_data"]["previous_num_tokens"], num_cached_tokens=num_cached_tokens, num_input_image_tokens=num_input_image_tokens, num_input_video_tokens=num_input_video_tokens, num_image_tokens=num_image_tokens, logprob_contents=logprob_contents, draft_logprob_contents=draft_logprob_contents, prompt_logprobs_res_list=prompt_logprobs_res_list, response_processor=mock_response_processor, max_tokens=max_tokens_list[idx], speculate_metrics=None, ) expected = case["expected"] self.assertIsInstance(actual_choice, ChatCompletionResponseChoice) self.assertEqual(actual_choice.index, expected["index"]) self.assertEqual(actual_choice.message.content, expected["content"]) self.assertEqual(actual_choice.message.reasoning_content, expected["reasoning_content"]) self.assertEqual(actual_choice.message.tool_calls, expected["tool_calls"]) self.assertEqual(actual_choice.message.completion_token_ids, completion_token_ids[idx]) self.assertEqual(num_cached_tokens[expected["index"]], expected["num_cached_tokens"]) self.assertEqual(num_image_tokens[expected["index"]], expected["num_image_tokens"]) self.assertEqual(actual_choice.finish_reason, expected["finish_reason"]) assert actual_choice.logprobs is not None @patch("fastdeploy.entrypoints.openai.serving_chat.api_server_logger") @patch("fastdeploy.entrypoints.openai.serving_chat.ChatResponseProcessor") async def test_chat_stream_usage_fields(self, mock_response_processor, api_server_logger): response_data = [ { "request_id": "test-request-id_0", "outputs": { "token_ids": [1], "text": "a", "top_logprobs": None, "draft_top_logprobs": None, "reasoning_content": "", "tool_calls": None, "skipped": False, }, "metrics": {"first_token_time": 0.1, "inference_start_time": 0.1, "request_start_time": 0.0}, "finished": False, }, { "request_id": "test-request-id_0", "outputs": { "token_ids": [2, 3], "text": "bc", "top_logprobs": None, "draft_top_logprobs": None, "reasoning_content": "", "tool_calls": None, "skipped": False, }, "metrics": {"engine_recv_latest_token_time": 0.3, "first_token_time": None, "request_start_time": 0.0}, "finished": True, }, ] mock_response_queue = AsyncMock() mock_response_queue.get.side_effect = response_data mock_dealer = Mock() mock_dealer.write = Mock() self.engine_client.connection_manager.get_connection = AsyncMock( return_value=(mock_dealer, mock_response_queue) ) mock_processor_instance = Mock() async def mock_process_response_chat(response, stream, include_stop_str_in_output, request=None): delta_msg_mock = 
Mock()
            delta_msg_mock.content = response["outputs"]["text"]
            if response["outputs"]["text"] == "a":
                delta_msg_mock.reasoning_content = "Thinking for a"
            elif response["outputs"]["text"] == "bc":
                delta_msg_mock.reasoning_content = "Thinking for bc"
            delta_msg_mock.tool_calls = None
            response["outputs"]["delta_message"] = delta_msg_mock
            response["outputs"]["enable_parser"] = False
            reasoning_content = (
                delta_msg_mock.reasoning_content if (delta_msg_mock and delta_msg_mock.reasoning_content) else None
            )
            reasoning_tokens = reasoning_content.split() if reasoning_content else []
            response["outputs"]["reasoning_token_num"] = len(reasoning_tokens)
            response["outputs"]["num_cached_tokens"] = response.get("num_cached_tokens", 0)
            yield response

        mock_processor_instance.process_response_chat = mock_process_response_chat
        mock_processor_instance.enable_multimodal_content = Mock(return_value=False)
        # The original `Mock(__class__.__name__ == "...")` passed the result of a comparison as the
        # Mock spec; the intent appears to be a reasoning parser whose class name reads as the
        # Ernie45VLThinkingReasoningParser, so expose that name via an assignable __class__.
        mock_processor_instance.reasoning_parser = Mock()
        mock_processor_instance.reasoning_parser.__class__ = type("Ernie45VLThinkingReasoningParser", (), {})
        mock_processor_instance.data_processor = Mock(
            process_response_dict=lambda resp, stream, enable_thinking, include_stop_str_in_output: resp
        )
        mock_response_processor.return_value = mock_processor_instance

        request = ChatCompletionRequest(
            model="test-model",
            messages=[{"role": "user", "content": "Hello"}],
            stream=True,
            max_streaming_response_tokens=3,
            return_token_ids=True,
            stream_options={"include_usage": True, "continuous_usage_stats": True},
            chat_template_kwargs={"enable_thinking": True},
        )

        generator = self.chat_serving.chat_completion_stream_generator(
            request=request,
            request_id="test-request-id",
            model_name="test-model",
            prompt_token_ids=[10, 20, 30],
            prompt_tokens="Hello",
            max_tokens=10,
        )

        chunks = []
        async for chunk in generator:
            chunks.append(chunk)
            if "[DONE]" in chunk:
                break

        parsed_chunks = []
        for chunk_str in chunks:
            if chunk_str.startswith("data: ") and chunk_str.endswith("\n\n"):
                json_part = chunk_str[6:-2]
                if json_part == "[DONE]":
                    break
                try:
                    chunk_dict = json.loads(json_part)
                    parsed_chunks.append(chunk_dict)
                except json.JSONDecodeError:
                    self.fail(f"Invalid JSON in chunk: {chunk_str}")

        first_chunk = parsed_chunks[0]
        self.assertIn("usage", first_chunk, "First chunk should contain usage")
        self.assertEqual(first_chunk["usage"]["prompt_tokens"], 3, "First chunk prompt_tokens mismatch")
        self.assertIn("prompt_tokens_details", first_chunk["usage"])

        middle_chunk = next(c for c in parsed_chunks if "choices" in c and len(c["choices"]) > 0)
        self.assertIn("usage", middle_chunk, "Middle chunk should contain usage")
        self.assertIn(
            "completion_tokens_details", middle_chunk["usage"], "Middle chunk missing completion_tokens_details"
        )
        self.assertIn(
            "reasoning_tokens",
            middle_chunk["usage"]["completion_tokens_details"],
            "completion_tokens_details should contain reasoning_tokens",
        )
        self.assertGreaterEqual(
            middle_chunk["usage"]["completion_tokens_details"]["reasoning_tokens"],
            0,
            "reasoning_tokens should be non-negative",
        )

        final_usage_chunk = parsed_chunks[-1]
        self.assertIn("usage", final_usage_chunk, "Final chunk missing 'usage'")
        self.assertEqual(final_usage_chunk["usage"]["completion_tokens"], 3, "Final completion_tokens mismatch")
        self.assertEqual(final_usage_chunk["usage"]["total_tokens"], 3 + 3, "Final total_tokens mismatch")

    @patch("fastdeploy.entrypoints.openai.serving_completion.api_server_logger")
    async def test_completion_stream_usage_fields(self, mock_logger):
        """Verify the usage fields of the completion streaming response when include_usage is True."""
        response_data = [
            [
                {
                    "request_id": "test-request-id_0",
                    "outputs": {
                        "token_ids":
[10], "text": "a", "top_logprobs": None, "draft_top_logprobs": None, "reasoning_content": "", "tool_calls": None, "skipped": False, }, "metrics": { "engine_recv_latest_token_time": 0.3, "first_token_time": 0.1, "inference_start_time": 0.1, "request_start_time": 0.0, }, "finished": False, } ], [ { "request_id": "test-request-id_0", "outputs": { "token_ids": [2], "text": "bc", "top_logprobs": None, "draft_top_logprobs": None, "reasoning_content": "", "tool_calls": None, "skipped": False, }, "metrics": { "engine_recv_latest_token_time": 0.3, "first_token_time": 0.1, "inference_start_time": 0.1, "request_start_time": 0.0, }, "finished": True, } ], ] mock_response_queue = AsyncMock() mock_response_queue.get.side_effect = response_data mock_dealer = Mock() mock_dealer.write = Mock() self.engine_client.connection_manager.get_connection = AsyncMock( return_value=(mock_dealer, mock_response_queue) ) from fastdeploy.entrypoints.openai.protocol import StreamOptions request = CompletionRequest( model="test-model", prompt="Hello", stream=True, max_streaming_response_tokens=3, return_token_ids=True, stream_options=StreamOptions(include_usage=True), ) generator = self.completion_serving.completion_stream_generator( request=request, num_choices=1, request_id="test-request-id", created_time=1620000000, model_name="test-model", prompt_batched_token_ids=[[10, 20, 30]], prompt_tokens_list=["Hello"], max_tokens_list=[100], ) chunks = [] async for chunk in generator: chunks.append(chunk) if "[DONE]" in chunk: break parsed_chunks = [] for chunk_str in chunks: if chunk_str.startswith("data: ") and chunk_str.endswith("\n\n"): json_part = chunk_str[6:-2] if json_part == "[DONE]": break try: chunk_dict = json.loads(json_part) parsed_chunks.append(chunk_dict) except json.JSONDecodeError: self.fail(f"Invalid JSON in chunk: {chunk_str}") middle_chunks = [c for c in parsed_chunks if "choices" in c and len(c["choices"]) > 0] for chunk in middle_chunks: self.assertNotIn("usage", chunk, "Middle chunks should not contain 'usage'") self.assertGreater(len(parsed_chunks), 0, "No parsed chunks found") final_usage_chunk = None for chunk in reversed(parsed_chunks): if "usage" in chunk: final_usage_chunk = chunk break self.assertIsNotNone(final_usage_chunk, "Final usage chunk not found") self.assertEqual(final_usage_chunk["usage"]["prompt_tokens"], 3, "Final prompt_tokens mismatch") self.assertEqual(final_usage_chunk["usage"]["completion_tokens"], 2, "Final completion_tokens mismatch") self.assertEqual(final_usage_chunk["usage"]["total_tokens"], 5, "Final total_tokens mismatch") self.assertIn( "completion_tokens_details", final_usage_chunk["usage"], "Final chunk missing completion_tokens_details" ) self.assertIn( "reasoning_tokens", final_usage_chunk["usage"]["completion_tokens_details"], "completion_tokens_details should contain reasoning_tokens", ) self.assertGreaterEqual( final_usage_chunk["usage"]["completion_tokens_details"]["reasoning_tokens"], 0, "reasoning_tokens count mismatch", ) @patch("fastdeploy.entrypoints.openai.serving_completion.api_server_logger") async def test_completion_full_generator_async_process_response_dict(self, mock_logger): final_response_data = [ { "request_id": "test_request_id_0", "outputs": { "token_ids": [7, 8, 9], "text": " world!", }, "finished": True, "metrics": {}, }, { "request_id": "test_request_id_1", "outputs": { "token_ids": [10, 11, 12], "text": " there!", }, "finished": True, "metrics": {}, }, ] mock_response_queue = AsyncMock() mock_response_queue.get.side_effect = [ 
[final_response_data[0]], [final_response_data[1]], ] mock_dealer = Mock() mock_dealer.write = Mock() self.engine_client.connection_manager.get_connection.return_value = (mock_dealer, mock_response_queue) expected_completion_response = Mock() self.completion_serving.request_output_to_completion_response = Mock(return_value=expected_completion_response) request = CompletionRequest( model="test_model", prompt="Hello", max_tokens=10, stream=False, n=2, echo=False, ) num_choices = 2 request_id = "test_request_id" created_time = 1655136000 model_name = "test_model" prompt_batched_token_ids = [[1, 2, 3], [4, 5, 6]] prompt_tokens_list = ["Hello", "Hello"] self.engine_client.data_processor.process_response_dict = AsyncMock() actual_response = await self.completion_serving.completion_full_generator( request=request, num_choices=num_choices, request_id=request_id, created_time=created_time, model_name=model_name, prompt_batched_token_ids=prompt_batched_token_ids, prompt_tokens_list=prompt_tokens_list, max_tokens_list=[100, 100], ) self.assertEqual(actual_response, expected_completion_response) self.assertTrue(inspect.iscoroutinefunction(self.engine_client.data_processor.process_response_dict)) self.engine_client.data_processor.process_response_dict.assert_awaited() actual_call_times = self.engine_client.data_processor.process_response_dict.call_count expected_call_times = len(final_response_data) self.assertEqual(actual_call_times, expected_call_times) call_args_list = self.engine_client.data_processor.process_response_dict.call_args_list self.assertEqual(len(call_args_list), expected_call_times) for idx, data in enumerate(final_response_data): args, kwargs = call_args_list[idx] self.assertEqual(args[0], data) self.assertEqual(kwargs.get("stream"), False) self.assertEqual(kwargs.get("include_stop_str_in_output"), request.include_stop_str_in_output) @patch("fastdeploy.entrypoints.openai.serving_completion.api_server_logger") async def test_completion_stream_generator_async_process_response_dict(self, mock_logger): final_response_data = [ [ { "request_id": "test-request-id_0", "outputs": { "index": 0, "send_idx": 0, "token_ids": [1], "text": "a", "top_logprobs": {"a": 0.98, "b": 0.02}, "draft_top_logprobs": {"a": 0.98, "b": 0.02}, "reasoning_content": "", "tool_calls": None, "skipped": False, }, "finished": False, "metrics": { "first_token_time": 1620000000, "inference_start_time": 1620000000, "engine_recv_latest_token_time": 1620000000, }, "error_code": 200, } ], [ { "request_id": "test-request-id_0", "outputs": { "index": 0, "send_idx": 1, "token_ids": [2], "text": "b", "top_logprobs": {"a": 0.98, "b": 0.02}, "draft_top_logprobs": {"a": 0.98, "b": 0.02}, "reasoning_content": "", "tool_calls": None, "skipped": False, }, "finished": False, "metrics": { "first_token_time": 1620000000, "inference_start_time": 1620000000, "engine_recv_latest_token_time": 1620000000, }, "error_code": 200, } ], [ { "request_id": "test-request-id_0", "outputs": { "index": 0, "send_idx": 2, "token_ids": [7], "text": "g", "top_logprobs": {"a": 0.98, "b": 0.02}, "draft_top_logprobs": {"a": 0.98, "b": 0.02}, "reasoning_content": "", "tool_calls": None, "skipped": False, }, "finished": True, "metrics": { "first_token_time": 1620000000, "inference_start_time": 1620000000, "engine_recv_latest_token_time": 1620000000, }, "error_code": 200, } ], ] mock_response_queue = AsyncMock() mock_response_queue.get.side_effect = final_response_data mock_dealer = Mock() mock_dealer.write = Mock() 
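        # Route the mocked dealer/response queue pair through the connection manager so the
        # stream generator below consumes the canned response batches defined above.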
self.engine_client.connection_manager.get_connection = AsyncMock( return_value=(mock_dealer, mock_response_queue) ) request = CompletionRequest( model="test-model", prompt="Hello", stream=True, max_streaming_response_tokens=3, n=1, echo=False, max_tokens=100, ) self.engine_client.data_processor.process_response_dict = AsyncMock() generator = self.completion_serving.completion_stream_generator( request=request, num_choices=1, request_id="test-request-id", created_time=1620000000, model_name="test-model", prompt_batched_token_ids=[[1, 2, 3]], prompt_tokens_list=["Hello"], max_tokens_list=[100], ) chunks = [] async for chunk in generator: chunks.append(chunk) if "[DONE]" in chunk: break self.assertGreater(len(chunks), 0) self.assertTrue(inspect.iscoroutinefunction(self.engine_client.data_processor.process_response_dict)) self.engine_client.data_processor.process_response_dict.assert_awaited() flat_response_data = [] for sub_list in final_response_data: flat_response_data.extend(sub_list) expected_call_times = len(flat_response_data) actual_call_times = self.engine_client.data_processor.process_response_dict.call_count self.assertEqual(actual_call_times, expected_call_times) call_args_list = self.engine_client.data_processor.process_response_dict.call_args_list self.assertEqual(len(call_args_list), expected_call_times) for idx, data in enumerate(flat_response_data): args, kwargs = call_args_list[idx] self.assertEqual(args[0], data) self.assertEqual(kwargs.get("stream"), True) self.assertEqual(kwargs.get("include_stop_str_in_output"), request.include_stop_str_in_output) if __name__ == "__main__": unittest.main()