# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import re
import shutil
import signal
import subprocess
import sys
import time

import openai
import pytest

# Make the repository-level ``tests`` directory importable so the shared
# serving helpers resolve regardless of the directory pytest was launched from.
tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, tests_dir)

from e2e.utils.serving_utils import (
    FD_API_PORT,
    FD_CACHE_QUEUE_PORT,
    FD_ENGINE_QUEUE_PORT,
    FD_METRICS_PORT,
    clean_ports,
    is_port_open,
)


@pytest.fixture(scope="session", autouse=True)
def setup_and_run_server():
    """
    Pytest fixture that runs once per test session:
    - Cleans ports before tests
    - Starts the API server as a subprocess
    - Waits for server port to open (up to 300 seconds)
    - Tears down server after all tests finish
    """
    print("Pre-test port cleanup...")
    clean_ports()

    base_path = os.getenv("MODEL_PATH")
    if base_path:
        model_path = os.path.join(base_path, "ernie-4_5-21b-a3b-bf16-paddle")
    else:
        model_path = "./ernie-4_5-21b-a3b-bf16-paddle"

    log_path = "server.log"
    cmd = [
        sys.executable,
        "-m",
        "fastdeploy.entrypoints.openai.api_server",
        "--model",
        model_path,
        "--port",
        str(FD_API_PORT),
        "--tensor-parallel-size",
        "1",
        "--engine-worker-queue-port",
        str(FD_ENGINE_QUEUE_PORT),
        "--metrics-port",
        str(FD_METRICS_PORT),
        "--cache-queue-port",
        str(FD_CACHE_QUEUE_PORT),
        "--max-model-len",
        "32768",
        "--max-num-seqs",
        "128",
        "--quantization",
        "wint4",
        "--graph-optimization-config",
        '{"cudagraph_capture_sizes": [1]}',
        "--guided-decoding-backend",
        "auto",
    ]

    # Start subprocess in new process group
    # Remove any stale engine log directory from a previous run
    if os.path.exists("log"):
        shutil.rmtree("log")
    with open(log_path, "w") as logfile:
        process = subprocess.Popen(
            cmd,
            stdout=logfile,
            stderr=subprocess.STDOUT,
            start_new_session=True,  # Enables killing full group via os.killpg
        )

    # Wait up to 300 seconds for API server to be ready
    for _ in range(300):
        if is_port_open("127.0.0.1", FD_API_PORT):
            print(f"API server is up on port {FD_API_PORT}")
            break
        time.sleep(1)
    else:
        print("[TIMEOUT] API server failed to start in 5 minutes. Cleaning up...")
        try:
            # start_new_session=True makes the child its own process-group
            # leader, so its pid doubles as the pgid here.
            os.killpg(process.pid, signal.SIGTERM)
        except Exception as e:
            print(f"Failed to kill process group: {e}")
        raise RuntimeError(f"API server did not start on port {FD_API_PORT}")

    yield  # Run tests

    print("\n===== Post-test server cleanup... =====")
    try:
        os.killpg(process.pid, signal.SIGTERM)
        print(f"API server (pid={process.pid}) terminated")
    except Exception as e:
        print(f"Failed to terminate API server: {e}")


@pytest.fixture(scope="session")
def api_url(request):
    """
    Returns the API endpoint URL for chat completions.
    """
    return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions"


@pytest.fixture(scope="session")
def metrics_url(request):
    """
    Returns the metrics endpoint URL.
    """
    return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics"


@pytest.fixture
def headers():
    """
    Returns common HTTP request headers.
    """
    return {"Content-Type": "application/json"}


@pytest.fixture
def consistent_payload():
    """
    Returns a fixed payload for consistency testing,
    including a fixed random seed and temperature.
    """
    return {
        "messages": [{"role": "user", "content": "用一句话介绍 PaddlePaddle"}],
        "temperature": 0.9,
        "top_p": 0,  # fix top_p to reduce randomness
        "seed": 13,  # fixed random seed
    }


@pytest.fixture
def openai_client():
    """
    Returns an OpenAI SDK client pointed at the locally launched server.
    """
    ip = "0.0.0.0"
    service_http_port = str(FD_API_PORT)
    client = openai.Client(
        base_url=f"http://{ip}:{service_http_port}/v1",
        api_key="EMPTY_API_KEY",
    )
    return client


# ==========================
# Helper functions for structured outputs testing
# ==========================


def streaming_chat_base(openai_client, chat_param):
    """
    Test streaming chat base functionality with the local service.

    Returns the concatenated delta contents of the streamed response.
    """
    assert isinstance(chat_param, dict), f"{chat_param} should be a dict"
    assert "messages" in chat_param, f"{chat_param} should contain messages"
    response = openai_client.chat.completions.create(
        model="default",
        stream=True,
        **chat_param,
    )
    output = []
    for chunk in response:
        if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"):
            output.append(chunk.choices[0].delta.content)
    # A real streamed answer arrives in more than two chunks.
    assert len(output) > 2
    return "".join(output)


def non_streaming_chat_base(openai_client, chat_param):
    """
    Test non streaming chat base functionality with the local service.

    Returns the message content of the first choice.
    """
    assert isinstance(chat_param, dict), f"{chat_param} should be a dict"
    assert "messages" in chat_param, f"{chat_param} should contain messages"
    response = openai_client.chat.completions.create(
        model="default",
        stream=False,
        **chat_param,
    )
    assert hasattr(response, "choices")
    assert len(response.choices) > 0
    assert hasattr(response.choices[0], "message")
    assert hasattr(response.choices[0].message, "content")
    return response.choices[0].message.content


# ==========================
# Structured outputs tests
# ==========================


@pytest.mark.skip(reason="Temporarily skip this case due to unstable execution")
def test_structured_outputs_json_schema(openai_client):
    """
    Test structured outputs json_schema functionality with the local service
    """
    chat_param = {
        "temperature": 1,
        "max_tokens": 1024,
    }

    # json_object
    json_chat_param = {
        "messages": [
            {
                "role": "user",
                "content": "Generate a JSON object containing: names of China's Four Great Inventions, their dynasties of origin, and brief descriptions (each under 50 characters)",
            }
        ],
        "response_format": {"type": "json_object"},
    }
    json_chat_param.update(chat_param)
    response = streaming_chat_base(openai_client, json_chat_param)
    try:
        json.loads(response)
        is_valid = True
    except ValueError:
        is_valid = False
    assert is_valid, f"json_schema streaming response: {response} is not a valid json"

    response = non_streaming_chat_base(openai_client, json_chat_param)
    try:
        json.loads(response)
        is_valid = True
    except ValueError:
        is_valid = False
    assert is_valid, f"json_schema non_streaming response: {response} is not a valid json"

    # json_schema
    from enum import Enum

    from pydantic import BaseModel

    class BookType(str, Enum):
        romance = "Romance"
        historical = "Historical"
        adventure = "Adventure"
        mystery = "Mystery"
        dystopian = "Dystopian"

    class BookDescription(BaseModel):
        author: str
        title: str
        genre: BookType

    json_schema_param = {
        "messages": [
            {
                "role": "user",
                "content": "Generate a JSON describing a literary work, including author, title and book type.",
            }
        ],
        "response_format": {
            "type": "json_schema",
            "json_schema": {"name": "book-description", "schema": BookDescription.model_json_schema()},
        },
    }
    json_schema_param.update(chat_param)
    response = streaming_chat_base(openai_client, json_schema_param)
    try:
        json_schema_response = json.loads(response)
        is_valid = True
    except ValueError:
        is_valid = False
    assert is_valid, f"json_schema streaming response: {response} is not a valid json"
    assert (
        "author" in json_schema_response and "title" in json_schema_response and "genre" in json_schema_response
    ), f"json_schema streaming response: {response} is not a valid book-description"
    assert json_schema_response["genre"] in {
        genre.value for genre in BookType
    }, f"json_schema streaming response: {json_schema_response['genre']} is not a valid book-type"

    response = non_streaming_chat_base(openai_client, json_schema_param)
    try:
        json_schema_response = json.loads(response)
        is_valid = True
    except ValueError:
        is_valid = False
    assert is_valid, f"json_schema non_streaming response: {response} is not a valid json"
    assert (
        "author" in json_schema_response and "title" in json_schema_response and "genre" in json_schema_response
    ), f"json_schema non_streaming response: {response} is not a valid book-description"
    assert json_schema_response["genre"] in {
        genre.value for genre in BookType
    }, f"json_schema non_streaming response: {json_schema_response['genre']} is not a valid book-type"


@pytest.mark.skip(reason="Temporarily skip this case due to unstable execution")
def test_structured_outputs_structural_tag(openai_client):
    """
    Test structured outputs structural_tag functionality with the local service
    """
    # NOTE(review): every `<...>` span in this test (the <function=...> tags in
    # the prompt, the structure begin/end markers, and the trigger string) was
    # stripped by a tag-removing extraction step; they are reconstructed below
    # from the surviving context — confirm against the original file.
    content_str = """
You have the following function available:

{
    "name": "get_current_date",
    "description": "Get current date and time for given timezone",
    "parameters": {
        "type": "object",
        "properties": {
            "timezone": {
                "type": "string",
                "description": "Timezone to get current date/time, e.g.: Asia/Shanghai",
            }
        },
        "required": ["timezone"],
    }
}

If you choose to call only this function, reply in this format:
<{start_tag}={function_name}>{parameters}{end_tag}
where:

start_tag => `<function`
parameters => JSON dictionary with parameter names as keys
end_tag => `</function>`

Example:
<function=get_current_date>{"param": "value"}</function>

Note:
- Function call must follow specified format
- Required parameters must be specified
- Only one function can be called at a time
- Place entire function call response on a single line

You are an AI assistant. Answer the following question.
"""
    structural_tag_param = {
        "temperature": 1,
        "max_tokens": 1024,
        "messages": [
            {
                "role": "system",
                "content": content_str,
            },
            {
                "role": "user",
                "content": "You're traveling to Shanghai today",
            },
        ],
        "response_format": {
            "type": "structural_tag",
            "structures": [
                {
                    "begin": "<function=get_current_date>",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "timezone": {
                                "type": "string",
                                "description": "Timezone to get current date/time, e.g.: Asia/Shanghai",
                            }
                        },
                        "required": ["timezone"],
                    },
                    "end": "</function>",
                }
            ],
            "triggers": ["<function="],
        },
    }

    # NOTE(review): the original assertions for this test were lost in
    # extraction; these reconstructed checks only verify that the constrained
    # function-call envelope is emitted on both transport paths.
    response = streaming_chat_base(openai_client, structural_tag_param)
    assert "get_current_date" in response, f"structural_tag streaming response: {response} is not as expected"

    response = non_streaming_chat_base(openai_client, structural_tag_param)
    assert "get_current_date" in response, f"structural_tag non_streaming response: {response} is not as expected"


def test_structured_outputs_grammar(openai_client):
    """
    Test structured outputs guided_grammar (EBNF) functionality with the local service
    """
    # NOTE(review): the `root` production and the HTML tags in the verification
    # regex were stripped by the same tag-removing extraction; reconstructed
    # below from the residue ('"" text ""') — confirm against the original file.
    html_h1_grammar = """
root ::= "<h1" style_attribute? ">" text "</h1>"
style_attribute ::= " style=" dq style_value dq
style_value ::= (font_style ("; " font_weight)?) | (font_weight ("; " font_style)?)
font_style ::= "font-family: '" font_name "'"
font_weight ::= "font-weight: " weight_value
font_name ::= "Arial" | "Times New Roman" | "Courier New"
weight_value ::= "normal" | "bold"
text ::= [A-Za-z0-9 ]+
dq ::= ["]
"""
    grammar_param = {
        "temperature": 1,
        "top_p": 0.0,
        "max_tokens": 1024,
        "messages": [
            {
                "role": "user",
                "content": "Generate HTML code for this heading in bold Times New Roman font: ERNIE Bot",
            }
        ],
        "extra_body": {"guided_grammar": html_h1_grammar},
    }
    # An <h1> element with an optional style attribute wrapping plain text,
    # matching what the grammar above can derive.
    pattern = r'^<h1.*>[A-Za-z0-9 ]+</h1>$'

    response = streaming_chat_base(openai_client, grammar_param)
    assert re.fullmatch(pattern, response), f"grammar streaming response: {response} is not as expected"

    response = non_streaming_chat_base(openai_client, grammar_param)
    assert re.fullmatch(pattern, response), f"grammar non_streaming response: {response} is not as expected"