# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License" # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import asyncio import os import shutil import signal import subprocess import sys import time import httpx import pytest import requests tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) sys.path.insert(0, tests_dir) from e2e.utils.serving_utils import ( FD_API_PORT, FD_CACHE_QUEUE_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, clean_ports, is_port_open, ) @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ Pytest fixture that runs once per test session: - Cleans ports before tests - Starts the API server as a subprocess - Waits for server port to open (up to 30 seconds) - Tears down server after all tests finish """ print("Pre-test port cleanup...") FD_CONTROLLER_PORT = int(os.getenv("FD_CONTROLLER_PORT", 8333)) clean_ports([FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT, FD_CONTROLLER_PORT]) env = os.environ.copy() env["CUDA_VISIBLE_DEVICES"] = "0,1" env["ENABLE_V1_KVCACHE_SCHEDULER"] = "1" base_path = os.getenv("MODEL_PATH") if base_path: model_path = os.path.join(base_path, "TP2") else: model_path = "./TP2" log_path = "server.log" cmd = [ sys.executable, "-m", "fastdeploy.entrypoints.openai.api_server", "--model", model_path, "--port", str(FD_API_PORT), "--tensor-parallel-size", "2", "--engine-worker-queue-port", str(FD_ENGINE_QUEUE_PORT), "--metrics-port", str(FD_METRICS_PORT), "--cache-queue-port", str(FD_CACHE_QUEUE_PORT), "--controller-port", str(FD_CONTROLLER_PORT), "--max-model-len", "32768", "--max-num-seqs", "1", "--gpu-memory-utilization", "0.9", "--load-strategy", "ipc_snapshot", "--dynamic-load-weight", ] # Start subprocess in new process group # 清除log目录 if os.path.exists("log"): shutil.rmtree("log") with open(log_path, "w") as logfile: process = subprocess.Popen( cmd, stdout=logfile, stderr=subprocess.STDOUT, start_new_session=True, # Enables killing full group via os.killpg env=env, ) # Wait up to 300 seconds for API server to be ready for _ in range(300): if is_port_open("127.0.0.1", FD_API_PORT): print(f"API server is up on port {FD_API_PORT}") break time.sleep(1) else: print("[TIMEOUT] API server failed to start in 5 minutes. Cleaning up...") try: os.killpg(process.pid, signal.SIGTERM) except Exception as e: print(f"Failed to kill process group: {e}") raise RuntimeError(f"API server did not start on port {FD_API_PORT}") yield # Run tests print("\n===== Post-test server cleanup... =====") try: os.killpg(process.pid, signal.SIGTERM) print(f"API server (pid={process.pid}) terminated") except Exception as e: print(f"Failed to terminate API server: {e}") async def send_inference(idx, client: httpx.AsyncClient): try: url = f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions" data = { "model": "dummy", "messages": [{"role": "user", "content": f"hello {idx}"}], "metadata": {"min_tokens": 1000}, } resp = await client.post(url, json=data, timeout=20) return resp.status_code except Exception as e: print(f"infer {idx} error:", e) return None async def run_concurrent_inference(n): async with httpx.AsyncClient() as client: tasks = [send_inference(i, client) for i in range(n)] results = await asyncio.gather(*tasks, return_exceptions=True) return results def async_concurrency(n=10): print(f"Launching {n} concurrent async inference requests...") t0 = time.time() results = asyncio.run(run_concurrent_inference(n)) print("Done in", time.time() - t0, "seconds") print("Status codes:", results) def parse_prometheus_to_dict(metrics_text: str): """转换为dict格式""" result = {} for line in metrics_text.split("\n"): line = line.strip() # 跳过注释和空行 if not line or line.startswith("#"): continue if "{" in line: # 有 label metric_name = line.split("{", 1)[0] labels_str = line[line.index("{") + 1 : line.index("}")] value = float(line.split("}")[1].strip()) # 解析 labels labels = {} for kv in labels_str.split(","): if "=" not in kv: continue k, v = kv.split("=") labels[k] = v.strip('"') # 存储 if metric_name not in result: result[metric_name] = [] result[metric_name].append({"labels": labels, "value": value}) else: # 无 label metric_name, value_str = line.split() result[metric_name] = float(value_str) return result def get_metrics_dict(metrics_url): """获取metrics指标数据""" resp = requests.get(metrics_url, timeout=5) assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}" assert "text/plain" in resp.headers["Content-Type"], "Content-Type is not text/plain" # Parse Prometheus metrics data metrics_data = resp.text print(metrics_data) metrics_dict = parse_prometheus_to_dict(metrics_data) # print("\nParsed dict:") # print(metrics_dict) print("num_requests_running:", metrics_dict["fastdeploy:num_requests_running"]) print("num_requests_waiting", metrics_dict["fastdeploy:num_requests_waiting"]) return metrics_dict def test_metrics_with_clear_and_reset(): """ Test the metrics monitoring endpoint. """ metrics_url = f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" async_concurrency(n=10) time.sleep(0.3) # ===== clear_load_weight ===== clear_url = f"http://0.0.0.0:{FD_API_PORT}/clear_load_weight" print("Calling clear_load_weight...") r = requests.get(clear_url, timeout=30) assert r.status_code == 200, f"clear_load_weight failed: {r.status_code}" metrics = get_metrics_dict(metrics_url) running = metrics["fastdeploy:num_requests_running"] waiting = metrics["fastdeploy:num_requests_waiting"] print( "ASSERT after the clear_load_weight operation, the value is 0 (Request interruption stopped inference, and related requests were cleared):", running, "waiting:", waiting, ) # assert running == 0 and waiting == 0, "Expected both running and waiting to be 0 after clear_load_weight" if __name__ == "__main__": test_metrics_with_clear_and_reset()