# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import os
import shutil
import signal
import subprocess
import sys
import time

import httpx
import pytest
import requests

tests_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, tests_dir)

from e2e.utils.serving_utils import (
    FD_API_PORT,
    FD_CACHE_QUEUE_PORT,
    FD_ENGINE_QUEUE_PORT,
    FD_METRICS_PORT,
    clean_ports,
    is_port_open,
)


@pytest.fixture(scope="session", autouse=True)
def setup_and_run_server():
    """
    Pytest fixture that runs once per test session:
    - Cleans ports before tests
    - Starts the API server as a subprocess
    - Waits for the server port to open (up to 300 seconds)
    - Tears down the server after all tests finish
    """
    print("Pre-test port cleanup...")
    FD_CONTROLLER_PORT = int(os.getenv("FD_CONTROLLER_PORT", 8333))
    clean_ports([FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT, FD_CONTROLLER_PORT])

    env = os.environ.copy()
    env["CUDA_VISIBLE_DEVICES"] = "0,1"
    env["ENABLE_V1_KVCACHE_SCHEDULER"] = "1"

    base_path = os.getenv("MODEL_PATH")
    if base_path:
        model_path = os.path.join(base_path, "TP2")
    else:
        model_path = "./TP2"

    log_path = "server.log"
    cmd = [
        sys.executable,
        "-m",
        "fastdeploy.entrypoints.openai.api_server",
        "--model",
        model_path,
        "--port",
        str(FD_API_PORT),
        "--tensor-parallel-size",
        "2",
        "--engine-worker-queue-port",
        str(FD_ENGINE_QUEUE_PORT),
        "--metrics-port",
        str(FD_METRICS_PORT),
        "--cache-queue-port",
        str(FD_CACHE_QUEUE_PORT),
        "--controller-port",
        str(FD_CONTROLLER_PORT),
        "--max-model-len",
        "32768",
        "--max-num-seqs",
        "1",
        "--gpu-memory-utilization",
        "0.9",
        "--load-strategy",
        "ipc_snapshot",
        "--dynamic-load-weight",
    ]
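
    # For reference, the list above resolves to roughly the following command
    # (a sketch; the <...> placeholders stand in for the values imported above):
    #   python -m fastdeploy.entrypoints.openai.api_server --model <model_path> \
    #       --port <FD_API_PORT> --tensor-parallel-size 2 ... --dynamic-load-weight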

    # Start the subprocess in a new process group.
    # Clean up the log directory first.
    if os.path.exists("log"):
        shutil.rmtree("log")
    with open(log_path, "w") as logfile:
        process = subprocess.Popen(
            cmd,
            stdout=logfile,
            stderr=subprocess.STDOUT,
            start_new_session=True,  # Enables killing the full group via os.killpg
            env=env,
        )

    # Wait up to 300 seconds for the API server to be ready
    for _ in range(300):
        if is_port_open("127.0.0.1", FD_API_PORT):
            print(f"API server is up on port {FD_API_PORT}")
            break
        time.sleep(1)
    else:
        print("[TIMEOUT] API server failed to start in 5 minutes. Cleaning up...")
        try:
            os.killpg(process.pid, signal.SIGTERM)
        except Exception as e:
            print(f"Failed to kill process group: {e}")
        raise RuntimeError(f"API server did not start on port {FD_API_PORT}")

    yield  # Run tests

    print("\n===== Post-test server cleanup... =====")
    try:
        os.killpg(process.pid, signal.SIGTERM)
        print(f"API server (pid={process.pid}) terminated")
    except Exception as e:
        print(f"Failed to terminate API server: {e}")


async def send_inference(idx, client: httpx.AsyncClient):
    try:
        url = f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions"
        data = {
            "model": "dummy",
            "messages": [{"role": "user", "content": f"hello {idx}"}],
            "metadata": {"min_tokens": 1000},
        }
        resp = await client.post(url, json=data, timeout=20)
        return resp.status_code
    except Exception as e:
        print(f"infer {idx} error:", e)
        return None


async def run_concurrent_inference(n):
    async with httpx.AsyncClient() as client:
        tasks = [send_inference(i, client) for i in range(n)]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return results


def async_concurrency(n=10):
    print(f"Launching {n} concurrent async inference requests...")
    t0 = time.time()
    results = asyncio.run(run_concurrent_inference(n))
    print("Done in", time.time() - t0, "seconds")
    print("Status codes:", results)


def parse_prometheus_to_dict(metrics_text: str):
    """Convert Prometheus exposition text into a dict."""
    result = {}
    for line in metrics_text.split("\n"):
        line = line.strip()
        # Skip comments and blank lines
        if not line or line.startswith("#"):
            continue

        if "{" in line:  # labeled metric
            metric_name = line.split("{", 1)[0]
            labels_str = line[line.index("{") + 1 : line.index("}")]
            value = float(line.split("}")[1].strip())

            # Parse the labels
            labels = {}
            for kv in labels_str.split(","):
                if "=" not in kv:
                    continue
                # Split on the first "=" only, in case a label value contains "="
                k, v = kv.split("=", 1)
                labels[k] = v.strip('"')

            # Store the sample under its metric name
            if metric_name not in result:
                result[metric_name] = []
            result[metric_name].append({"labels": labels, "value": value})

        else:  # unlabeled metric
            metric_name, value_str = line.split()
            result[metric_name] = float(value_str)

    return result
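

# A minimal sketch (not exercised by the tests) of what parse_prometheus_to_dict
# returns; the "engine" label and the values below are illustrative only:
#
#     sample = 'fastdeploy:num_requests_running{engine="0"} 2\nfastdeploy:num_requests_waiting 5\n'
#     parse_prometheus_to_dict(sample)
#     # -> {"fastdeploy:num_requests_running": [{"labels": {"engine": "0"}, "value": 2.0}],
#     #     "fastdeploy:num_requests_waiting": 5.0}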


def get_metrics_dict(metrics_url):
    """Fetch the metrics endpoint and return the parsed data."""
    resp = requests.get(metrics_url, timeout=5)

    assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}"
    assert "text/plain" in resp.headers["Content-Type"], "Content-Type is not text/plain"

    # Parse Prometheus metrics data
    metrics_data = resp.text
    print(metrics_data)
    metrics_dict = parse_prometheus_to_dict(metrics_data)
    # print("\nParsed dict:")
    # print(metrics_dict)
    print("num_requests_running:", metrics_dict["fastdeploy:num_requests_running"])
    print("num_requests_waiting:", metrics_dict["fastdeploy:num_requests_waiting"])

    return metrics_dict
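

# Typical call, assuming the session fixture above has started the server
# (this mirrors the currently disabled test below):
#     metrics = get_metrics_dict(f"http://0.0.0.0:{FD_METRICS_PORT}/metrics")
#     running = metrics["fastdeploy:num_requests_running"]
#     waiting = metrics["fastdeploy:num_requests_waiting"]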


def test_metrics_with_clear_and_reset():
    """
    Test the metrics monitoring endpoint.
    """
    pass  # not stable; uncomment after the bug fix
    # metrics_url = f"http://0.0.0.0:{FD_METRICS_PORT}/metrics"

    # async_concurrency(n=10)

    # time.sleep(0.3)

    # ===== clear_load_weight =====
    # clear_url = f"http://0.0.0.0:{FD_API_PORT}/clear_load_weight"
    # print("Calling clear_load_weight...")
    # r = requests.get(clear_url, timeout=30)
    # assert r.status_code == 200, f"clear_load_weight failed: {r.status_code}"

    # metrics = get_metrics_dict(metrics_url)
    # running = metrics["fastdeploy:num_requests_running"]
    # waiting = metrics["fastdeploy:num_requests_waiting"]

    # print(
    #     "ASSERT that after clear_load_weight both counters are 0"
    #     " (the interruption stopped inference and cleared the related requests). running:",
    #     running,
    #     "waiting:",
    #     waiting,
    # )
    # assert running == 0 and waiting == 0, "Expected both running and waiting to be 0 after clear_load_weight"


if __name__ == "__main__":
    test_metrics_with_clear_and_reset()