[BugFix] fix speculative gauge metrics in multi api server (#7082)

This commit is contained in:
Yonghua Li
2026-03-31 10:52:50 +08:00
committed by GitHub
parent 6727df8286
commit 6d9739f360
5 changed files with 147 additions and 48 deletions
-9
View File
@@ -69,19 +69,10 @@ from fastdeploy.utils import _is_package_installed, envs
# Configure root logger
_configure_logger()
import uuid
# suppress warning log from paddlepaddle
os.environ["GLOG_minloglevel"] = "2"
# suppress log from aistudio
os.environ["AISTUDIO_LOG"] = "critical"
# set prometheus dir
if os.getenv("PROMETHEUS_MULTIPROC_DIR", "") == "":
prom_dir = f"/tmp/fd_prom_{str(uuid.uuid4())}"
os.environ["PROMETHEUS_MULTIPROC_DIR"] = prom_dir
if os.path.exists(prom_dir):
os.rmdir(prom_dir)
os.mkdir(prom_dir)
import typing
@@ -107,6 +107,15 @@ def start_servers(
env = os.environ.copy()
env["FD_ENABLE_MULTI_API_SERVER"] = "1"
env["FD_LOG_DIR"] = env.get("FD_LOG_DIR", "log") + f"/log_{i}"
if "PROMETHEUS_MULTIPROC_DIR" in env:
prom_dir = env.get("PROMETHEUS_MULTIPROC_DIR")
prom_dir_i = os.path.join(os.path.dirname(prom_dir), os.path.basename(prom_dir) + f"_dp{i}")
# Create the directory if it doesn't exist
if not os.path.exists(prom_dir_i):
os.makedirs(prom_dir_i, exist_ok=True)
env["PROMETHEUS_MULTIPROC_DIR"] = prom_dir_i
logger.info(f"Set PROMETHEUS_MULTIPROC_DIR for DP {i}: {prom_dir_i}")
cmd = [
sys.executable,
"-m",
+41 -13
View File
@@ -66,7 +66,7 @@ class SimpleCollector(Collector):
Metric: Prometheus Metric objects that are not excluded.
"""
for metric in self.base_registry.collect():
if not any(name.startswith(metric.name) for name in self.exclude_names):
if not any(metric.name.startswith(name) for name in self.exclude_names):
yield metric
@@ -84,11 +84,15 @@ def get_filtered_metrics() -> str:
multiprocess.MultiProcessCollector(base_registry)
filtered_registry = CollectorRegistry()
# 注册一个新的colletor,过滤gauge指标
filtered_registry.register(SimpleCollector(base_registry, EXCLUDE_LABELS))
# 动态获取需要排除的 gauge 指标列表
exclude_labels = main_process_metrics.get_excluded_metrics()
# 注册一个新的collector,过滤gauge指标
filtered_registry.register(SimpleCollector(base_registry, exclude_labels))
# 将gauge指标重新注册到filtered_registry中,从内存中读取
main_process_metrics.re_register_gauge(filtered_registry)
# 将speculative中的gauge指标也重新注册
main_process_metrics.re_register_speculative_gauge(filtered_registry)
return generate_latest(filtered_registry).decode("utf-8")
@@ -196,7 +200,7 @@ class MetricsManager:
"type": Gauge,
"name": "fastdeploy:num_requests_running",
"description": "Number of requests currently running",
"kwargs": {"multiprocess_mode": "sum"},
"kwargs": {},
},
"num_requests_waiting": {
"type": Gauge,
@@ -626,19 +630,22 @@ class MetricsManager:
# 在模块加载,指标注册先设置Prometheus环境变量
setup_multiprocess_prometheus()
# 动态创建所有指标
# 动态创建所有非 gauge 型指标
for metric_name, config in self.METRICS.items():
setattr(
self,
metric_name,
config["type"](config["name"], config["description"], **config["kwargs"]),
)
# 动态创建所有指标
# 动态创建所有 gauge 型指标,统一配置 multiprocess_mode 为 livesum
for metric_name, config in self.GAUGE_METRICS.items():
kwargs = config["kwargs"].copy()
if "multiprocess_mode" not in kwargs:
kwargs["multiprocess_mode"] = "livesum"
setattr(
self,
metric_name,
config["type"](config["name"], config["description"], **config["kwargs"]),
config["type"](config["name"], config["description"], **kwargs),
)
# 动态创建server metrics
for metric_name, config in self.SERVER_METRICS.items():
@@ -696,17 +703,22 @@ class MetricsManager:
Gauge(
f"{config['name']}_{i}",
f"{config['description']} (head {i})",
multiprocess_mode="livesum",
)
)
setattr(self, metric_name, gauges)
else:
# For Gauge metrics, automatically add multiprocess_mode="livesum"
kwargs = config["kwargs"].copy()
if config["type"] == Gauge and "multiprocess_mode" not in kwargs:
kwargs["multiprocess_mode"] = "livesum"
setattr(
self,
metric_name,
config["type"](
config["name"],
config["description"],
**config["kwargs"],
**kwargs,
),
)
@@ -767,6 +779,19 @@ class MetricsManager:
else:
registry.register(getattr(self, metric_name))
def re_register_speculative_gauge(self, registry: CollectorRegistry):
    """Re-register every speculative-decoding Gauge with *registry*.

    Mirrors ``re_register_gauge``: these gauges are then served from this
    process's in-memory values instead of the multiprocess value files.
    """
    # _init_speculative_metrics creates these as instance attributes; if it
    # never ran in this process there is nothing to register.
    if not hasattr(self, "spec_decode_draft_acceptance_rate"):
        return
    per_head_name = "spec_decode_draft_single_head_acceptance_rate"
    for metric_name, spec in self.SPECULATIVE_METRICS.items():
        if metric_name == per_head_name:
            # Stored as one Gauge per draft head — register each element.
            for head_gauge in getattr(self, metric_name):
                registry.register(head_gauge)
        elif spec["type"] == Gauge:
            registry.register(getattr(self, metric_name))
def re_register_gauge(self, registry: CollectorRegistry):
"""Re-register gauge to the specified registry"""
for metric_name in self.GAUGE_METRICS:
@@ -790,10 +815,15 @@ class MetricsManager:
if hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
self.register_speculative_metrics(registry)
def get_excluded_metrics(self) -> Set[str]:
    """Get the set of metric names that need to be excluded from the
    multiprocess-collected output (they are re-registered from memory).

    Returns:
        Set[str]: names of every Gauge metric, including the speculative
        ones when they have been initialized on this instance.
    """
    excluded = {config["name"] for config in self.GAUGE_METRICS.values()}
    # Also add gauge metrics from SPECULATIVE_METRICS (if initialized);
    # the per-head metric is declared with type list[Gauge].
    if hasattr(self, "SPECULATIVE_METRICS"):
        for config in self.SPECULATIVE_METRICS.values():
            if config["type"] == Gauge or config["type"] == list[Gauge]:
                excluded.add(config["name"])
    return excluded
main_process_metrics = MetricsManager()
@@ -801,5 +831,3 @@ main_process_metrics = MetricsManager()
# 由于zmq指标记录比较耗时,默认不开启,通过DEBUG参数开启
if envs.FD_DEBUG:
main_process_metrics.init_zmq_metrics()
EXCLUDE_LABELS = MetricsManager.get_excluded_metrics()
@@ -180,6 +180,58 @@ class TestMultiApiServer(unittest.TestCase):
mock_proc1.wait.assert_called_once()
mock_proc2.wait.assert_called_once()
@patch("fastdeploy.entrypoints.openai.multi_api_server.subprocess.Popen")
@patch("fastdeploy.entrypoints.openai.multi_api_server.is_port_available")
def test_prometheus_multiprocess_dir_per_dp(self, mock_is_port_available, mock_popen):
    """Test that each DP server gets a unique PROMETHEUS_MULTIPROC_DIR"""
    # Pretend every requested port is free.
    mock_is_port_available.return_value = True

    # Record the environment handed to each spawned server process.
    captured_envs = []

    def fake_popen(*args, **kwargs):
        captured_envs.append(kwargs.get("env", {}).copy())
        proc = MagicMock()
        proc.pid = 1000 + len(captured_envs)
        return proc

    mock_popen.side_effect = fake_popen

    # Launch two DP servers.
    processes = start_servers(
        server_count=2,
        device_count=2,
        server_args=self.test_server_args,
        ports="8000,8001",
        metrics_ports="8800,8801",
        controller_ports="-1",
    )

    # Exactly one spawn (and one captured env) per DP rank.
    self.assertEqual(mock_popen.call_count, 2)
    self.assertEqual(len(captured_envs), 2)
    self.assertEqual(len(processes), 2)

    prom_dirs = []
    for i, env in enumerate(captured_envs):
        prom_dir = env.get("PROMETHEUS_MULTIPROC_DIR")
        print(f"Server {i} PROMETHEUS_MULTIPROC_DIR: {prom_dir}")
        self.assertIsNotNone(prom_dir, f"Server {i} should have PROMETHEUS_MULTIPROC_DIR set")
        prom_dirs.append(prom_dir)

    # No two ranks may share a metrics directory.
    self.assertEqual(
        len(prom_dirs), len(set(prom_dirs)), "Each DP server should have a unique PROMETHEUS_MULTIPROC_DIR"
    )

    # Each directory name must encode its rank via a "_dp<i>" suffix.
    for i, prom_dir in enumerate(prom_dirs):
        self.assertIn(f"_dp{i}", prom_dir, f"PROMETHEUS_MULTIPROC_DIR for server {i} should contain _dp{i}")
# Standard entry point so this test module can be run directly.
if __name__ == "__main__":
    unittest.main()
+45 -26
View File
@@ -14,46 +14,65 @@
# limitations under the License.
"""
import os
import unittest
from unittest.mock import patch
from prometheus_client import Gauge
from fastdeploy.metrics.metrics import get_filtered_metrics
from fastdeploy.metrics.metrics import get_filtered_metrics, main_process_metrics
from fastdeploy.spec_decode import SpecMethod
class TestGetFilteredMetrics(unittest.TestCase):
def test_filtered_and_custom_metrics(self):
"""
Test get_filtered_metrics function:
1. Exclude specific metrics from base_registry
2. Keep other metrics in base_registry
3. Ensure metrics registered by extra_register_func are effective
"""
# Simulated metrics in base_registry (Gauge instances)
g_keep = Gauge("metric_to_keep", "Kept metric")
g_keep.set(1.23)
g_exclude = Gauge("metric_to_exclude", "Excluded metric")
g_exclude.set(99)
# Fake MultiProcessCollector: register our simulated metrics
def _collect_metrics_with_mocked_multiprocess(self, metric_name, multiprocess_value):
    """Render the filtered metrics output while faking the multiprocess collector.

    The fake collector injects a conflicting pid-labelled sample for
    *metric_name* with *multiprocess_value*; the filtered output should keep
    only the in-memory gauge value and drop the pid-labelled one.

    Returns:
        str: the rendered Prometheus exposition text.
    """

    def fake_multiprocess_collector(registry):
        # Simulate a stale sample left behind by another worker process.
        gauge = Gauge(metric_name, f"fake metric for {metric_name}", ["pid"], registry=registry)
        gauge.labels(pid="10001").set(multiprocess_value)

    with (
        patch.dict(os.environ, {"PROMETHEUS_MULTIPROC_DIR": "/tmp/fake-prometheus-multiproc-dir"}, clear=False),
        patch(
            "fastdeploy.metrics.metrics.multiprocess.MultiProcessCollector",
            side_effect=fake_multiprocess_collector,
        ),
    ):
        return get_filtered_metrics()
print("==== result ====\n", result)
def _assert_unique_metric_value(self, metrics_text, metric_name, expected_value):
    """Assert *metrics_text* has exactly one sample line for *metric_name*
    with *expected_value*, and no pid-labelled samples anywhere."""
    prefix = f"{metric_name} "
    sample_lines = [line for line in metrics_text.splitlines() if line.startswith(prefix)]
    self.assertEqual(sample_lines, [f"{metric_name} {expected_value}"])
    self.assertNotIn("pid=", metrics_text)
# 2. Kept metric should appear
self.assertIn("metric_to_keep", result)
def test_regular_gauge_returns_single_value_without_pid(self):
    """A regular gauge must report its in-memory value exactly once, with no
    pid label, even when the multiprocess collector offers a stale sample."""
    metric = main_process_metrics.batch_size
    metric.set(8.0)
    # The mocked collector injects a conflicting pid-labelled 1008.0 sample;
    # only the in-memory 8.0 may survive filtering.
    result = self._collect_metrics_with_mocked_multiprocess(metric._name, multiprocess_value=1008.0)
    self._assert_unique_metric_value(result, metric._name, 8.0)
def test_speculative_gauge_returns_single_value_without_pid(self):
    """The speculative acceptance-rate gauge is served from memory: exactly
    one sample, no pid label, multiprocess value ignored."""
    # Lazily create the speculative metrics if no earlier test did so.
    if not hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
        main_process_metrics._init_speculative_metrics(SpecMethod.MTP, 2)
    gauge = main_process_metrics.spec_decode_draft_acceptance_rate
    gauge.set(0.75)
    rendered = self._collect_metrics_with_mocked_multiprocess(gauge._name, multiprocess_value=1000.75)
    self._assert_unique_metric_value(rendered, gauge._name, 0.75)
def test_speculative_single_head_gauge_returns_single_value_without_pid(self):
    """Per-head speculative gauges (list of Gauge) behave like regular ones:
    one in-memory sample each, no pid label."""
    # Lazily create the speculative metrics if no earlier test did so.
    if not hasattr(main_process_metrics, "spec_decode_draft_acceptance_rate"):
        main_process_metrics._init_speculative_metrics(SpecMethod.MTP, 2)
    head_gauge = main_process_metrics.spec_decode_draft_single_head_acceptance_rate[0]
    head_gauge.set(0.6)
    rendered = self._collect_metrics_with_mocked_multiprocess(head_gauge._name, multiprocess_value=1000.6)
    self._assert_unique_metric_value(rendered, head_gauge._name, 0.6)
if __name__ == "__main__":