From 18e79dd660d723e239a47a2d5dddf0a47abf0a3f Mon Sep 17 00:00:00 2001 From: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Date: Mon, 9 Feb 2026 10:27:56 +0800 Subject: [PATCH] [Metrics] Support cpu-cache-block-num (#6390) Co-authored-by: root --- docs/online_serving/metrics.md | 1 + docs/zh/online_serving/metrics.md | 3 ++- fastdeploy/cache_manager/prefix_cache_manager.py | 2 ++ fastdeploy/metrics/metrics.py | 9 ++++++++- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/online_serving/metrics.md b/docs/online_serving/metrics.md index d7fe4f2a6a..8e2dd28688 100644 --- a/docs/online_serving/metrics.md +++ b/docs/online_serving/metrics.md @@ -37,6 +37,7 @@ After FastDeploy is launched, it supports continuous monitoring of the FastDeplo | KV Cache | `fastdeploy:available_gpu_block_num` | Gauge | Available GPU blocks in cache (including unreleased prefix blocks) | count | | KV Cache | `fastdeploy:free_gpu_block_num` | Gauge | Number of free GPU blocks in cache | count | | KV Cache | `fastdeploy:max_gpu_block_num` | Gauge | Total number of GPU blocks initialized at startup | count | +| KV Cache | `fastdeploy:max_cpu_block_num` | Gauge | Total number of CPU blocks initialized at startup | count | | KV Cache | `fastdeploy:available_gpu_resource` | Gauge | Ratio of available GPU blocks to total GPU blocks | % | | KV Cache | `fastdeploy:gpu_cache_usage_perc` | Gauge | GPU KV cache utilization | % | | KV Cache | `fastdeploy:send_cache_failed_num` | Counter | Total number of cache send failures | count | diff --git a/docs/zh/online_serving/metrics.md b/docs/zh/online_serving/metrics.md index 26d00baa42..630f68e2ff 100644 --- a/docs/zh/online_serving/metrics.md +++ b/docs/zh/online_serving/metrics.md @@ -36,7 +36,8 @@ | KV缓存 | `fastdeploy:prefix_cpu_cache_token_num` | Counter | 位于 CPU 上的前缀缓存 token 总数 | 个 | | KV缓存 | `fastdeploy:available_gpu_block_num` | Gauge | 缓存中可用的 GPU 块数量(包含尚未正式释放的前缀缓存块)| 个 | | KV缓存 | `fastdeploy:free_gpu_block_num` | Gauge | 缓存中的可用块数 | 个 | -| KV缓存 | `fastdeploy:max_gpu_block_num` | Gauge | 服务启动时确定的总块数 | 个 | +| KV缓存 | `fastdeploy:max_gpu_block_num` | Gauge | 服务启动时确定的 GPU 总块数 | 个 | +| KV缓存 | `fastdeploy:max_cpu_block_num` | Gauge | 服务启动时确定的 CPU 总块数 | 个 | | KV缓存 | `fastdeploy:available_gpu_resource` | Gauge | 可用块占比,即可用 GPU 块数量 / 最大GPU块数量| 百分比 | | KV缓存 | `fastdeploy:gpu_cache_usage_perc` | Gauge | GPU 上的 KV 缓存使用率 | 百分比 | | KV缓存 | `fastdeploy:send_cache_failed_num` | Counter | 发送缓存失败的总次数 | 个 | diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index 945523fe90..572c6ed0b0 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -129,6 +129,7 @@ class PrefixCacheManager: ) main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks) + main_process_metrics.max_cpu_block_num.set(self.num_cpu_blocks) main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks) main_process_metrics.free_gpu_block_num.set(self.num_gpu_blocks) main_process_metrics.available_gpu_resource.set(1.0) @@ -457,6 +458,7 @@ class PrefixCacheManager: self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks)) main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks) + main_process_metrics.max_cpu_block_num.set(self.num_cpu_blocks) main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks) main_process_metrics.free_gpu_block_num.set(self.num_gpu_blocks) main_process_metrics.available_gpu_resource.set(1.0) diff --git a/fastdeploy/metrics/metrics.py b/fastdeploy/metrics/metrics.py index 719ef4a887..2384e57c7b 100644 --- a/fastdeploy/metrics/metrics.py +++ b/fastdeploy/metrics/metrics.py @@ -158,6 +158,7 @@ class MetricsManager: available_gpu_block_num: "Gauge" free_gpu_block_num: "Gauge" max_gpu_block_num: "Gauge" + max_cpu_block_num: "Gauge" available_gpu_resource: "Gauge" requests_number: "Counter" send_cache_failed_num: "Counter" @@ -235,7 +236,13 @@ class MetricsManager: "max_gpu_block_num": { "type": Gauge, "name": "fastdeploy:max_gpu_block_num", - "description": "Number of total blocks determined when service started", + "description": "Number of total GPU blocks determined when service started", + "kwargs": {}, + }, + "max_cpu_block_num": { + "type": Gauge, + "name": "fastdeploy:max_cpu_block_num", + "description": "Number of total CPU blocks determined when service started", "kwargs": {}, }, "available_gpu_resource": {