mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Metrics] Support cpu-cache-block-num (#6390)
Co-authored-by: root <root@szzj-bcc-offline-1487319.szzj.baidu.com>
This commit is contained in:
@@ -37,6 +37,7 @@ After FastDeploy is launched, it supports continuous monitoring of the FastDeplo
|
|||||||
| KV Cache | `fastdeploy:available_gpu_block_num` | Gauge | Available GPU blocks in cache (including unreleased prefix blocks) | count |
|
| KV Cache | `fastdeploy:available_gpu_block_num` | Gauge | Available GPU blocks in cache (including unreleased prefix blocks) | count |
|
||||||
| KV Cache | `fastdeploy:free_gpu_block_num` | Gauge | Number of free GPU blocks in cache | count |
|
| KV Cache | `fastdeploy:free_gpu_block_num` | Gauge | Number of free GPU blocks in cache | count |
|
||||||
| KV Cache | `fastdeploy:max_gpu_block_num` | Gauge | Total number of GPU blocks initialized at startup | count |
|
| KV Cache | `fastdeploy:max_gpu_block_num` | Gauge | Total number of GPU blocks initialized at startup | count |
|
||||||
|
| KV Cache | `fastdeploy:max_cpu_block_num` | Gauge | Total number of CPU blocks initialized at startup | count |
|
||||||
| KV Cache | `fastdeploy:available_gpu_resource` | Gauge | Ratio of available GPU blocks to total GPU blocks | % |
|
| KV Cache | `fastdeploy:available_gpu_resource` | Gauge | Ratio of available GPU blocks to total GPU blocks | % |
|
||||||
| KV Cache | `fastdeploy:gpu_cache_usage_perc` | Gauge | GPU KV cache utilization | % |
|
| KV Cache | `fastdeploy:gpu_cache_usage_perc` | Gauge | GPU KV cache utilization | % |
|
||||||
| KV Cache | `fastdeploy:send_cache_failed_num` | Counter | Total number of cache send failures | count |
|
| KV Cache | `fastdeploy:send_cache_failed_num` | Counter | Total number of cache send failures | count |
|
||||||
|
|||||||
@@ -36,7 +36,8 @@
|
|||||||
| KV缓存 | `fastdeploy:prefix_cpu_cache_token_num` | Counter | 位于 CPU 上的前缀缓存 token 总数 | 个 |
|
| KV缓存 | `fastdeploy:prefix_cpu_cache_token_num` | Counter | 位于 CPU 上的前缀缓存 token 总数 | 个 |
|
||||||
| KV缓存 | `fastdeploy:available_gpu_block_num` | Gauge | 缓存中可用的 GPU 块数量(包含尚未正式释放的前缀缓存块) | 个 |
|
| KV缓存 | `fastdeploy:available_gpu_block_num` | Gauge | 缓存中可用的 GPU 块数量(包含尚未正式释放的前缀缓存块) | 个 |
|
||||||
| KV缓存 | `fastdeploy:free_gpu_block_num` | Gauge | 缓存中空闲的 GPU 块数量 | 个 |
|
| KV缓存 | `fastdeploy:free_gpu_block_num` | Gauge | 缓存中空闲的 GPU 块数量 | 个 |
|
||||||
| KV缓存 | `fastdeploy:max_gpu_block_num` | Gauge | 服务启动时确定的总块数 | 个 |
|
| KV缓存 | `fastdeploy:max_gpu_block_num` | Gauge | 服务启动时确定的 GPU 总块数 | 个 |
|
||||||
|
| KV缓存 | `fastdeploy:max_cpu_block_num` | Gauge | 服务启动时确定的 CPU 总块数 | 个 |
|
||||||
| KV缓存 | `fastdeploy:available_gpu_resource` | Gauge | 可用块占比,即可用 GPU 块数量 / 最大 GPU 块数量 | 百分比 |
|
| KV缓存 | `fastdeploy:available_gpu_resource` | Gauge | 可用块占比,即可用 GPU 块数量 / 最大 GPU 块数量 | 百分比 |
|
||||||
| KV缓存 | `fastdeploy:gpu_cache_usage_perc` | Gauge | GPU 上的 KV 缓存使用率 | 百分比 |
|
| KV缓存 | `fastdeploy:gpu_cache_usage_perc` | Gauge | GPU 上的 KV 缓存使用率 | 百分比 |
|
||||||
| KV缓存 | `fastdeploy:send_cache_failed_num` | Counter | 发送缓存失败的总次数 | 个 |
|
| KV缓存 | `fastdeploy:send_cache_failed_num` | Counter | 发送缓存失败的总次数 | 个 |
|
||||||
|
|||||||
@@ -129,6 +129,7 @@ class PrefixCacheManager:
|
|||||||
)
|
)
|
||||||
|
|
||||||
main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
|
main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
|
||||||
|
main_process_metrics.max_cpu_block_num.set(self.num_cpu_blocks)
|
||||||
main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
|
main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
|
||||||
main_process_metrics.free_gpu_block_num.set(self.num_gpu_blocks)
|
main_process_metrics.free_gpu_block_num.set(self.num_gpu_blocks)
|
||||||
main_process_metrics.available_gpu_resource.set(1.0)
|
main_process_metrics.available_gpu_resource.set(1.0)
|
||||||
@@ -457,6 +458,7 @@ class PrefixCacheManager:
|
|||||||
self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
|
self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
|
||||||
|
|
||||||
main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
|
main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
|
||||||
|
main_process_metrics.max_cpu_block_num.set(self.num_cpu_blocks)
|
||||||
main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
|
main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
|
||||||
main_process_metrics.free_gpu_block_num.set(self.num_gpu_blocks)
|
main_process_metrics.free_gpu_block_num.set(self.num_gpu_blocks)
|
||||||
main_process_metrics.available_gpu_resource.set(1.0)
|
main_process_metrics.available_gpu_resource.set(1.0)
|
||||||
|
|||||||
@@ -158,6 +158,7 @@ class MetricsManager:
|
|||||||
available_gpu_block_num: "Gauge"
|
available_gpu_block_num: "Gauge"
|
||||||
free_gpu_block_num: "Gauge"
|
free_gpu_block_num: "Gauge"
|
||||||
max_gpu_block_num: "Gauge"
|
max_gpu_block_num: "Gauge"
|
||||||
|
max_cpu_block_num: "Gauge"
|
||||||
available_gpu_resource: "Gauge"
|
available_gpu_resource: "Gauge"
|
||||||
requests_number: "Counter"
|
requests_number: "Counter"
|
||||||
send_cache_failed_num: "Counter"
|
send_cache_failed_num: "Counter"
|
||||||
@@ -235,7 +236,13 @@ class MetricsManager:
|
|||||||
"max_gpu_block_num": {
|
"max_gpu_block_num": {
|
||||||
"type": Gauge,
|
"type": Gauge,
|
||||||
"name": "fastdeploy:max_gpu_block_num",
|
"name": "fastdeploy:max_gpu_block_num",
|
||||||
"description": "Number of total blocks determined when service started",
|
"description": "Number of total GPU blocks determined when service started",
|
||||||
|
"kwargs": {},
|
||||||
|
},
|
||||||
|
"max_cpu_block_num": {
|
||||||
|
"type": Gauge,
|
||||||
|
"name": "fastdeploy:max_cpu_block_num",
|
||||||
|
"description": "Number of total CPU blocks determined when service started",
|
||||||
"kwargs": {},
|
"kwargs": {},
|
||||||
},
|
},
|
||||||
"available_gpu_resource": {
|
"available_gpu_resource": {
|
||||||
|
|||||||
Reference in New Issue
Block a user