mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Feature] Support KV Cache Storage (#5571)
* Support Mooncake Store
* up
* up
* add op
* fix conflict
* fix error
* up for comments
* avoid thread lock
* up
* fix unittest
* fix unittest
* remove debug info
* consider tp_size > 1
* add default rdma_nics
* add utils
* up
* fix error

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
@@ -102,7 +102,7 @@ def metrics_summary(metrics, token_timestamps):
|
||||
|
||||
# prefill 总耗时
|
||||
summary["prefill_cost_time"] = safe_cost(m0.get("send_request_output_to_decode_time"), arrival_time)
|
||||
# prefill准备耗时
|
||||
# prefill准备总耗时
|
||||
summary["prefill_prepare_cost_time"] = safe_cost(inference_start_time, arrival_time)
|
||||
# 预处理耗时
|
||||
summary["preprocess_cost_time"] = safe_cost(m0.get("scheduler_recv_req_time"), arrival_time)
|
||||
@@ -114,6 +114,10 @@ def metrics_summary(metrics, token_timestamps):
|
||||
summary["ask_decode_resource_cost_time"] = safe_cost(
|
||||
m0.get("ask_decode_resource_finish_time"), m0.get("ask_decode_resource_start_time")
|
||||
)
|
||||
# scheduler调度耗时
|
||||
summary["schedule_cost_time"] = safe_cost(
|
||||
m0.get("inference_start_time"), m0.get("ask_decode_resource_finish_time")
|
||||
)
|
||||
# prefill 的首 token 推理耗时
|
||||
summary["prefill_first_token_infer_cost_time"] = safe_cost(
|
||||
m0.get("engine_recv_first_token_time"), inference_start_time
|
||||
@@ -143,6 +147,19 @@ def metrics_summary(metrics, token_timestamps):
|
||||
token_timestamps[1], m_last.get("decode_recv_second_token_time")
|
||||
)
|
||||
|
||||
# MIX 模式下,scheduler调度耗时
|
||||
summary["mixed_schedule_cost_time"] = safe_cost(m0.get("inference_start_time"), m0.get("engine_get_req_time"))
|
||||
# MIX 模式下,返回首 token 链路耗时
|
||||
summary["mixed_first_token_transmission_cost_time"] = safe_cost(
|
||||
token_timestamps[0], m0.get("engine_recv_first_token_time")
|
||||
)
|
||||
|
||||
summary["gpu_cache_token_num"] = m0.get("gpu_cache_token_num")
|
||||
summary["cpu_cache_token_num"] = m0.get("cpu_cache_token_num")
|
||||
summary["storage_cache_token_num"] = m0.get("storage_cache_token_num")
|
||||
summary["gpu_cpu_cache_prepare_time"] = m0.get("gpu_cpu_cache_prepare_time")
|
||||
summary["storage_cache_prepare_time"] = m0.get("storage_cache_prepare_time")
|
||||
|
||||
return summary
|
||||
|
||||
|
||||
|
||||
@@ -695,7 +695,7 @@ async def benchmark(
|
||||
print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
|
||||
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
|
||||
|
||||
def process_pd_metrics(model_outputs, metric_key):
|
||||
def process_pd_metrics(model_outputs, metric_key, is_time=True):
|
||||
# 收集所有该 metric 的数值
|
||||
values = []
|
||||
percentiles = []
|
||||
@@ -712,24 +712,29 @@ async def benchmark(
|
||||
print(f"[WARN] metric_key '{metric_key}' not found in outputs.")
|
||||
return
|
||||
|
||||
arr = np.array(values) * 1000 # 秒 -> 毫秒
|
||||
if is_time:
|
||||
arr = np.array(values) * 1000 # 秒 -> 毫秒
|
||||
suffix = "(ms)"
|
||||
else:
|
||||
arr = np.array(values)
|
||||
suffix = ""
|
||||
|
||||
print("{s:{c}^{n}}".format(s=metric_key, n=50, c="-"))
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Mean {metric_key} (ms):",
|
||||
f"Mean {metric_key} {suffix}:",
|
||||
np.mean(arr),
|
||||
)
|
||||
)
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
f"Median {metric_key} (ms):",
|
||||
f"Median {metric_key} {suffix}:",
|
||||
np.median(arr),
|
||||
)
|
||||
)
|
||||
for p in percentiles:
|
||||
v = np.percentile(arr, p)
|
||||
print("{:<40} {:<10.2f}".format(f"P{str(int(p)) if int(p) == p else str(p)} {metric_key} (ms):", v))
|
||||
print("{:<40} {:<10.2f}".format(f"P{str(int(p)) if int(p) == p else str(p)} {metric_key} {suffix}:", v))
|
||||
# print(f"P{str(int(p)) if int(p) == p else str(p)} {metric_key} (ms): {v:10.2f}")
|
||||
print(
|
||||
"{:<40} {:<10.2f}".format(
|
||||
@@ -785,6 +790,7 @@ async def benchmark(
|
||||
process_pd_metrics(outputs, "prefill_prepare_cost_time")
|
||||
process_pd_metrics(outputs, "preprocess_cost_time")
|
||||
process_pd_metrics(outputs, "cache_in_scheduler_cost_time")
|
||||
process_pd_metrics(outputs, "schedule_cost_time")
|
||||
process_pd_metrics(outputs, "ask_decode_resource_cost_time")
|
||||
process_pd_metrics(outputs, "prefill_first_token_infer_cost_time")
|
||||
process_pd_metrics(outputs, "wait_sending_cache_cost_time")
|
||||
@@ -793,6 +799,12 @@ async def benchmark(
|
||||
process_pd_metrics(outputs, "decode_second_token_infer_cost_time")
|
||||
process_pd_metrics(outputs, "first_token_transmission_cost_time")
|
||||
process_pd_metrics(outputs, "second_token_transmission_cost_time")
|
||||
process_pd_metrics(outputs, "mixed_schedule_cost_time")
|
||||
process_pd_metrics(outputs, "gpu_cache_token_num", is_time=False)
|
||||
process_pd_metrics(outputs, "cpu_cache_token_num", is_time=False)
|
||||
process_pd_metrics(outputs, "storage_cache_token_num", is_time=False)
|
||||
process_pd_metrics(outputs, "gpu_cpu_cache_prepare_time")
|
||||
process_pd_metrics(outputs, "storage_cache_prepare_time")
|
||||
process_one_length("input_len", "Cached Tokens", "Cached Tokens")
|
||||
process_one_length("s_input_len", "Input Length", "Infer Input Length")
|
||||
process_one_length("reasoning_len", "Reasoning Lenth", "思考长度")
|
||||
|
||||
Reference in New Issue
Block a user