mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Feature] 统一请求完成日志格式并增强统计信息 (#6405)
将原来分散的两行日志合并为一行,同时增加更多统计信息展示。
主要变更:
- 整合原有的 "Request finished" 和 "token ratio" 两行日志为单行格式
- 新增 InputToken:输入token数量
- 新增 CachedDetail:缓存详情(包含CachedToken/GPU/CPU)
- 新增 OutputToken:输出token数量
- 新增 TTFT:首Token时延(秒)
- 新增 E2E:整句时延(秒)
- 保留 IsPrefill 和 RecoveryStop 标志
新日志格式示例:
Request=chatcmpl-xxx, InputToken=18, CachedDetail={"CachedToken": 0, "GPU": 0, "CPU": 0}, OutputToken=247, TokenRatio=315.77, TTFT=0.02, E2E=0.78, IsPrefill=False, RecoveryStop=False
Co-authored-by: Ducc <ducc@baidu.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -230,13 +230,31 @@ class TokenProcessor:
result.finished = True
if recovery_stop:
result.error_msg = "Recover is not supported, the result is incomplete!"
llm_logger.info(
f"Request: {task_id} finished, number of " f"generated tokens: {self.tokens_counter[task_id]}."
)

# Calculate statistics for the combined log
is_decode = self.cfg.scheduler_config.splitwise_role == "decode"
inference_start_time = task.metrics.get_inference_start_time(is_decode)
token_ratio = self.tokens_counter[task_id] / (time.time() - inference_start_time)
llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}")
task.metrics.cal_cost_time()
e2e_time = current_time - inference_start_time
token_ratio = self.tokens_counter[task_id] / e2e_time

# Get cache information
gpu_cache = getattr(task.metrics, "gpu_cache_token_num", 0)
cpu_cache = getattr(task.metrics, "cpu_cache_token_num", 0)
total_cached = gpu_cache + cpu_cache

# Build cached detail dict
cached_detail = f'{{"CachedToken": {total_cached}, "GPU": {gpu_cache}, "CPU": {cpu_cache}}}'

# Print combined log with all required information
ttft = task.metrics.first_token_time if task.metrics.first_token_time else 0
llm_logger.info(
f"Request={task_id}, InputToken={task.prompt_token_ids_len}, "
f"CachedDetail={cached_detail}, OutputToken={self.tokens_counter[task_id]}, "
f"TokenRatio={token_ratio:.2f}, TTFT={ttft:.2f}, "
f"E2E={e2e_time:.2f}, IsPrefill={is_prefill}, RecoveryStop={recovery_stop}"
)

main_process_metrics.request_token_ratio.observe(token_ratio)
llm_logger.info(f"{self.resource_manager.info()}")
if self.cfg.speculative_config.method:
@@ -906,13 +924,30 @@ class TokenProcessor:
)
if recovery_stop:
result.error_msg = "Recover is not supported, the result is incomplete!"
llm_logger.info(
f"Request: {task_id} finished, number of "
f"generated tokens: {self.tokens_counter[task_id]}, token_id:{token_id},is_prefill:{is_prefill},recovery_stop:{recovery_stop}"
)

# Calculate statistics for the combined log
inference_start_time = task.metrics.get_inference_start_time(is_decode)
token_ratio = self.tokens_counter[task_id] / (time.time() - inference_start_time)
llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}")
task.metrics.cal_cost_time()
e2e_time = current_time - inference_start_time
token_ratio = self.tokens_counter[task_id] / e2e_time

# Get cache information
gpu_cache = getattr(task.metrics, "gpu_cache_token_num", 0)
cpu_cache = getattr(task.metrics, "cpu_cache_token_num", 0)
total_cached = gpu_cache + cpu_cache

# Build cached detail dict
cached_detail = f'{{"CachedToken": {total_cached}, "GPU": {gpu_cache}, "CPU": {cpu_cache}}}'

# Print combined log with all required information
ttft = task.metrics.first_token_time if task.metrics.first_token_time else 0
llm_logger.info(
f"Request={task_id}, InputToken={task.prompt_token_ids_len}, "
f"CachedDetail={cached_detail}, OutputToken={self.tokens_counter[task_id]}, "
f"TokenRatio={token_ratio:.2f}, TTFT={ttft:.2f}, "
f"E2E={e2e_time:.2f}, IsPrefill={is_prefill}, RecoveryStop={recovery_stop}"
)

main_process_metrics.request_token_ratio.observe(token_ratio)
llm_logger.info(f"{self.resource_manager.info()}")
if self.cfg.speculative_config.method:
Reference in New Issue
Block a user