[Feature] 统一请求完成日志格式并增强统计信息 (#6405)

将原来分散的两行日志合并为一行,同时增加更多统计信息展示。

主要变更:
- 整合原有的 "Request finished" 和 "token ratio" 两行日志为单行格式
- 新增 InputToken:输入token数量
- 新增 CachedDetail:缓存详情(包含CachedToken/GPU/CPU)
- 新增 OutputToken:输出token数量
- 新增 TTFT(Time To First Token):首Token时延(秒)
- 新增 E2E(End-to-End):整句时延(秒)
- 保留 IsPrefill 和 RecoveryStop 标志

新日志格式示例:
Request=chatcmpl-xxx, InputToken=18, CachedDetail={"CachedToken": 0, "GPU": 0, "CPU": 0}, OutputToken=247, TokenRatio=315.77, TTFT=0.02, E2E=0.78, IsPrefill=False, RecoveryStop=False

Co-authored-by: Ducc <ducc@baidu.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Jiang-Jia-Jun
2026-02-09 21:06:55 +08:00
committed by GitHub
parent 5bfc0938e2
commit 4e06df520e
+46 -11
View File
@@ -230,13 +230,31 @@ class TokenProcessor:
result.finished = True
if recovery_stop:
result.error_msg = "Recover is not supported, the result is incomplete!"
llm_logger.info(
f"Request: {task_id} finished, number of " f"generated tokens: {self.tokens_counter[task_id]}."
)
# Calculate statistics for the combined log
is_decode = self.cfg.scheduler_config.splitwise_role == "decode"
inference_start_time = task.metrics.get_inference_start_time(is_decode)
token_ratio = self.tokens_counter[task_id] / (time.time() - inference_start_time)
llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}")
task.metrics.cal_cost_time()
e2e_time = current_time - inference_start_time
token_ratio = self.tokens_counter[task_id] / e2e_time
# Get cache information
gpu_cache = getattr(task.metrics, "gpu_cache_token_num", 0)
cpu_cache = getattr(task.metrics, "cpu_cache_token_num", 0)
total_cached = gpu_cache + cpu_cache
# Build cached detail dict
cached_detail = f'{{"CachedToken": {total_cached}, "GPU": {gpu_cache}, "CPU": {cpu_cache}}}'
# Print combined log with all required information
ttft = task.metrics.first_token_time if task.metrics.first_token_time else 0
llm_logger.info(
f"Request={task_id}, InputToken={task.prompt_token_ids_len}, "
f"CachedDetail={cached_detail}, OutputToken={self.tokens_counter[task_id]}, "
f"TokenRatio={token_ratio:.2f}, TTFT={ttft:.2f}, "
f"E2E={e2e_time:.2f}, IsPrefill={is_prefill}, RecoveryStop={recovery_stop}"
)
main_process_metrics.request_token_ratio.observe(token_ratio)
llm_logger.info(f"{self.resource_manager.info()}")
if self.cfg.speculative_config.method:
@@ -906,13 +924,30 @@ class TokenProcessor:
)
if recovery_stop:
result.error_msg = "Recover is not supported, the result is incomplete!"
llm_logger.info(
f"Request: {task_id} finished, number of "
f"generated tokens: {self.tokens_counter[task_id]}, token_id:{token_id},is_prefill:{is_prefill},recovery_stop:{recovery_stop}"
)
# Calculate statistics for the combined log
inference_start_time = task.metrics.get_inference_start_time(is_decode)
token_ratio = self.tokens_counter[task_id] / (time.time() - inference_start_time)
llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}")
task.metrics.cal_cost_time()
e2e_time = current_time - inference_start_time
token_ratio = self.tokens_counter[task_id] / e2e_time
# Get cache information
gpu_cache = getattr(task.metrics, "gpu_cache_token_num", 0)
cpu_cache = getattr(task.metrics, "cpu_cache_token_num", 0)
total_cached = gpu_cache + cpu_cache
# Build cached detail dict
cached_detail = f'{{"CachedToken": {total_cached}, "GPU": {gpu_cache}, "CPU": {cpu_cache}}}'
# Print combined log with all required information
ttft = task.metrics.first_token_time if task.metrics.first_token_time else 0
llm_logger.info(
f"Request={task_id}, InputToken={task.prompt_token_ids_len}, "
f"CachedDetail={cached_detail}, OutputToken={self.tokens_counter[task_id]}, "
f"TokenRatio={token_ratio:.2f}, TTFT={ttft:.2f}, "
f"E2E={e2e_time:.2f}, IsPrefill={is_prefill}, RecoveryStop={recovery_stop}"
)
main_process_metrics.request_token_ratio.observe(token_ratio)
llm_logger.info(f"{self.resource_manager.info()}")
if self.cfg.speculative_config.method: