mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Feature] 统一请求完成日志格式并增强统计信息 (#6405)
将原来分散的两行日志合并为一行,同时增加更多统计信息展示。
主要变更:
- 整合原有的 "Request finished" 和 "token ratio" 两行日志为单行格式
- 新增 InputToken:输入token数量
- 新增 CachedDetail:缓存详情(包含CachedToken/GPU/CPU)
- 新增 OutputToken:输出token数量
- 新增 TTFT:首Token时延(秒)
- 新增 E2E:整句时延(秒)
- 保留 IsPrefill 和 RecoveryStop 标志
新日志格式示例:
Request=chatcmpl-xxx, InputToken=18, CachedDetail={"CachedToken": 0, "GPU": 0, "CPU": 0}, OutputToken=247, TokenRatio=315.77, TTFT=0.02, E2E=0.78, IsPrefill=False, RecoveryStop=False
Co-authored-by: Ducc <ducc@baidu.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -230,13 +230,31 @@ class TokenProcessor:
result.finished = True
if recovery_stop:
result.error_msg = "Recover is not supported, the result is incomplete!"
llm_logger.info(
f"Request: {task_id} finished, number of " f"generated tokens: {self.tokens_counter[task_id]}."
)

# Calculate statistics for the combined log
is_decode = self.cfg.scheduler_config.splitwise_role == "decode"
inference_start_time = task.metrics.get_inference_start_time(is_decode)
token_ratio = self.tokens_counter[task_id] / (time.time() - inference_start_time)
llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}")
task.metrics.cal_cost_time()
e2e_time = current_time - inference_start_time
token_ratio = self.tokens_counter[task_id] / e2e_time

# Get cache information
gpu_cache = getattr(task.metrics, "gpu_cache_token_num", 0)
cpu_cache = getattr(task.metrics, "cpu_cache_token_num", 0)
total_cached = gpu_cache + cpu_cache

# Build cached detail dict
cached_detail = f'{{"CachedToken": {total_cached}, "GPU": {gpu_cache}, "CPU": {cpu_cache}}}'

# Print combined log with all required information
ttft = task.metrics.first_token_time if task.metrics.first_token_time else 0
llm_logger.info(
f"Request={task_id}, InputToken={task.prompt_token_ids_len}, "
f"CachedDetail={cached_detail}, OutputToken={self.tokens_counter[task_id]}, "
f"TokenRatio={token_ratio:.2f}, TTFT={ttft:.2f}, "
f"E2E={e2e_time:.2f}, IsPrefill={is_prefill}, RecoveryStop={recovery_stop}"
)

main_process_metrics.request_token_ratio.observe(token_ratio)
llm_logger.info(f"{self.resource_manager.info()}")
if self.cfg.speculative_config.method:
@@ -906,13 +924,30 @@ class TokenProcessor:
)
if recovery_stop:
result.error_msg = "Recover is not supported, the result is incomplete!"
llm_logger.info(
f"Request: {task_id} finished, number of "
f"generated tokens: {self.tokens_counter[task_id]}, token_id:{token_id},is_prefill:{is_prefill},recovery_stop:{recovery_stop}"
)

# Calculate statistics for the combined log
inference_start_time = task.metrics.get_inference_start_time(is_decode)
token_ratio = self.tokens_counter[task_id] / (time.time() - inference_start_time)
llm_logger.info(f"Request: {task_id} token ratio: {token_ratio}")
task.metrics.cal_cost_time()
e2e_time = current_time - inference_start_time
token_ratio = self.tokens_counter[task_id] / e2e_time

# Get cache information
gpu_cache = getattr(task.metrics, "gpu_cache_token_num", 0)
cpu_cache = getattr(task.metrics, "cpu_cache_token_num", 0)
total_cached = gpu_cache + cpu_cache

# Build cached detail dict
cached_detail = f'{{"CachedToken": {total_cached}, "GPU": {gpu_cache}, "CPU": {cpu_cache}}}'

# Print combined log with all required information
ttft = task.metrics.first_token_time if task.metrics.first_token_time else 0
llm_logger.info(
f"Request={task_id}, InputToken={task.prompt_token_ids_len}, "
f"CachedDetail={cached_detail}, OutputToken={self.tokens_counter[task_id]}, "
f"TokenRatio={token_ratio:.2f}, TTFT={ttft:.2f}, "
f"E2E={e2e_time:.2f}, IsPrefill={is_prefill}, RecoveryStop={recovery_stop}"
)

main_process_metrics.request_token_ratio.observe(token_ratio)
llm_logger.info(f"{self.resource_manager.info()}")
if self.cfg.speculative_config.method:
Reference in New Issue
Block a user