[BugFix] Fix inaccurate cache hit rate and TTFT (time to first token) after request preemption (#6620)

* [chore] add has_been_rescheduled flag for requests

* [refactor] rename reschedule to preempted for accuracy and fix cache hit metrics

* [chore] add ttft_s
This commit is contained in:
Yonghua Li
2026-03-05 16:25:02 +08:00
committed by GitHub
parent 326b9755aa
commit fa1906bd6f
5 changed files with 74 additions and 42 deletions
+44 -30
View File
@@ -509,17 +509,22 @@ class EngineService:
if not is_decode:
self.llm_logger.info(f"Tasks are sent to engine, req_ids={req_ids}")
for task in tasks:
task.metrics.inference_start_time = time.time()
tracing.trace_report_span(
tracing.TraceSpanName.SCHEDULE,
task.request_id.split("_")[0],
int(task.metrics.scheduler_recv_req_time * 1e9),
int(task.metrics.inference_start_time * 1e9),
thread_finish_flag=True,
)
trace_print(LoggingEventName.RESOURCE_ALLOCATE_END, task.request_id, getattr(task, "user", ""))
trace_print(LoggingEventName.REQUEST_SCHEDULE_END, task.request_id, getattr(task, "user", ""))
trace_print(LoggingEventName.INFERENCE_START, task.request_id, getattr(task, "user", ""))
if not getattr(task, "has_been_preempted_before", False):
task.metrics.inference_start_time = time.time()
tracing.trace_report_span(
tracing.TraceSpanName.SCHEDULE,
task.request_id.split("_")[0],
int(task.metrics.scheduler_recv_req_time * 1e9),
int(task.metrics.inference_start_time * 1e9),
thread_finish_flag=True,
)
trace_print(LoggingEventName.RESOURCE_ALLOCATE_END, task.request_id, getattr(task, "user", ""))
trace_print(LoggingEventName.REQUEST_SCHEDULE_END, task.request_id, getattr(task, "user", ""))
trace_print(LoggingEventName.INFERENCE_START, task.request_id, getattr(task, "user", ""))
else:
trace_print(
LoggingEventName.RESCHEDULED_INFERENCE_START, task.request_id, getattr(task, "user", "")
)
if not is_prefill:
if not self.cfg.model_config.enable_mm:
self.update_requests_chunk_size(tasks)
@@ -1022,28 +1027,37 @@ class EngineService:
for task in tasks:
if task.task_type == RequestType.PREFILL:
rid = task.request_id.split("_")[0]
trace_carrier = task.trace_carrier
tracing.trace_set_proc_propagate_context(rid, trace_carrier)
trace_carrier = tracing.trace_get_proc_propagate_context(rid)
task.trace_carrier = trace_carrier
tracing.trace_report_span(
tracing.TraceSpanName.SCHEDULE,
rid,
int(task.metrics.scheduler_recv_req_time * 1e9),
int(time.time() * 1e9),
thread_finish_flag=True,
)
trace_print(
LoggingEventName.RESOURCE_ALLOCATE_END, task.request_id, getattr(task, "user", "")
)
trace_print(
LoggingEventName.REQUEST_SCHEDULE_END, task.request_id, getattr(task, "user", "")
)
trace_print(LoggingEventName.INFERENCE_START, task.request_id, getattr(task, "user", ""))
if isinstance(task, Request) and task.has_been_preempted_before:
trace_print(
LoggingEventName.RESCHEDULED_INFERENCE_START,
task.request_id,
getattr(task, "user", ""),
)
else:
trace_carrier = task.trace_carrier
tracing.trace_set_proc_propagate_context(rid, trace_carrier)
trace_carrier = tracing.trace_get_proc_propagate_context(rid)
task.trace_carrier = trace_carrier
tracing.trace_report_span(
tracing.TraceSpanName.SCHEDULE,
rid,
int(task.metrics.scheduler_recv_req_time * 1e9),
int(time.time() * 1e9),
thread_finish_flag=True,
)
trace_print(
LoggingEventName.RESOURCE_ALLOCATE_END, task.request_id, getattr(task, "user", "")
)
trace_print(
LoggingEventName.REQUEST_SCHEDULE_END, task.request_id, getattr(task, "user", "")
)
trace_print(
LoggingEventName.INFERENCE_START, task.request_id, getattr(task, "user", "")
)
if isinstance(task, Request):
if self.cfg.scheduler_config.splitwise_role == "decode":
task.metrics.decode_inference_start_time = time.time()
else:
elif not task.has_been_preempted_before:
task.metrics.inference_start_time = time.time()
self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz))