mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[BugFix] Fix inaccurate cache hit rate and TTFT after request preemption (#6620)
* [chore] add has_been_rescheduled flag for requests * [refactor] rename reschedule to preempted for accuracy and fix cache hit metrics * [chore] add ttft_s
This commit is contained in:
@@ -509,17 +509,22 @@ class EngineService:
|
||||
if not is_decode:
|
||||
self.llm_logger.info(f"Tasks are sent to engine, req_ids={req_ids}")
|
||||
for task in tasks:
|
||||
task.metrics.inference_start_time = time.time()
|
||||
tracing.trace_report_span(
|
||||
tracing.TraceSpanName.SCHEDULE,
|
||||
task.request_id.split("_")[0],
|
||||
int(task.metrics.scheduler_recv_req_time * 1e9),
|
||||
int(task.metrics.inference_start_time * 1e9),
|
||||
thread_finish_flag=True,
|
||||
)
|
||||
trace_print(LoggingEventName.RESOURCE_ALLOCATE_END, task.request_id, getattr(task, "user", ""))
|
||||
trace_print(LoggingEventName.REQUEST_SCHEDULE_END, task.request_id, getattr(task, "user", ""))
|
||||
trace_print(LoggingEventName.INFERENCE_START, task.request_id, getattr(task, "user", ""))
|
||||
if not getattr(task, "has_been_preempted_before", False):
|
||||
task.metrics.inference_start_time = time.time()
|
||||
tracing.trace_report_span(
|
||||
tracing.TraceSpanName.SCHEDULE,
|
||||
task.request_id.split("_")[0],
|
||||
int(task.metrics.scheduler_recv_req_time * 1e9),
|
||||
int(task.metrics.inference_start_time * 1e9),
|
||||
thread_finish_flag=True,
|
||||
)
|
||||
trace_print(LoggingEventName.RESOURCE_ALLOCATE_END, task.request_id, getattr(task, "user", ""))
|
||||
trace_print(LoggingEventName.REQUEST_SCHEDULE_END, task.request_id, getattr(task, "user", ""))
|
||||
trace_print(LoggingEventName.INFERENCE_START, task.request_id, getattr(task, "user", ""))
|
||||
else:
|
||||
trace_print(
|
||||
LoggingEventName.RESCHEDULED_INFERENCE_START, task.request_id, getattr(task, "user", "")
|
||||
)
|
||||
if not is_prefill:
|
||||
if not self.cfg.model_config.enable_mm:
|
||||
self.update_requests_chunk_size(tasks)
|
||||
@@ -1022,28 +1027,37 @@ class EngineService:
|
||||
for task in tasks:
|
||||
if task.task_type == RequestType.PREFILL:
|
||||
rid = task.request_id.split("_")[0]
|
||||
trace_carrier = task.trace_carrier
|
||||
tracing.trace_set_proc_propagate_context(rid, trace_carrier)
|
||||
trace_carrier = tracing.trace_get_proc_propagate_context(rid)
|
||||
task.trace_carrier = trace_carrier
|
||||
tracing.trace_report_span(
|
||||
tracing.TraceSpanName.SCHEDULE,
|
||||
rid,
|
||||
int(task.metrics.scheduler_recv_req_time * 1e9),
|
||||
int(time.time() * 1e9),
|
||||
thread_finish_flag=True,
|
||||
)
|
||||
trace_print(
|
||||
LoggingEventName.RESOURCE_ALLOCATE_END, task.request_id, getattr(task, "user", "")
|
||||
)
|
||||
trace_print(
|
||||
LoggingEventName.REQUEST_SCHEDULE_END, task.request_id, getattr(task, "user", "")
|
||||
)
|
||||
trace_print(LoggingEventName.INFERENCE_START, task.request_id, getattr(task, "user", ""))
|
||||
if isinstance(task, Request) and task.has_been_preempted_before:
|
||||
trace_print(
|
||||
LoggingEventName.RESCHEDULED_INFERENCE_START,
|
||||
task.request_id,
|
||||
getattr(task, "user", ""),
|
||||
)
|
||||
else:
|
||||
trace_carrier = task.trace_carrier
|
||||
tracing.trace_set_proc_propagate_context(rid, trace_carrier)
|
||||
trace_carrier = tracing.trace_get_proc_propagate_context(rid)
|
||||
task.trace_carrier = trace_carrier
|
||||
tracing.trace_report_span(
|
||||
tracing.TraceSpanName.SCHEDULE,
|
||||
rid,
|
||||
int(task.metrics.scheduler_recv_req_time * 1e9),
|
||||
int(time.time() * 1e9),
|
||||
thread_finish_flag=True,
|
||||
)
|
||||
trace_print(
|
||||
LoggingEventName.RESOURCE_ALLOCATE_END, task.request_id, getattr(task, "user", "")
|
||||
)
|
||||
trace_print(
|
||||
LoggingEventName.REQUEST_SCHEDULE_END, task.request_id, getattr(task, "user", "")
|
||||
)
|
||||
trace_print(
|
||||
LoggingEventName.INFERENCE_START, task.request_id, getattr(task, "user", "")
|
||||
)
|
||||
if isinstance(task, Request):
|
||||
if self.cfg.scheduler_config.splitwise_role == "decode":
|
||||
task.metrics.decode_inference_start_time = time.time()
|
||||
else:
|
||||
elif not task.has_been_preempted_before:
|
||||
task.metrics.inference_start_time = time.time()
|
||||
self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user