[BugFix] Fix inaccurate cache hit rate and TTFT after request preemption (#6620)

* [chore] add has_been_rescheduled flag for requests

* [refactor] rename reschedule to preempted for accuracy and fix cache hit metrics

* [chore] add ttft_s
This commit is contained in:
Yonghua Li
2026-03-05 16:25:02 +08:00
committed by GitHub
parent 326b9755aa
commit fa1906bd6f
5 changed files with 74 additions and 42 deletions
+4
View File
@@ -34,6 +34,8 @@ class LoggingEventName(Enum):
INFERENCE_END = "INFERENCE_END"
POSTPROCESSING_START = "POSTPROCESSING_START"
POSTPROCESSING_END = "POSTPROCESSING_END"
PREEMPTED = "PREEMPTED"
RESCHEDULED_INFERENCE_START = "RESCHEDULED_INFERENCE_START"
class StageName(Enum):
@@ -60,6 +62,8 @@ EVENT_TO_STAGE_MAP = {
LoggingEventName.INFERENCE_START: StageName.PREFILL,
LoggingEventName.FIRST_TOKEN_GENERATED: StageName.PREFILL,
LoggingEventName.DECODE_START: StageName.DECODE,
LoggingEventName.PREEMPTED: StageName.DECODE,
LoggingEventName.RESCHEDULED_INFERENCE_START: StageName.DECODE,
LoggingEventName.INFERENCE_END: StageName.DECODE,
LoggingEventName.POSTPROCESSING_START: StageName.POSTPROCESSING,
LoggingEventName.POSTPROCESSING_END: StageName.POSTPROCESSING,