[Feature] Support for request-level speculative decoding metrics monitoring. (#5518)

* Support speculative decoding metrics monitoring per request

* Fix bug

* Remove debug log

* Fix unit test bugs
Author: GoldPancake, 2025-12-12 12:22:18 +08:00 (committed by GitHub)
parent 3c1f7b85a4
commit 909059c60a
11 changed files with 154 additions and 47 deletions
@@ -57,6 +57,7 @@ from fastdeploy.worker.output import (
LogprobsLists,
LogprobsTensors,
PromptLogprobs,
SpeculateMetrics,
)
NONES = itertools.repeat(None)
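The hunk above imports SpeculateMetrics from fastdeploy.worker.output; the diff does not show that class's definition. Purely for orientation, the sketch below illustrates what a per-request speculative decoding metrics payload typically carries. It is a hypothetical stand-in, and every field name in it is an assumption, not something taken from this commit.

from dataclasses import dataclass

@dataclass
class SpeculateMetricsSketch:
    """Hypothetical stand-in for fastdeploy.worker.output.SpeculateMetrics.

    The real class lives in the FastDeploy worker code and is not shown
    in this diff; these fields are illustrative assumptions only.
    """

    num_draft_tokens: int = 0      # tokens proposed by the draft model
    num_accepted_tokens: int = 0   # draft tokens the target model accepted

    @property
    def acceptance_rate(self) -> float:
        # Fraction of drafted tokens that were accepted; the max() guards
        # against division by zero when no tokens were drafted.
        return self.num_accepted_tokens / max(self.num_draft_tokens, 1)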
@@ -384,6 +385,8 @@ class OpenAIServingChat:
request.include_logprobs_decode_token,
)
output_speculate_metrics = res["metrics"].get("speculate_metrics", None)
delta_message = DeltaMessage(
reasoning_content="",
prompt_token_ids=None,
@@ -415,6 +418,7 @@ class OpenAIServingChat:
logprobs=logprobs_res,
draft_logprobs=draft_logprobs_res,
arrival_time=arrival_time,
speculate_metrics=output_speculate_metrics,
)
if res["finished"]:
num_choices -= 1
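In the streaming path, the two hunks above read the metrics defensively out of each result frame and attach them to the outgoing DeltaMessage, so workers that never emit "speculate_metrics" still stream normally with a None value. A minimal sketch of that extraction pattern, assuming only the res dict shape visible in the hunks:

def extract_speculate_metrics(res: dict):
    """Return the per-request speculative metrics if the worker emitted them.

    Mirrors the .get("speculate_metrics", None) call in the hunk above:
    a frame without the key yields None instead of raising KeyError.
    """
    metrics = res.get("metrics") or {}
    return metrics.get("speculate_metrics", None)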
@@ -537,6 +541,7 @@ class OpenAIServingChat:
decoder_base_url=self.tokenizer_base_url,
)
prompt_logprobs_res_list = [[] for _ in range(num_choices)]
speculate_metrics = [None for _ in range(num_choices)]
choices = []
while num_choices > 0:
if self.engine_client.check_model_weight_status():
@@ -613,6 +618,7 @@ class OpenAIServingChat:
)
if prompt_logprobs_res:
prompt_logprobs_res_list[idx].extend(clamp_prompt_logprobs(prompt_logprobs_res))
speculate_metrics[idx] = data["metrics"].get("speculate_metrics", None)
if data["finished"]:
num_choices -= 1
reasoning_num_tokens[idx] = data["outputs"].get("reasoning_token_num", 0)
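The non-streaming path keeps one metrics slot per choice and overwrites that slot on every frame for the choice's index, so the value handed to the choice builder is the last snapshot seen before the request finished. A condensed sketch of the pattern, with the frame shape assumed from the hunks above (the "<req>_<idx>" request_id convention does appear later in this diff):

def collect_per_choice_metrics(frames, num_choices: int) -> list:
    """Keep the latest speculative-metrics snapshot for each choice index."""
    slots = [None] * num_choices
    for frame in frames:
        # request_id ends in the choice index, matching the split used
        # by the serving code in the final hunk of this diff.
        idx = int(frame["request_id"].split("_")[-1])
        slots[idx] = frame.get("metrics", {}).get("speculate_metrics", None)
    return slots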
@@ -635,6 +641,7 @@ class OpenAIServingChat:
response_processor=response_processor,
prompt_logprobs_res_list=prompt_logprobs_res_list,
max_tokens=max_tokens,
speculate_metrics=speculate_metrics[idx],
)
choices.append(choice)
finally:
@@ -688,6 +695,7 @@ class OpenAIServingChat:
prompt_logprobs_res_list: list,
response_processor: ChatResponseProcessor,
max_tokens: int,
speculate_metrics: SpeculateMetrics | None,
) -> ChatCompletionResponseChoice:
idx = int(data["request_id"].split("_")[-1])
output = data["outputs"]
@@ -745,6 +753,7 @@ class OpenAIServingChat:
draft_logprobs=draft_logprobs_full_res,
prompt_logprobs=prompt_logprobs_full_res,
finish_reason=finish_reason,
speculate_metrics=speculate_metrics,
)
def _create_chat_logprobs(
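With the field threaded through to ChatCompletionResponseChoice, a caller of the serving layer can inspect speculation efficiency per finished request. A hypothetical read-side sketch; it assumes the attribute keeps this name on the response object and makes no claims about the fields inside the metrics value:

def log_speculation_metrics(response) -> None:
    """Print per-choice speculative decoding metrics when present.

    `response` is a ChatCompletionResponse-like object; choices without
    metrics (e.g. speculative decoding disabled) are skipped.
    """
    for choice in response.choices:
        metrics = getattr(choice, "speculate_metrics", None)
        if metrics is None:
            continue  # this choice carried no speculative metrics
        print(f"choice {choice.index}: {metrics}")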