Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2026-04-23 00:17:25 +08:00
[Feature] Support for request-level speculative decoding metrics monitoring. (#5518)
* support spec metrics monitor per request
* fix bug
* remove debug log
* fix ut bugs
@@ -57,6 +57,7 @@ from fastdeploy.worker.output import (
     LogprobsLists,
     LogprobsTensors,
     PromptLogprobs,
+    SpeculateMetrics,
 )

 NONES = itertools.repeat(None)
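The hunk above only imports `SpeculateMetrics` from `fastdeploy.worker.output`; its definition is not part of this diff. As a rough, purely illustrative mental model (field names assumed, not taken from FastDeploy), a request-level speculative-decoding metrics record boils down to draft/accept counters:

```python
# Illustrative sketch only -- the real SpeculateMetrics lives in
# fastdeploy/worker/output.py and its actual fields are not shown in this diff.
from dataclasses import dataclass


@dataclass
class SpeculateMetricsSketch:
    num_draft_tokens: int      # tokens proposed by the draft model (assumed field)
    num_accepted_tokens: int   # draft tokens accepted by the target model (assumed field)

    @property
    def acceptance_rate(self) -> float:
        # Accepted / proposed, guarding against a zero denominator.
        return self.num_accepted_tokens / self.num_draft_tokens if self.num_draft_tokens else 0.0
```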
@@ -384,6 +385,8 @@ class OpenAIServingChat:
                     request.include_logprobs_decode_token,
                 )
+
+                output_speculate_metrics = res["metrics"].get("speculate_metrics", None)

                 delta_message = DeltaMessage(
                     reasoning_content="",
                     prompt_token_ids=None,
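Note the defensive `.get("speculate_metrics", None)`: when speculative decoding is disabled the key is simply absent and the per-chunk value stays `None`, so non-speculative requests keep working unchanged. A minimal sketch of the lookup pattern, with hypothetical `res` payloads:

```python
# Hypothetical response frames; only the lookup pattern mirrors the diff.
res_spec = {"metrics": {"speculate_metrics": {"num_draft_tokens": 8}}}
res_plain = {"metrics": {}}  # speculative decoding disabled

assert res_spec["metrics"].get("speculate_metrics", None) == {"num_draft_tokens": 8}
assert res_plain["metrics"].get("speculate_metrics", None) is None
```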
@@ -415,6 +418,7 @@ class OpenAIServingChat:
                     logprobs=logprobs_res,
                     draft_logprobs=draft_logprobs_res,
                     arrival_time=arrival_time,
+                    speculate_metrics=output_speculate_metrics,
                 )
                 if res["finished"]:
                     num_choices -= 1
@@ -537,6 +541,7 @@ class OpenAIServingChat:
             decoder_base_url=self.tokenizer_base_url,
         )
         prompt_logprobs_res_list = [[] for _ in range(num_choices)]
+        speculate_metrics = [None for _ in range(num_choices)]
         choices = []
         while num_choices > 0:
             if self.engine_client.check_model_weight_status():
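The added `speculate_metrics = [None for _ in range(num_choices)]` mirrors the `prompt_logprobs_res_list` line above it: one slot per choice for `n > 1` requests. For `None` a plain `[None] * num_choices` would behave identically, but the comprehension matches the neighboring line, where multiplication would be a real bug:

```python
# Why the comprehension style matters for the nested-list line above:
# `[[]] * n` aliases one inner list across all choices.
n = 2
aliased = [[]] * n
aliased[0].append("x")
assert aliased == [["x"], ["x"]]       # shared inner list -- the bug to avoid

independent = [[] for _ in range(n)]
independent[0].append("x")
assert independent == [["x"], []]      # per-choice state, as in the diff
```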
@@ -613,6 +618,7 @@ class OpenAIServingChat:
                         )
                         if prompt_logprobs_res:
                             prompt_logprobs_res_list[idx].extend(clamp_prompt_logprobs(prompt_logprobs_res))
+                        speculate_metrics[idx] = data["metrics"].get("speculate_metrics", None)
                         if data["finished"]:
                             num_choices -= 1
                             reasoning_num_tokens[idx] = data["outputs"].get("reasoning_token_num", 0)
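Inside this aggregation loop each frame for choice `idx` overwrites its slot, so what survives is the metrics object from the last frame of that choice (presumably cumulative for the whole request; the diff itself does not show how the engine fills it). A small sketch of that last-write-wins behavior, reusing the `request_id` parsing from the hunk further below:

```python
# Hypothetical frames; the overwrite semantics are what the diff implements.
speculate_metrics = [None, None]
frames = [
    {"request_id": "req_0", "metrics": {"speculate_metrics": "partial"}},
    {"request_id": "req_0", "metrics": {"speculate_metrics": "final"}},
]
for data in frames:
    idx = int(data["request_id"].split("_")[-1])
    speculate_metrics[idx] = data["metrics"].get("speculate_metrics", None)
assert speculate_metrics == ["final", None]  # last frame wins; choice 1 saw no frames
```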
@@ -635,6 +641,7 @@ class OpenAIServingChat:
                                 response_processor=response_processor,
                                 prompt_logprobs_res_list=prompt_logprobs_res_list,
                                 max_tokens=max_tokens,
+                                speculate_metrics=speculate_metrics[idx],
                             )
                             choices.append(choice)
         finally:
@@ -688,6 +695,7 @@ class OpenAIServingChat:
         prompt_logprobs_res_list: list,
         response_processor: ChatResponseProcessor,
         max_tokens: int,
+        speculate_metrics: SpeculateMetrics | None,
     ) -> ChatCompletionResponseChoice:
         idx = int(data["request_id"].split("_")[-1])
         output = data["outputs"]
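One detail worth flagging in the new parameter annotation: `SpeculateMetrics | None` is a PEP 604 union, which parses natively only on Python 3.10+. On 3.8/3.9 the same signature needs either `from __future__ import annotations` or `typing.Optional`; which route serving_chat.py takes is not visible in this diff.

```python
# Both spellings denote the same type; the future import makes the
# `X | None` form safe as an annotation on older interpreters too.
from __future__ import annotations

from typing import Optional


def _sketch(metrics: object | None = None) -> Optional[object]:
    return metrics
```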
@@ -745,6 +753,7 @@ class OpenAIServingChat:
             draft_logprobs=draft_logprobs_full_res,
             prompt_logprobs=prompt_logprobs_full_res,
             finish_reason=finish_reason,
+            speculate_metrics=speculate_metrics,
         )

     def _create_chat_logprobs(
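End to end, the feature threads one metrics object from the engine's per-frame `metrics` payload into the final `ChatCompletionResponseChoice`. A hypothetical client-side consumer, assuming the choice serializes the field under `speculate_metrics` with the counter names sketched earlier (none of these wire names are confirmed by the diff):

```python
# Hypothetical serialized choice; field names are assumptions for illustration.
choice = {
    "finish_reason": "stop",
    "speculate_metrics": {"num_draft_tokens": 40, "num_accepted_tokens": 30},
}
m = choice.get("speculate_metrics")
if m and m["num_draft_tokens"]:
    rate = m["num_accepted_tokens"] / m["num_draft_tokens"]
    print(f"draft acceptance rate: {rate:.0%}")  # -> 75%
```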