mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Speculative Decoding] split draft_tokens into standalone post-processing path (#5205)
* refactor(mtp): split draft_tokens into standalone post-processing path for MTP + logprobs
* Restore Request.__repr__ implementation
* ci
* add envs
* fix unittest
This commit is contained in:
@@ -571,6 +571,7 @@ class OpenAIServingChat:
|
||||
num_input_video_tokens=num_input_video_tokens,
|
||||
num_image_tokens=num_image_tokens,
|
||||
logprob_contents=logprob_contents,
|
||||
draft_logprob_contents=draft_logprob_contents,
|
||||
response_processor=response_processor,
|
||||
max_tokens=max_tokens,
|
||||
)
|
||||
@@ -622,6 +623,7 @@ class OpenAIServingChat:
|
||||
num_input_video_tokens: list,
|
||||
num_image_tokens: list,
|
||||
logprob_contents: list,
|
||||
draft_logprob_contents: list,
|
||||
response_processor: ChatResponseProcessor,
|
||||
max_tokens: int,
|
||||
) -> ChatCompletionResponseChoice:
|
||||
@@ -649,6 +651,9 @@ class OpenAIServingChat:
|
||||
logprobs_full_res = None
|
||||
if logprob_contents[idx]:
|
||||
logprobs_full_res = LogProbs(content=logprob_contents[idx])
|
||||
draft_logprobs_full_res = None
|
||||
if draft_logprob_contents[idx]:
|
||||
draft_logprobs_full_res = LogProbs(content=draft_logprob_contents[idx])
|
||||
|
||||
num_cached_tokens[idx] = data.get("num_cached_tokens", 0)
|
||||
num_input_image_tokens[idx] = data.get("num_input_image_tokens", 0)
|
||||
@@ -669,6 +674,7 @@ class OpenAIServingChat:
|
||||
index=idx,
|
||||
message=message,
|
||||
logprobs=logprobs_full_res,
|
||||
draft_logprobs=draft_logprobs_full_res,
|
||||
finish_reason=finish_reason,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user