[XPU]add enable_logprob (#5279)

* [XPU]Update document

* [XPU]Update documentation

* [XPU]add enable_logprob

* Fix code style issues

* "doc"

* "docs"

* "doc"

* Fix code style via pre-commit

---------

Co-authored-by: root <root@gajl-bbc-onlinec-com-1498354.gajl.baidu.com>
This commit is contained in:
qw86972190
2025-12-02 15:32:28 +08:00
committed by GitHub
parent c563eca791
commit 6048ea37bd
7 changed files with 315 additions and 14 deletions
@@ -20,6 +20,7 @@ import paddle
from fastdeploy import envs
from fastdeploy.model_executor.forward_meta import XPUForwardMeta
from fastdeploy.model_executor.layers.sample.sampler import Sampler
from fastdeploy.platforms import current_platform
from fastdeploy.worker.output import ModelOutputData
@@ -32,6 +33,7 @@ if current_platform.is_xpu():
limit_thinking_content_length_v1,
limit_thinking_content_length_v2,
save_output,
save_output_topk,
set_stop_value_multi_ends,
speculate_clear_accept_nums,
speculate_get_output_padding_offset,
@@ -210,7 +212,7 @@ def xpu_process_output(
def xpu_post_process_normal(
sampled_token_ids: paddle.Tensor,
sampler_output: Sampler,
model_output: ModelOutputData,
share_inputs: Dict[str, paddle.Tensor],
block_size: int = 64,
@@ -220,6 +222,8 @@ def xpu_post_process_normal(
) -> None:
""" """
sampled_token_ids = sampler_output.sampled_token_ids
if think_end_id > 0:
limit_strategy = envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR
max_think_lens = share_inputs["max_think_lens"]
@@ -310,12 +314,27 @@ def xpu_post_process_normal(
# 3. Transmit the model's output and stop generation signal via message queue.
# In the future, we will abandon this approach.
if not skip_save_output:
save_output(
sampled_token_ids,
model_output.not_need_stop,
model_output.mp_rank,
False, # use_ep
)
if sampler_output.logprobs_tensors is None:
save_output(
sampled_token_ids,
model_output.not_need_stop,
model_output.mp_rank,
False, # use_ep
)
else:
if save_output_topk is None:
raise ImportError(
"save_output_topk operator is not available. "
"Please rebuild the XPU operators with the new get_output_msg_with_topk.cc and save_output_msg_with_topk.cc files."
)
save_output_topk(
sampled_token_ids,
sampler_output.logprobs_tensors.logprob_token_ids,
sampler_output.logprobs_tensors.logprobs,
sampler_output.logprobs_tensors.selected_token_ranks,
model_output.not_need_stop,
model_output.mp_rank,
)
def xpu_post_process_specualate(