mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 08:21:53 +08:00
[OP][Feature] 统一 limit_thinking_content_length CUDA 算子,支持回复长度限制与注入序列 (#6493)
* Initial plan * Migrate PRs #6311, #6129, #6305 to develop and merge unit tests Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com> * fix * update * fix * fix ci * fix ci * Initial plan * test: add test_chat_with_response_max_tokens to test_EB_VL_Lite_serving.py Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com> * test: add disable-thinking case to test_chat_with_response_max_tokens Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com> * test: add both reasoning_max_tokens and response_max_tokens case Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com> * fix ci * fix ci * fix ci --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
This commit is contained in:
@@ -73,6 +73,7 @@ class SamplingParams:
        can complete the sequence.
    max_tokens: Maximum number of tokens to generate per output sequence.
    reasoning_max_tokens: Maximum number of tokens to generate for reasoning per output sequence.
    response_max_tokens: Maximum number of tokens to generate for response per output sequence.
    min_tokens: Minimum number of tokens to generate per output sequence
        before EOS or stop_token_ids can be generated
    logprobs: Number of log probabilities to return per output token.
@@ -99,6 +100,7 @@ class SamplingParams:
    stop_seqs_len: Optional[int] = None
    max_tokens: Optional[int] = None
    reasoning_max_tokens: Optional[int] = None
    response_max_tokens: Optional[int] = None
    min_tokens: int = 1
    logprobs: Optional[int] = None
    prompt_logprobs: Optional[int] = None
@@ -176,6 +178,11 @@ class SamplingParams:
                if getattr(req, "reasoning_max_tokens", None) is not None
                else cls.reasoning_max_tokens
            ),
            response_max_tokens=(
                getattr(req, "response_max_tokens", None)
                if getattr(req, "response_max_tokens", None) is not None
                else cls.response_max_tokens
            ),
            min_tokens=(
                getattr(req, "min_tokens", None) if getattr(req, "min_tokens", None) is not None else cls.min_tokens
            ),
@@ -232,6 +239,7 @@ class SamplingParams:
        stop_token_ids=None,
        max_tokens=None,
        reasoning_max_tokens=None,
        response_max_tokens=None,
        min_tokens=1,
        logprobs=None,
        prompt_logprobs=None,
@@ -256,6 +264,7 @@ class SamplingParams:
            stop_token_ids=stop_token_ids,
            max_tokens=max_tokens if max_tokens is not None else 8192,
            reasoning_max_tokens=reasoning_max_tokens,
            response_max_tokens=response_max_tokens,
            min_tokens=min_tokens,
            logprobs=logprobs,
            prompt_logprobs=prompt_logprobs,
@@ -298,6 +307,7 @@ class SamplingParams:

        if self.reasoning_max_tokens is not None and self.reasoning_max_tokens > self.max_tokens:
            self.reasoning_max_tokens = self.max_tokens
        # response_max_tokens TODO

        if self.min_tokens < 0:
            raise ValueError(f"min_tokens must be greater than or equal to 0, " f"got {self.min_tokens}.")
Reference in New Issue
Block a user