mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-24 01:29:57 +08:00
[Cherry-Pick][OP][Feature] 统一 limit_thinking_content_length CUDA 算子,支持回复长度限制与注入序列 (#6506)
* Initial plan

* feat: migrate core PR6493 changes to release 2.4

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* fix ci

* fix ci

* fix ci

* fix ci

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
This commit is contained in:
@@ -1019,41 +1019,28 @@ void SaveOutMmsgStatic(const paddle::Tensor& x,
|
||||
int64_t rank_id,
|
||||
bool save_each_rank);
|
||||
|
||||
void LimitThinkingContentLengthV1(const paddle::Tensor& next_tokens,
|
||||
const paddle::Tensor& max_think_lens,
|
||||
const paddle::Tensor& step_idx,
|
||||
const paddle::Tensor& limit_think_status,
|
||||
const paddle::Tensor& stop_flags,
|
||||
const paddle::Tensor& eos_token_ids,
|
||||
const int64_t think_end_id);
|
||||
void LimitThinkingContentLength(const paddle::Tensor& next_tokens,
|
||||
const paddle::Tensor& max_think_lens,
|
||||
const paddle::Tensor& max_reply_lens,
|
||||
const paddle::Tensor& step_idx,
|
||||
const paddle::Tensor& limit_status,
|
||||
const paddle::Tensor& stop_flags,
|
||||
const paddle::Tensor& eos_token_ids,
|
||||
const paddle::Tensor& inject_token_ids,
|
||||
const int64_t think_end_id,
|
||||
const bool splitwise_role_is_decode);
|
||||
|
||||
void LimitThinkingContentLengthV2(const paddle::Tensor& next_tokens,
|
||||
const paddle::Tensor& max_think_lens,
|
||||
const paddle::Tensor& step_idx,
|
||||
const paddle::Tensor& limit_think_status,
|
||||
const paddle::Tensor& stop_flags,
|
||||
const int64_t think_end_id,
|
||||
const int64_t line_break_id);
|
||||
|
||||
void SpeculateLimitThinkingContentLengthV1(
|
||||
const paddle::Tensor& next_tokens,
|
||||
const paddle::Tensor& max_think_lens,
|
||||
const paddle::Tensor& step_idx,
|
||||
const paddle::Tensor& limit_think_status,
|
||||
const paddle::Tensor& accept_num,
|
||||
const paddle::Tensor& stop_flags,
|
||||
const paddle::Tensor& eos_token_ids,
|
||||
const int64_t think_end_id);
|
||||
|
||||
void SpeculateLimitThinkingContentLengthV2(
|
||||
const paddle::Tensor& next_tokens,
|
||||
const paddle::Tensor& max_think_lens,
|
||||
const paddle::Tensor& step_idx,
|
||||
const paddle::Tensor& limit_think_status,
|
||||
const paddle::Tensor& accept_num,
|
||||
const paddle::Tensor& stop_flags,
|
||||
const int64_t think_end_id,
|
||||
const int64_t line_break_id);
|
||||
void SpeculateLimitThinkingContentLength(const paddle::Tensor& next_tokens,
|
||||
const paddle::Tensor& max_think_lens,
|
||||
const paddle::Tensor& max_reply_lens,
|
||||
const paddle::Tensor& step_idx,
|
||||
const paddle::Tensor& limit_status,
|
||||
const paddle::Tensor& accept_num,
|
||||
const paddle::Tensor& stop_flags,
|
||||
const paddle::Tensor& eos_token_ids,
|
||||
const paddle::Tensor& inject_token_ids,
|
||||
const int64_t think_end_id,
|
||||
const bool splitwise_role_is_decode);
|
||||
|
||||
void SpeculateGetLogits(const paddle::Tensor& draft_logits,
|
||||
const paddle::Tensor& next_token_num,
|
||||
@@ -1665,20 +1652,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
|
||||
m.def("save_output", &SaveOutMmsgStatic, "save_output function");
|
||||
|
||||
m.def("limit_thinking_content_length_v1",
|
||||
&LimitThinkingContentLengthV1,
|
||||
"limit_thinking_content_length_v1 function");
|
||||
m.def("limit_thinking_content_length",
|
||||
&LimitThinkingContentLength,
|
||||
"limit_thinking_content_length function");
|
||||
|
||||
m.def("limit_thinking_content_length_v2",
|
||||
&LimitThinkingContentLengthV2,
|
||||
"limit_thinking_content_length_v2 function");
|
||||
|
||||
m.def("speculate_limit_thinking_content_length_v1",
|
||||
&SpeculateLimitThinkingContentLengthV1,
|
||||
"speculate limit thinking content length function");
|
||||
|
||||
m.def("speculate_limit_thinking_content_length_v2",
|
||||
&SpeculateLimitThinkingContentLengthV2,
|
||||
m.def("speculate_limit_thinking_content_length",
|
||||
&SpeculateLimitThinkingContentLength,
|
||||
"speculate limit thinking content length function");
|
||||
|
||||
m.def("speculate_get_logits",
|
||||
|
||||
Reference in New Issue
Block a user