[Cherry-Pick][OP][Feature] 统一 limit_thinking_content_length CUDA 算子,支持回复长度限制与注入序列 (#6506)

* Initial plan

* feat: migrate core PR6493 changes to release 2.4

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* fix ci

* fix ci

* fix ci

* fix ci

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
This commit is contained in:
Yuanle Liu
2026-02-26 10:02:01 +08:00
committed by GitHub
parent 2bd6263f82
commit 2b79d971f1
27 changed files with 721 additions and 1660 deletions
+26 -47
View File
@@ -1019,41 +1019,28 @@ void SaveOutMmsgStatic(const paddle::Tensor& x,
int64_t rank_id,
bool save_each_rank);
// Forward declaration of the V1 thinking-length limiter op; the definition is
// not part of this section.
//
// Takes six tensors by const reference (next_tokens, max_think_lens, step_idx,
// limit_think_status, stop_flags, eos_token_ids) plus the scalar token id
// `think_end_id`, and returns nothing.
//
// NOTE(review): the void return suggests results are written into one or more
// of the tensor arguments in place — confirm against the kernel implementation.
// NOTE(review): compared with the V2 declaration in this file, V1 takes
// `eos_token_ids` and has no `line_break_id`.
void LimitThinkingContentLengthV1(const paddle::Tensor& next_tokens,
const paddle::Tensor& max_think_lens,
const paddle::Tensor& step_idx,
const paddle::Tensor& limit_think_status,
const paddle::Tensor& stop_flags,
const paddle::Tensor& eos_token_ids,
const int64_t think_end_id);
// Forward declaration of the unversioned thinking-length limiter op; the
// definition is not part of this section.
//
// Relative to the V1 declaration, the signature additionally takes
// `max_reply_lens`, `inject_token_ids` and `splitwise_role_is_decode`, and the
// status tensor is named `limit_status` rather than `limit_think_status`.
//
// Parameters (all tensors are passed by const reference):
//   next_tokens, max_think_lens, max_reply_lens, step_idx, limit_status,
//   stop_flags, eos_token_ids, inject_token_ids
//   think_end_id             - scalar token id (int64_t)
//   splitwise_role_is_decode - boolean flag
//
// NOTE(review): presumably `max_reply_lens` caps the reply length and
// `inject_token_ids` is the token sequence injected when a limit is hit —
// confirm against the kernel implementation.
// NOTE(review): the void return suggests in-place updates of the tensor
// arguments — confirm.
void LimitThinkingContentLength(const paddle::Tensor& next_tokens,
const paddle::Tensor& max_think_lens,
const paddle::Tensor& max_reply_lens,
const paddle::Tensor& step_idx,
const paddle::Tensor& limit_status,
const paddle::Tensor& stop_flags,
const paddle::Tensor& eos_token_ids,
const paddle::Tensor& inject_token_ids,
const int64_t think_end_id,
const bool splitwise_role_is_decode);
// Forward declaration of the V2 thinking-length limiter op; the definition is
// not part of this section.
//
// Takes five tensors by const reference (next_tokens, max_think_lens, step_idx,
// limit_think_status, stop_flags) plus two scalar token ids, `think_end_id` and
// `line_break_id`, and returns nothing.
//
// NOTE(review): compared with the V1 declaration in this file, V2 has no
// `eos_token_ids` tensor and adds `line_break_id`.
// NOTE(review): the void return suggests in-place updates of the tensor
// arguments — confirm against the kernel implementation.
void LimitThinkingContentLengthV2(const paddle::Tensor& next_tokens,
const paddle::Tensor& max_think_lens,
const paddle::Tensor& step_idx,
const paddle::Tensor& limit_think_status,
const paddle::Tensor& stop_flags,
const int64_t think_end_id,
const int64_t line_break_id);
// Forward declaration of the V1 "speculate" thinking-length limiter op; the
// definition is not part of this section.
//
// Same parameter list as LimitThinkingContentLengthV1 with one extra tensor,
// `accept_num`, inserted before `stop_flags`. Returns nothing.
//
// NOTE(review): `accept_num` is presumably the per-sequence count of accepted
// draft tokens in speculative decoding — confirm against the kernel.
void SpeculateLimitThinkingContentLengthV1(
const paddle::Tensor& next_tokens,
const paddle::Tensor& max_think_lens,
const paddle::Tensor& step_idx,
const paddle::Tensor& limit_think_status,
const paddle::Tensor& accept_num,
const paddle::Tensor& stop_flags,
const paddle::Tensor& eos_token_ids,
const int64_t think_end_id);
// Forward declaration of the V2 "speculate" thinking-length limiter op; the
// definition is not part of this section.
//
// Same parameter list as LimitThinkingContentLengthV2 with one extra tensor,
// `accept_num`, inserted before `stop_flags`. Returns nothing.
//
// NOTE(review): `accept_num` is presumably the per-sequence count of accepted
// draft tokens in speculative decoding — confirm against the kernel.
void SpeculateLimitThinkingContentLengthV2(
const paddle::Tensor& next_tokens,
const paddle::Tensor& max_think_lens,
const paddle::Tensor& step_idx,
const paddle::Tensor& limit_think_status,
const paddle::Tensor& accept_num,
const paddle::Tensor& stop_flags,
const int64_t think_end_id,
const int64_t line_break_id);
// Forward declaration of the unversioned "speculate" thinking-length limiter
// op; the definition is not part of this section.
//
// Same parameter list as LimitThinkingContentLength with one extra tensor,
// `accept_num`, inserted before `stop_flags`.
//
// Parameters (all tensors are passed by const reference):
//   next_tokens, max_think_lens, max_reply_lens, step_idx, limit_status,
//   accept_num, stop_flags, eos_token_ids, inject_token_ids
//   think_end_id             - scalar token id (int64_t)
//   splitwise_role_is_decode - boolean flag
//
// NOTE(review): `accept_num` is presumably the per-sequence count of accepted
// draft tokens in speculative decoding — confirm against the kernel.
// NOTE(review): the void return suggests in-place updates of the tensor
// arguments — confirm.
void SpeculateLimitThinkingContentLength(const paddle::Tensor& next_tokens,
const paddle::Tensor& max_think_lens,
const paddle::Tensor& max_reply_lens,
const paddle::Tensor& step_idx,
const paddle::Tensor& limit_status,
const paddle::Tensor& accept_num,
const paddle::Tensor& stop_flags,
const paddle::Tensor& eos_token_ids,
const paddle::Tensor& inject_token_ids,
const int64_t think_end_id,
const bool splitwise_role_is_decode);
void SpeculateGetLogits(const paddle::Tensor& draft_logits,
const paddle::Tensor& next_token_num,
@@ -1665,20 +1652,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
// Python bindings registered on module object `m`. Each m.def call takes the
// Python-visible function name, the address of the C++ function, and a
// docstring.

// Exposes SaveOutMmsgStatic as `save_output`.
m.def("save_output", &SaveOutMmsgStatic, "save_output function");
// Exposes LimitThinkingContentLengthV1 as `limit_thinking_content_length_v1`.
m.def("limit_thinking_content_length_v1",
&LimitThinkingContentLengthV1,
"limit_thinking_content_length_v1 function");
// Exposes LimitThinkingContentLength as `limit_thinking_content_length`.
m.def("limit_thinking_content_length",
&LimitThinkingContentLength,
"limit_thinking_content_length function");
// Exposes LimitThinkingContentLengthV2 as `limit_thinking_content_length_v2`.
m.def("limit_thinking_content_length_v2",
&LimitThinkingContentLengthV2,
"limit_thinking_content_length_v2 function");
// Exposes SpeculateLimitThinkingContentLengthV1 as
// `speculate_limit_thinking_content_length_v1`; note the docstring carries no
// version suffix.
m.def("speculate_limit_thinking_content_length_v1",
&SpeculateLimitThinkingContentLengthV1,
"speculate limit thinking content length function");
// NOTE(review): the next registration has no docstring argument or closing
// parenthesis in this view before another m.def begins — it looks like this
// text interleaves two versions of the file; confirm which lines are current.
m.def("speculate_limit_thinking_content_length_v2",
&SpeculateLimitThinkingContentLengthV2,
// Exposes SpeculateLimitThinkingContentLength as
// `speculate_limit_thinking_content_length`.
m.def("speculate_limit_thinking_content_length",
&SpeculateLimitThinkingContentLength,
"speculate limit thinking content length function");
m.def("speculate_get_logits",