[Cherry-Pick][OP][Feature] 统一 limit_thinking_content_length CUDA 算子，支持回复长度限制与注入序列 (#6506)

* Initial plan
* feat: migrate core PR6493 changes to release 2.4

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* fix ci
* fix ci
* fix ci
* fix ci

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
2026-04-24 01:29:57 +08:00 · 2026-02-26 10:02:01 +08:00
parent 2bd6263f82
commit 2b79d971f1
27 changed files with 721 additions and 1660 deletions
@@ -1019,41 +1019,28 @@ void SaveOutMmsgStatic(const paddle::Tensor& x,
                       int64_t rank_id,
                       bool save_each_rank);

-void LimitThinkingContentLengthV1(const paddle::Tensor& next_tokens,
-                                  const paddle::Tensor& max_think_lens,
-                                  const paddle::Tensor& step_idx,
-                                  const paddle::Tensor& limit_think_status,
-                                  const paddle::Tensor& stop_flags,
-                                  const paddle::Tensor& eos_token_ids,
-                                  const int64_t think_end_id);
+void LimitThinkingContentLength(const paddle::Tensor& next_tokens,
+                                const paddle::Tensor& max_think_lens,
+                                const paddle::Tensor& max_reply_lens,
+                                const paddle::Tensor& step_idx,
+                                const paddle::Tensor& limit_status,
+                                const paddle::Tensor& stop_flags,
+                                const paddle::Tensor& eos_token_ids,
+                                const paddle::Tensor& inject_token_ids,
+                                const int64_t think_end_id,
+                                const bool splitwise_role_is_decode);

-void LimitThinkingContentLengthV2(const paddle::Tensor& next_tokens,
-                                  const paddle::Tensor& max_think_lens,
-                                  const paddle::Tensor& step_idx,
-                                  const paddle::Tensor& limit_think_status,
-                                  const paddle::Tensor& stop_flags,
-                                  const int64_t think_end_id,
-                                  const int64_t line_break_id);
-
-void SpeculateLimitThinkingContentLengthV1(
-    const paddle::Tensor& next_tokens,
-    const paddle::Tensor& max_think_lens,
-    const paddle::Tensor& step_idx,
-    const paddle::Tensor& limit_think_status,
-    const paddle::Tensor& accept_num,
-    const paddle::Tensor& stop_flags,
-    const paddle::Tensor& eos_token_ids,
-    const int64_t think_end_id);
-
-void SpeculateLimitThinkingContentLengthV2(
-    const paddle::Tensor& next_tokens,
-    const paddle::Tensor& max_think_lens,
-    const paddle::Tensor& step_idx,
-    const paddle::Tensor& limit_think_status,
-    const paddle::Tensor& accept_num,
-    const paddle::Tensor& stop_flags,
-    const int64_t think_end_id,
-    const int64_t line_break_id);
+void SpeculateLimitThinkingContentLength(const paddle::Tensor& next_tokens,
+                                         const paddle::Tensor& max_think_lens,
+                                         const paddle::Tensor& max_reply_lens,
+                                         const paddle::Tensor& step_idx,
+                                         const paddle::Tensor& limit_status,
+                                         const paddle::Tensor& accept_num,
+                                         const paddle::Tensor& stop_flags,
+                                         const paddle::Tensor& eos_token_ids,
+                                         const paddle::Tensor& inject_token_ids,
+                                         const int64_t think_end_id,
+                                         const bool splitwise_role_is_decode);

 void SpeculateGetLogits(const paddle::Tensor& draft_logits,
                        const paddle::Tensor& next_token_num,
@@ -1665,20 +1652,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {

  m.def("save_output", &SaveOutMmsgStatic, "save_output function");

-  m.def("limit_thinking_content_length_v1",
-        &LimitThinkingContentLengthV1,
-        "limit_thinking_content_length_v1 function");
+  m.def("limit_thinking_content_length",
+        &LimitThinkingContentLength,
+        "limit_thinking_content_length function");

-  m.def("limit_thinking_content_length_v2",
-        &LimitThinkingContentLengthV2,
-        "limit_thinking_content_length_v2 function");
-
-  m.def("speculate_limit_thinking_content_length_v1",
-        &SpeculateLimitThinkingContentLengthV1,
-        "speculate limit thinking content length function");
-
-  m.def("speculate_limit_thinking_content_length_v2",
-        &SpeculateLimitThinkingContentLengthV2,
+  m.def("speculate_limit_thinking_content_length",
+        &SpeculateLimitThinkingContentLength,
        "speculate limit thinking content length function");

  m.def("speculate_get_logits",