[OP][Feature] 统一 limit_thinking_content_length CUDA 算子,支持回复长度限制与注入序列 (#6493)

* Initial plan

* Migrate PRs #6311, #6129, #6305 to develop and merge unit tests

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* fix

* update

* fix

* fix ci

* fix ci

* Initial plan

* test: add test_chat_with_response_max_tokens to test_EB_VL_Lite_serving.py

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* test: add disable-thinking case to test_chat_with_response_max_tokens

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* test: add both reasoning_max_tokens and response_max_tokens case

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* fix ci

* fix ci

* fix ci

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
This commit is contained in:
Yuanle Liu
2026-02-25 21:36:50 +08:00
committed by GitHub
parent e18397134a
commit 6d3fede240
38 changed files with 771 additions and 1690 deletions
+27 -12
View File
@@ -744,15 +744,30 @@ class GPUModelRunner(ModelRunnerBase):
length = prefill_end_index - prefill_start_index
if not self.is_pooling_model:
if request.get("enable_thinking") is not None:
self.share_inputs["enable_thinking"][idx : idx + 1, :] = request.get("enable_thinking")
if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
# Enable thinking
self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens")
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
else:
# Disable thinking
self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
enable_thinking = bool(request.get("enable_thinking"))
logger.debug(f"request {request.request_id} with {enable_thinking=} at idx {idx}")
self.share_inputs["enable_thinking"][idx : idx + 1, :] = enable_thinking
if enable_thinking:
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
if request.get("reasoning_max_tokens") is not None:
# Enable thinking
self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get(
"reasoning_max_tokens"
)
else:
self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
if request.get("response_max_tokens") is not None:
                            # Limit overall response length
self.share_inputs["max_reply_lens"][idx : idx + 1, :] = request.get(
"response_max_tokens"
)
else:
self.share_inputs["max_reply_lens"][idx : idx + 1, :] = -1
else:
# Disable thinking
self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
self.share_inputs["max_reply_lens"][idx : idx + 1, :] = -1
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
if isinstance(request.prompt_token_ids, np.ndarray):
prompt_token_ids = request.prompt_token_ids.tolist()
@@ -1744,7 +1759,7 @@ class GPUModelRunner(ModelRunnerBase):
skip_save_output=True,
async_output_queue=self.async_output_queue,
think_end_id=self.model_config.think_end_id,
line_break_id=self.model_config.line_break_id,
splitwise_role_is_decode=self.scheduler_config.splitwise_role == "decode",
)
self.exist_prefill_flag = False
return pooler_output
@@ -1849,7 +1864,7 @@ class GPUModelRunner(ModelRunnerBase):
skip_save_output=True,
async_output_queue=self.async_output_queue,
think_end_id=self.model_config.think_end_id,
line_break_id=self.model_config.line_break_id,
splitwise_role_is_decode=self.scheduler_config.splitwise_role == "decode",
enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0,
)
self.exist_prefill_flag = False
@@ -2508,7 +2523,7 @@ class GPUModelRunner(ModelRunnerBase):
skip_save_output=skip_save_output,
async_output_queue=self.async_output_queue,
think_end_id=self.model_config.think_end_id,
line_break_id=self.model_config.line_break_id,
splitwise_role_is_decode=self.scheduler_config.splitwise_role == "decode",
enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0,
)