[OP][Feature] 统一 limit_thinking_content_length CUDA 算子,支持回复长度限制与注入序列 (#6493)

* Initial plan

* Migrate PRs #6311, #6129, #6305 to develop and merge unit tests

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* fix

* update

* fix

* fix ci

* fix ci

* Initial plan

* test: add test_chat_with_response_max_tokens to test_EB_VL_Lite_serving.py

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* test: add disable-thinking case to test_chat_with_response_max_tokens

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* test: add both reasoning_max_tokens and response_max_tokens case

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* fix ci

* fix ci

* fix ci

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
This commit is contained in:
Yuanle Liu
2026-02-25 21:36:50 +08:00
committed by GitHub
parent e18397134a
commit 6d3fede240
38 changed files with 771 additions and 1690 deletions
+27 -12
View File
@@ -744,15 +744,30 @@ class GPUModelRunner(ModelRunnerBase):
length = prefill_end_index - prefill_start_index
if not self.is_pooling_model:
if request.get("enable_thinking") is not None:
self.share_inputs["enable_thinking"][idx : idx + 1, :] = request.get("enable_thinking")
if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
# Enable thinking
self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens")
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
else:
# Disable thinking
self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
enable_thinking = bool(request.get("enable_thinking"))
logger.debug(f"request {request.request_id} with {enable_thinking=} at idx {idx}")
self.share_inputs["enable_thinking"][idx : idx + 1, :] = enable_thinking
if enable_thinking:
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
if request.get("reasoning_max_tokens") is not None:
# Enable thinking
self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get(
"reasoning_max_tokens"
)
else:
self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
if request.get("response_max_tokens") is not None:
                            # Limit overall response length
self.share_inputs["max_reply_lens"][idx : idx + 1, :] = request.get(
"response_max_tokens"
)
else:
self.share_inputs["max_reply_lens"][idx : idx + 1, :] = -1
else:
# Disable thinking
self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
self.share_inputs["max_reply_lens"][idx : idx + 1, :] = -1
self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
if isinstance(request.prompt_token_ids, np.ndarray):
prompt_token_ids = request.prompt_token_ids.tolist()
@@ -1744,7 +1759,7 @@ class GPUModelRunner(ModelRunnerBase):
skip_save_output=True,
async_output_queue=self.async_output_queue,
think_end_id=self.model_config.think_end_id,
line_break_id=self.model_config.line_break_id,
splitwise_role_is_decode=self.scheduler_config.splitwise_role == "decode",
)
self.exist_prefill_flag = False
return pooler_output
@@ -1849,7 +1864,7 @@ class GPUModelRunner(ModelRunnerBase):
skip_save_output=True,
async_output_queue=self.async_output_queue,
think_end_id=self.model_config.think_end_id,
line_break_id=self.model_config.line_break_id,
splitwise_role_is_decode=self.scheduler_config.splitwise_role == "decode",
enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0,
)
self.exist_prefill_flag = False
@@ -2508,7 +2523,7 @@ class GPUModelRunner(ModelRunnerBase):
skip_save_output=skip_save_output,
async_output_queue=self.async_output_queue,
think_end_id=self.model_config.think_end_id,
line_break_id=self.model_config.line_break_id,
splitwise_role_is_decode=self.scheduler_config.splitwise_role == "decode",
enable_entropy=self.enable_entropy and self.parallel_config.tensor_parallel_rank == 0,
)