[OP][Feature] Unify the limit_thinking_content_length CUDA op to support response length limits and injected sequences (#6493)

* Initial plan

* Migrate PRs #6311, #6129, #6305 to develop and merge unit tests

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* fix

* update

* fix

* fix ci

* fix ci

* Initial plan

* test: add test_chat_with_response_max_tokens to test_EB_VL_Lite_serving.py

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* test: add disable-thinking case to test_chat_with_response_max_tokens

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* test: add both reasoning_max_tokens and response_max_tokens case

Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>

* fix ci

* fix ci

* fix ci

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com>
Authored by Yuanle Liu on 2026-02-25 21:36:50 +08:00, committed by GitHub
Parent: e18397134a
Commit: 6d3fede240
38 changed files with 771 additions and 1690 deletions
+11 -2
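The request fields exercised by the tests above (test_chat_with_response_max_tokens in test_EB_VL_Lite_serving.py) and consumed by the processor changes below can be sketched as an OpenAI-compatible chat call. This is a hedged illustration, not the project's test code: the port, the model name, and the use of extra_body to carry the non-standard fields are assumptions; the field names themselves come from this commit.

```python
# Hedged sketch of the kind of request the new tests exercise; not the
# project's test code. Port, model name, and extra_body transport are
# assumptions; the field names (max_tokens, response_max_tokens,
# reasoning_max_tokens, chat_template_kwargs / enable_thinking) appear
# in this commit.
import openai

client = openai.OpenAI(base_url="http://localhost:8188/v1", api_key="null")

resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Briefly introduce ERNIE."}],
    max_tokens=256,
    extra_body={
        "response_max_tokens": 64,  # cap on the visible reply
        # "reasoning_max_tokens": 128,  # combined case from the third test bullet
        "chat_template_kwargs": {"enable_thinking": False},  # disable-thinking case
    },
)
print(resp.choices[0].message.content)
```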
@@ -137,8 +137,11 @@ class Ernie4_5Processor(BaseDataProcessor):
         # truncate prompts that exceed the length limit
         if max_model_len is not None and len(request.prompt_token_ids) > max_model_len:
             request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1]
+        max_tokens = max_model_len - len(request.prompt_token_ids)
         if request.get("max_tokens") is None:
-            request.set("max_tokens", max(1, max_model_len - len(request.prompt_token_ids)))
+            request.set("max_tokens", max(1, max_tokens))
+        else:
+            request.set("max_tokens", min(max_tokens, request.get("max_tokens")))
         if request.get("temperature") < _SAMPLING_EPS:
             # zero temperature is equivalent to greedy sampling
             request.set("temperature", 1)
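The hunk above changes max_tokens handling from "fill in a default only when unset" to a two-sided clamp against the remaining context window. A minimal standalone sketch of that rule, with a helper name of my own rather than anything from the diff:

```python
from typing import Optional

def clamp_max_tokens(prompt_len: int, max_model_len: int, requested: Optional[int]) -> int:
    """Clamp rule from the hunk above: never exceed the tokens left in the context window."""
    budget = max_model_len - prompt_len
    if requested is None:
        return max(1, budget)      # unset: default to whatever fits, but at least 1
    return min(budget, requested)  # set: cap the client's value at the remaining budget

# With a 4096-token window and a 4000-token prompt, an explicit max_tokens=512
# is silently reduced to the 96 tokens that actually fit.
assert clamp_max_tokens(4000, 4096, 512) == 96
```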
@@ -211,6 +214,7 @@ class Ernie4_5Processor(BaseDataProcessor):
                             request[k] = v
                 else:
                     raise ValueError("Invalid input: chat_template_kwargs must be a dict")
+            request.setdefault("enable_thinking", True)
             request["prompt_token_ids"] = self.messages2ids(request, **chat_template_kwargs)
         else:
             raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
@@ -221,8 +225,11 @@ class Ernie4_5Processor(BaseDataProcessor):
         # truncate prompts that exceed the length limit
         if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len:
             request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1]
+        max_tokens = max_model_len - len(request["prompt_token_ids"])
         if request.get("max_tokens") is None:
-            request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
+            request["max_tokens"] = max(1, max_tokens)
+        else:
+            request["max_tokens"] = min(max_tokens, request["max_tokens"])
         if request.get("temperature") < _SAMPLING_EPS:
             # zero temperature is equivalent to greedy sampling
             request["temperature"] = 1
@@ -241,6 +248,8 @@ class Ernie4_5Processor(BaseDataProcessor):
         else:
             self.model_status_dict[request["request_id"]] = model_status
             request["enable_thinking"] = model_status == "think_start"
+        if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
+            request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
         data_processor_logger.info(f"Processed request dict: {request}")
         return request
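Finally, once the model status has fixed enable_thinking, a client-supplied response_max_tokens tightens max_tokens further, but only when thinking is disabled; with thinking enabled, the reply-length cap is presumably enforced later by the limit_thinking_content_length op named in the title. A small sketch of the resulting budget, with plain values rather than the real request dict:

```python
from typing import Optional

def final_max_tokens(max_tokens: int, response_max_tokens: Optional[int], enable_thinking: bool) -> int:
    """Sketch of the last hunk: response_max_tokens only bites when thinking is off."""
    if response_max_tokens is not None and enable_thinking is False:
        return min(response_max_tokens, max_tokens)
    return max_tokens

# Thinking disabled: the visible reply is capped directly in the processor.
assert final_max_tokens(max_tokens=1024, response_max_tokens=64, enable_thinking=False) == 64
# Thinking enabled: max_tokens is left alone here; the reply cap is presumably
# applied later, on the GPU side, by limit_thinking_content_length.
assert final_max_tokens(max_tokens=1024, response_max_tokens=64, enable_thinking=True) == 1024
```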