Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2026-04-23 00:17:25 +08:00)
[Optimization] Unified data processing for online and offline (#6891)
* remove process_request
* fix chat
* fix unit test
* remove process_response
* fix unit test
* fix offline decode
* Potential fix for pull request finding
* fix sampling_params

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
+21 -14
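In broad strokes, the change keeps each request as a plain dict through preprocessing and only then builds the Request struct, so the online (chat) and offline (generate) entry points share one code path; the two hunks below show both sides of the switch. A minimal sketch of the new ordering, assuming hypothetical parameter names (processor, request_cls, max_model_len) in place of the engine's real attributes:

import time
from typing import Any, Dict

def submit_task(task: Dict[str, Any], processor, request_cls, max_model_len: int):
    """Sketch of the unified submission order; both online chat and offline
    generate now funnel through the same dict-based preprocessing hook."""
    # All normalization happens while the request is still a plain dict.
    task = processor.process_request_dict(task, max_model_len)
    # The Request struct is materialized only after processing.
    request = request_cls.from_dict(task)
    request.metrics.scheduler_recv_req_time = time.time()
    return request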
@@ -266,19 +266,24 @@ class LLMEngine:
         # TODO: confirm input/output lengths
         if sampling_params is not None:
-            task.update(asdict(sampling_params))
+            if sampling_params.temperature is not None and abs(sampling_params.temperature) < 1e-06:
+                sampling_params.temperature = 1e-06
+            task.update({k: v for k, v in asdict(sampling_params).items() if v is not None})
+
+        # Prepare chat_template_kwargs before calling process_request_dict
+        chat_template_kwargs = kwargs.get("chat_template_kwargs") or {}
+        chat_template_kwargs["chat_template"] = kwargs.get("chat_template")
+        task["chat_template_kwargs"] = chat_template_kwargs
+
+        # Use dict to call process_request_dict
+        task = self.engine.data_processor.process_request_dict(task, self.cfg.model_config.max_model_len)
+
+        # Create Request struct after processing
+        request = Request.from_dict(task)
+        request.metrics.scheduler_recv_req_time = time.time()
         llm_logger.info(f"Receive request {request}")
-        if sampling_params is not None:
-            if sampling_params.temperature is not None and abs(sampling_params.temperature) < 1e-06:
-                sampling_params.temperature = 1e-06
-            request.sampling_params = sampling_params
-        request.metrics.preprocess_start_time = time.time()
-        chat_template_kwargs = kwargs.get("chat_template_kwargs") or {}
-        chat_template_kwargs["chat_template"] = kwargs.get("chat_template")
-        kwargs["chat_template_kwargs"] = chat_template_kwargs
-        request = self.engine.data_processor.process_request(request, self.cfg.model_config.max_model_len, **kwargs)

         request.prompt_token_ids_len = len(request.prompt_token_ids)
         request.need_prefill_tokens = request.prompt_token_ids_len
         input_ids_len = request.prompt_token_ids_len
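One behavioral detail in this hunk: the old task.update(asdict(sampling_params)) copied every dataclass field, so fields the caller never set (serialized as None) could overwrite values already present in the task; the new comprehension merges only non-None fields, and the temperature clamp now runs before the merge so the clamped value is what lands in the dict. A self-contained illustration of the difference — this trimmed-down SamplingParams is an assumption, the real dataclass has many more fields:

from dataclasses import asdict, dataclass
from typing import Optional

# Trimmed-down stand-in; FastDeploy's real SamplingParams has many more fields.
@dataclass
class SamplingParams:
    temperature: Optional[float] = None
    top_p: Optional[float] = None

task = {"prompt": "hi", "top_p": 0.9}      # top_p already resolved upstream
params = SamplingParams(temperature=0.7)   # caller only set temperature

# Old behavior: unset fields overwrite existing values with None.
old = {**task, **asdict(params)}
assert old["top_p"] is None

# New behavior: only explicitly set fields are merged in.
new = {**task, **{k: v for k, v in asdict(params).items() if v is not None}}
assert new["top_p"] == 0.9 and new["temperature"] == 0.7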
@@ -716,16 +721,18 @@ class LLMEngine:
         for result in self._get_generated_tokens(req_id):
             is_end = result.finished
             if stream and not is_end:
-                processed = self.engine.data_processor.process_response(result)
-                if processed is None:
+                output = self.engine.data_processor.process_response_dict(
+                    result.to_dict(), stream=False, include_stop_str_in_output=False
+                )
+                if output is None:
                     continue
-                output = processed.to_dict()
                 yield output

             # Exit loop if termination condition is met
             if is_end:
-                processed = self.engine.data_processor.process_response(result)
-                output = processed.to_dict()
+                output = self.engine.data_processor.process_response_dict(
+                    result.to_dict(), stream=False, include_stop_str_in_output=False, direct_decode=not stream
+                )
                 llm_logger.debug(f"Generate result: {output}")
                 if not stream:
                     yield output
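Both response branches now call process_response_dict, with direct_decode=not stream marking the final chunk of an offline (non-streaming) request so it can be decoded in one shot. A runnable sketch of the control flow — FakeResult and FakeProcessor are illustrative stand-ins, not FastDeploy classes:

from dataclasses import dataclass
from typing import Any, Dict, Iterator, Optional

# Minimal fakes so the sketch runs standalone; the real objects come from the engine.
@dataclass
class FakeResult:
    text: str
    finished: bool
    def to_dict(self) -> Dict[str, Any]:
        return {"text": self.text, "finished": self.finished}

class FakeProcessor:
    def process_response_dict(self, d: Dict[str, Any], **kwargs) -> Optional[Dict[str, Any]]:
        return d or None  # real processor may return None for empty chunks

def generate(results, processor, stream: bool) -> Iterator[Dict[str, Any]]:
    for result in results:
        is_end = result.finished
        if stream and not is_end:
            output = processor.process_response_dict(
                result.to_dict(), stream=False, include_stop_str_in_output=False
            )
            if output is None:  # skip empty incremental chunks
                continue
            yield output
        if is_end:
            output = processor.process_response_dict(
                result.to_dict(), stream=False, include_stop_str_in_output=False,
                direct_decode=not stream,  # offline callers decode the final text in one shot
            )
            if not stream:  # this loop yields the final chunk only to non-streaming callers
                yield output

chunks = [FakeResult("Hel", False), FakeResult("Hello", True)]
print(list(generate(chunks, FakeProcessor(), stream=False)))  # one final output
print(list(generate(chunks, FakeProcessor(), stream=True)))   # incremental chunks only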