mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[XPU] fix(xpu_model_runner): reset seq_lens_encoder to 0 for decode role in PD splitwise mode (#6048)
* fix(xpu_model_runner): reset seq_lens_encoder to 0 for decode role in PD splitwise mode
  - Set seq_lens_encoder to 0 when splitwise_role is 'decode' during prefill processing
  - This ensures proper continuation of decoding after P generates the first token in the PD disaggregated architecture
  - Fixes potential sequence length inconsistency in PD splitwise deployment scenarios
* format
This commit is contained in:
@@ -565,6 +565,10 @@ class XPUModelRunner(ModelRunnerBase):
|
||||
len(request.output_token_ids) if prefill_end_index >= len(input_ids) else 0
|
||||
)
|
||||
self.share_inputs["pre_ids"][idx : idx + 1] = -1
|
||||
if (
|
||||
self.fd_config.scheduler_config.splitwise_role == "decode"
|
||||
): # In PD, we continue to decode after P generate first token
|
||||
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0
|
||||
has_prefill_task = True
|
||||
elif request.task_type.value == RequestType.DECODE.value: # decode task
|
||||
logger.debug(f"Handle decode request {request} at idx {idx}")
|
||||
|
||||
Reference in New Issue
Block a user