mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[XPU][Graph Optimization] XPU Support CUDAGraph (#6152)
* support cuda graph
This commit is contained in:
@@ -105,6 +105,7 @@ def xpu_pre_process(
|
||||
seq_lens_decoder: Optional[paddle.Tensor] = None,
|
||||
is_profiling: bool = False,
|
||||
forward_meta=None,
|
||||
use_cudagraph=False,
|
||||
) -> XPUForwardMeta:
|
||||
""" """
|
||||
max_len = input_ids.shape[1]
|
||||
@@ -152,7 +153,6 @@ def xpu_pre_process(
|
||||
cu_seqlens_k,
|
||||
) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time)
|
||||
|
||||
share_inputs["ids_remove_padding"] = None # set this after adjust batch
|
||||
share_inputs["cum_offsets"] = cum_offsets
|
||||
share_inputs["batch_id_per_token"] = batch_id_per_token
|
||||
share_inputs["cu_seqlens_q"] = cu_seqlens_q
|
||||
@@ -221,11 +221,18 @@ def xpu_pre_process(
|
||||
|
||||
adjusted_input = adjusted_input.squeeze(1)
|
||||
|
||||
share_inputs["ids_remove_padding"] = adjusted_input
|
||||
share_inputs["ids_remove_padding"].copy_(adjusted_input, False)
|
||||
xpu_forward_meta.ids_remove_padding = adjusted_input
|
||||
# Set forward_meta.is_profiling to True to skip init_kv_signal_per_query for attention backends
|
||||
xpu_forward_meta.is_profiling = is_profiling
|
||||
return xpu_forward_meta
|
||||
if use_cudagraph:
|
||||
if forward_meta is None:
|
||||
return xpu_forward_meta
|
||||
else:
|
||||
forward_meta.copy_from(xpu_forward_meta)
|
||||
return forward_meta
|
||||
else:
|
||||
return xpu_forward_meta
|
||||
|
||||
|
||||
def xpu_process_output(
|
||||
|
||||
Reference in New Issue
Block a user