[Executor]CUDAGraph support Speculate Decode (#3769)
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled

* success run ngram

* Revert "[Code Simplification] remove cum_offsets (#3410)"

This reverts commit 32b39620bc.

* success run ngram5 tp4 42bs

* success run ngram5 tp4 42bs

* mtp draft commit

* add decorator for target model

* enable draft model in cudagraph v0.5

* revert revert of cum_offset

* enable target model in cudagraph v0.9 And clean debug code

* Revert "success run ngram"

This reverts commit 8351e83993.

* add reverted code

* enable target model in cudagraph v0.9

* solve comment

* fix bid < 0

* Enable Target Model Padding And Draft Model in cudagraph

* solve problem

* delete rebuild padding debug note

* fast compile

* Add capture list for mtp

* success run 256 tp1 mtp

* Enable Lite TP2 Bsz256

* really enable tp2 bsz 256

* fix problem

* Solve problem for Draft model in cudagraph

* Solve comment

* replace empty tensor with zeros

* Solve comments

* Revert "fast compile"

This reverts commit 834639a7ff.

* fix bug

* fix merge bug

* fix typo

* fix bug

---------

Co-authored-by: lizexu <2694294196@qq.com>
Co-authored-by: littledgg <1658565283@qq.com>
Co-authored-by: zeroRains <linjunlu@zerorains.top>
Co-authored-by: gongshaotian <gstain5555@outlook.com>
This commit is contained in:
RAM
2025-10-09 21:18:29 +08:00
committed by GitHub
parent 7b1689f437
commit aa27b03bc0
19 changed files with 250 additions and 139 deletions
@@ -121,7 +121,7 @@ class CudaGraphPiecewiseBackend:
entry.num_finished_warmup += 1
entry.runnable(**kwargs)
logger.debug(
f"[CUDA GRAPH] Warm up for batch size {entry.real_shape}, "
f"[CUDA GRAPH][ID:{id(self)}] Warm up for batch size {entry.real_shape}, "
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
)
@@ -148,15 +148,15 @@ class CudaGraphPiecewiseBackend:
real_shape = ids_remove_padding.shape[0]
padding_real_shape = self.real_shape_to_captured_size[real_shape]
logger.debug(
f"[CUDA GRAPH] The actual real shape obtained by CUDAGraph is :{real_shape}, "
f"The padded shape is :{padding_real_shape}"
f"[CUDA GRAPH][ID:{id(self)}] The actual real shape obtained by CUDAGraph is :{real_shape}, "
f"The padded shape is :{padding_real_shape}, If Padding :{real_shape != padding_real_shape}"
)
entry = self.concrete_size_entries.get(padding_real_shape)
assert entry is not None, f"real shape:{padding_real_shape} is not in cuda graph capture list."
if entry.runnable is None:
entry.runnable = self.runnable
logger.debug(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] New entry lazy initialize with real shape {padding_real_shape}")
if not entry.use_cudagraph:
return entry.runnable(**kwargs)
@@ -171,7 +171,7 @@ class CudaGraphPiecewiseBackend:
entry.num_finished_warmup += 1
entry.runnable(**kwargs)
logger.debug(
f"[CUDA GRAPH] Warm up for real shape {padding_real_shape}, "
f"[CUDA GRAPH][ID:{id(self)}] Warm up for real shape {padding_real_shape}, "
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
)
@@ -206,11 +206,11 @@ class CudaGraphPiecewiseBackend:
# For CUDAGraph debug
# self._save_cudagrpah_dot_files(entry)
logger.debug(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
# Replay
entry.cuda_graph.replay()
logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph replayed for real shape {padding_real_shape}")
if len(entry.output_buffers) == 1:
return entry.output_buffers[0]
return entry.output_buffers
@@ -223,18 +223,19 @@ class CudaGraphPiecewiseBackend:
for shape in self.cudagraph_capture_sizes:
self.concrete_size_entries[shape] = ConcreteSizeEntry(real_shape=shape)
logger.info(
f"[CUDA GRAPH] CUDAGraph capture list {self.cudagraph_capture_sizes}, " "Created all real shape entry."
logger.debug(
f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph capture list {self.cudagraph_capture_sizes}, "
"Created all real shape entry."
)
def clear_graph(self):
""" """
# Clear graphs
custom_ar_clear_ipc_handles()
for id, entry in self.concrete_size_entries.items():
for _id, entry in self.concrete_size_entries.items():
if entry.cuda_graph:
del entry.cuda_graph
logger.debug(f"[CUDA GRAPH] The CUDAGraph with shape {id} has been cleared.")
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] The CUDAGraph with shape {_id} has been cleared.")
del self.concrete_size_entries
paddle.device.cuda.empty_cache()