mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 17:11:21 +08:00
[Executor]CUDAGraph support Speculate Decode (#3769)
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* success run ngram * Revert "[Code Simplification] remove cum_offsets (#3410)" This reverts commit 32b39620bc. * success run ngram5 tp4 42bs * success run ngram5 tp4 42bs * mtp draft commit * add decorator for target model * enable draft model in cudagraph v0.5 * revert revert cum_offset * enable target model in cudagraph v0.9 and clean debug code * Revert "success run ngram" This reverts commit 8351e83993. * add reverted code * enable target model in cudagraph v0.9 * solve comment * fix bid < 0 * Enable Target Model Padding And Draft Model in cudagraph * solve problem * delete rebuild padding debug note * fast compile * Add capture list for mtp * success run 256 tp1 mtp * Enable Lite TP2 Bsz256 * really enable tp2 bsz 256 * fix problem * Solve problem for Draft model in cudagraph * Solve comment * replace empty tensor with zeros * Solve comments * Revert "fast compile" This reverts commit 834639a7ff. * fix bug * fix merge bug * fix typo * fix bug --------- Co-authored-by: lizexu <2694294196@qq.com> Co-authored-by: littledgg <1658565283@qq.com> Co-authored-by: zeroRains <linjunlu@zerorains.top> Co-authored-by: gongshaotian <gstain5555@outlook.com>
This commit is contained in:
@@ -121,7 +121,7 @@ class CudaGraphPiecewiseBackend:
|
||||
entry.num_finished_warmup += 1
|
||||
entry.runnable(**kwargs)
|
||||
logger.debug(
|
||||
f"[CUDA GRAPH] Warm up for batch size {entry.real_shape}, "
|
||||
f"[CUDA GRAPH][ID:{id(self)}] Warm up for batch size {entry.real_shape}, "
|
||||
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
|
||||
)
|
||||
|
||||
@@ -148,15 +148,15 @@ class CudaGraphPiecewiseBackend:
|
||||
real_shape = ids_remove_padding.shape[0]
|
||||
padding_real_shape = self.real_shape_to_captured_size[real_shape]
|
||||
logger.debug(
|
||||
f"[CUDA GRAPH] The actual real shape obtained by CUDAGraph is :{real_shape}, "
|
||||
f"The padded shape is :{padding_real_shape}"
|
||||
f"[CUDA GRAPH][ID:{id(self)}] The actual real shape obtained by CUDAGraph is :{real_shape}, "
|
||||
f"The padded shape is :{padding_real_shape}, If Padding :{real_shape != padding_real_shape}"
|
||||
)
|
||||
|
||||
entry = self.concrete_size_entries.get(padding_real_shape)
|
||||
assert entry is not None, f"real shape:{padding_real_shape} is not in cuda graph capture list."
|
||||
if entry.runnable is None:
|
||||
entry.runnable = self.runnable
|
||||
logger.debug(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")
|
||||
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] New entry lazy initialize with real shape {padding_real_shape}")
|
||||
|
||||
if not entry.use_cudagraph:
|
||||
return entry.runnable(**kwargs)
|
||||
@@ -171,7 +171,7 @@ class CudaGraphPiecewiseBackend:
|
||||
entry.num_finished_warmup += 1
|
||||
entry.runnable(**kwargs)
|
||||
logger.debug(
|
||||
f"[CUDA GRAPH] Warm up for real shape {padding_real_shape}, "
|
||||
f"[CUDA GRAPH][ID:{id(self)}] Warm up for real shape {padding_real_shape}, "
|
||||
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
|
||||
)
|
||||
|
||||
@@ -206,11 +206,11 @@ class CudaGraphPiecewiseBackend:
|
||||
|
||||
# For CUDAGraph debug
|
||||
# self._save_cudagrpah_dot_files(entry)
|
||||
logger.debug(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")
|
||||
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
|
||||
|
||||
# Replay
|
||||
entry.cuda_graph.replay()
|
||||
logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
|
||||
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph replayed for real shape {padding_real_shape}")
|
||||
if len(entry.output_buffers) == 1:
|
||||
return entry.output_buffers[0]
|
||||
return entry.output_buffers
|
||||
@@ -223,18 +223,19 @@ class CudaGraphPiecewiseBackend:
|
||||
for shape in self.cudagraph_capture_sizes:
|
||||
self.concrete_size_entries[shape] = ConcreteSizeEntry(real_shape=shape)
|
||||
|
||||
logger.info(
|
||||
f"[CUDA GRAPH] CUDAGraph capture list {self.cudagraph_capture_sizes}, " "Created all real shape entry."
|
||||
logger.debug(
|
||||
f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph capture list {self.cudagraph_capture_sizes}, "
|
||||
"Created all real shape entry."
|
||||
)
|
||||
|
||||
def clear_graph(self):
|
||||
""" """
|
||||
# Clear graphs
|
||||
custom_ar_clear_ipc_handles()
|
||||
for id, entry in self.concrete_size_entries.items():
|
||||
for _id, entry in self.concrete_size_entries.items():
|
||||
if entry.cuda_graph:
|
||||
del entry.cuda_graph
|
||||
logger.debug(f"[CUDA GRAPH] The CUDAGraph with shape {id} has been cleared.")
|
||||
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] The CUDAGraph with shape {_id} has been cleared.")
|
||||
|
||||
del self.concrete_size_entries
|
||||
paddle.device.cuda.empty_cache()
|
||||
|
||||
Reference in New Issue
Block a user