[Executor]CUDAGraph support Speculate Decode (#3769)
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled

* success run ngram

* Revert "[Code Simplification] remove cum_offsets (#3410)"

This reverts commit 32b39620bc.

* success run ngram5 tp4 42bs

* success run ngram5 tp4 42bs

* mtp draft commit

* add decorator for target model

* enable draft model in cudagraph v0.5

* revert revert of cum_offset

* enable target model in cudagraph v0.9 And clean debug code

* Revert "success run ngram"

This reverts commit 8351e83993.

* add reverted code

* enable target model in cudagraph v0.9

* solve comment

* fix bid < 0

* Enable Target Model Padding And Draft Model in cudagraph

* solve problem

* delete rebuild padding debug note

* fast compile

* Add capture list for mtp

* success run 256 tp1 mtp

* Enable Lite TP2 Bsz256

* really enable tp2 bsz 256

* fix problem

* Solve problem for Draft model in cudagraph

* Solve comment

* replace empty tensor with zeros

* Solve comments

* Revert "fast compile"

This reverts commit 834639a7ff.

* fix bug

* fix merge bug

* fix typo

* fix bug

---------

Co-authored-by: lizexu <2694294196@qq.com>
Co-authored-by: littledgg <1658565283@qq.com>
Co-authored-by: zeroRains <linjunlu@zerorains.top>
Co-authored-by: gongshaotian <gstain5555@outlook.com>
This commit is contained in:
RAM
2025-10-09 21:18:29 +08:00
committed by GitHub
parent 7b1689f437
commit aa27b03bc0
19 changed files with 250 additions and 139 deletions
@@ -121,7 +121,7 @@ class CudaGraphPiecewiseBackend:
entry.num_finished_warmup += 1
entry.runnable(**kwargs)
logger.debug(
f"[CUDA GRAPH] Warm up for batch size {entry.real_shape}, "
f"[CUDA GRAPH][ID:{id(self)}] Warm up for batch size {entry.real_shape}, "
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
)
@@ -148,15 +148,15 @@ class CudaGraphPiecewiseBackend:
real_shape = ids_remove_padding.shape[0]
padding_real_shape = self.real_shape_to_captured_size[real_shape]
logger.debug(
f"[CUDA GRAPH] The actual real shape obtained by CUDAGraph is :{real_shape}, "
f"The padded shape is :{padding_real_shape}"
f"[CUDA GRAPH][ID:{id(self)}] The actual real shape obtained by CUDAGraph is :{real_shape}, "
f"The padded shape is :{padding_real_shape}, If Padding :{real_shape != padding_real_shape}"
)
entry = self.concrete_size_entries.get(padding_real_shape)
assert entry is not None, f"real shape:{padding_real_shape} is not in cuda graph capture list."
if entry.runnable is None:
entry.runnable = self.runnable
logger.debug(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] New entry lazy initialize with real shape {padding_real_shape}")
if not entry.use_cudagraph:
return entry.runnable(**kwargs)
@@ -171,7 +171,7 @@ class CudaGraphPiecewiseBackend:
entry.num_finished_warmup += 1
entry.runnable(**kwargs)
logger.debug(
f"[CUDA GRAPH] Warm up for real shape {padding_real_shape}, "
f"[CUDA GRAPH][ID:{id(self)}] Warm up for real shape {padding_real_shape}, "
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
)
@@ -206,11 +206,11 @@ class CudaGraphPiecewiseBackend:
# For CUDAGraph debug
# self._save_cudagrpah_dot_files(entry)
logger.debug(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
# Replay
entry.cuda_graph.replay()
logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph replayed for real shape {padding_real_shape}")
if len(entry.output_buffers) == 1:
return entry.output_buffers[0]
return entry.output_buffers
@@ -223,18 +223,19 @@ class CudaGraphPiecewiseBackend:
for shape in self.cudagraph_capture_sizes:
self.concrete_size_entries[shape] = ConcreteSizeEntry(real_shape=shape)
logger.info(
f"[CUDA GRAPH] CUDAGraph capture list {self.cudagraph_capture_sizes}, " "Created all real shape entry."
logger.debug(
f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph capture list {self.cudagraph_capture_sizes}, "
"Created all real shape entry."
)
def clear_graph(self):
""" """
# Clear graphs
custom_ar_clear_ipc_handles()
for id, entry in self.concrete_size_entries.items():
for _id, entry in self.concrete_size_entries.items():
if entry.cuda_graph:
del entry.cuda_graph
logger.debug(f"[CUDA GRAPH] The CUDAGraph with shape {id} has been cleared.")
logger.debug(f"[CUDA GRAPH][ID:{id(self)}] The CUDAGraph with shape {_id} has been cleared.")
del self.concrete_size_entries
paddle.device.cuda.empty_cache()