mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-24 09:44:10 +08:00
[Cherry-Pick][FDConfig] Auto-scale CUDA Graph Capture & CLI Quantization Params + CUDAGraph Validation (#7215,#7281) (#7301)
* refactor cudagraph args * refactor quant cli param * fix * fix * tmp skip xpu * fix
This commit is contained in:
@@ -29,6 +29,7 @@ from fastdeploy.distributed.communication import (
|
||||
capture_custom_allreduce,
|
||||
custom_ar_clear_ipc_handles,
|
||||
)
|
||||
from fastdeploy.platforms import current_platform
|
||||
from fastdeploy.utils import get_logger
|
||||
|
||||
logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.log")
|
||||
@@ -123,9 +124,46 @@ class CudaGraphPiecewiseBackend:
|
||||
self.max_num_seqs = fd_config.scheduler_config.max_num_seqs
|
||||
self.real_bsz_to_captured_size = fd_config.graph_opt_config.real_bsz_to_captured_size
|
||||
|
||||
def run_static_model(self, entry: ConcreteSizeEntry, **kwargs):
|
||||
# Expected decode capture sequence (descending), consistent with capture_model() iteration order.
|
||||
# Used to validate that captures happen in the correct order.
|
||||
self._decode_expected_sequence: list[int] = sorted(self.cudagraph_capture_sizes, reverse=True)
|
||||
# Points to the next expected position in _decode_expected_sequence.
|
||||
self._decode_capture_index: int = 0
|
||||
|
||||
def _validate_decode_capture_order(self, shape: int) -> None:
    """Check that each decode CUDA graph capture arrives in the expected order.

    The expected order is ``self._decode_expected_sequence`` (capture sizes
    sorted descending); ``self._decode_capture_index`` points at the next
    position to be captured and is advanced on every successful validation.

    Args:
        shape: The real (padded) batch shape being captured right now.

    Raises:
        RuntimeError: Immediately, if a capture happens after the full
            sequence has already completed, or if ``shape`` differs from the
            next expected entry in the sequence.
    """
    # Validation is skipped on XPU.
    if current_platform.is_xpu():
        return

    expected_sequence = self._decode_expected_sequence
    position = self._decode_capture_index

    # Every expected capture already happened — any further capture is a bug.
    if position >= len(expected_sequence):
        raise RuntimeError(
            f"[CUDA GRAPH][ID:{id(self)}] Unexpected CUDA graph capture: shape={shape}. "
            f"All {len(self._decode_expected_sequence)} expected captures have already completed. "
            f"Expected sequence: {self._decode_expected_sequence}"
        )

    # The capture must match the next pending entry exactly.
    expected = expected_sequence[position]
    if shape != expected:
        raise RuntimeError(
            f"[CUDA GRAPH][ID:{id(self)}] CUDA graph capture order mismatch at index "
            f"{self._decode_capture_index}: expected shape={expected}, got shape={shape}. "
            f"Full expected sequence: {self._decode_expected_sequence}"
        )

    logger.debug(
        f"[CUDA GRAPH][ID:{id(self)}] Capture order validated: shape={shape} matches "
        f"expected sequence at index {self._decode_capture_index} "
        f"(sequence: {self._decode_expected_sequence})"
    )
    # Advance to the next expected capture slot.
    self._decode_capture_index += 1
|
||||
|
||||
def run_static_model(self, entry: ConcreteSizeEntry, is_decode: bool = False, **kwargs):
|
||||
|
||||
if not entry.captured:
|
||||
if is_decode:
|
||||
self._validate_decode_capture_order(entry.real_shape)
|
||||
# Warmup the model
|
||||
for n in range(entry.num_finished_warmup, self.warm_up_size):
|
||||
entry.num_finished_warmup += 1
|
||||
@@ -194,13 +232,14 @@ class CudaGraphPiecewiseBackend:
|
||||
# - Static full graph mode: Dynamic for prefill/mixed, Static + CUDAGraph for decode
|
||||
# - Dynamic mode: Dynamic + CUDAGraph for decode only
|
||||
if static_cudagraph_for_prefill or static_cudagraph_for_decode:
|
||||
return self.run_static_model(entry, **kwargs)
|
||||
return self.run_static_model(entry, is_decode=static_cudagraph_for_decode, **kwargs)
|
||||
|
||||
# Capture a new cuda graph
|
||||
if entry.cuda_graph is None:
|
||||
assert (
|
||||
real_shape == padding_real_shape
|
||||
), f"real_shape:{real_shape} is not equal to padding_real_shape:{padding_real_shape} when capture new graph."
|
||||
self._validate_decode_capture_order(padding_real_shape)
|
||||
# Warmup the model
|
||||
for n in range(entry.num_finished_warmup, self.warm_up_size):
|
||||
entry.num_finished_warmup += 1
|
||||
@@ -278,6 +317,8 @@ class CudaGraphPiecewiseBackend:
|
||||
del self.concrete_size_entries
|
||||
paddle.device.cuda.empty_cache()
|
||||
|
||||
self._decode_capture_index = 0
|
||||
|
||||
# Create new entries
|
||||
self._create_entry_dict()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user