[Feature] Add Deterministic Inference Support (#6476)

* add

* [tests] Add Paddle attention determinism tests and refactor resource manager

Add comprehensive determinism tests for Paddle attention layer and refactor
resource manager for deterministic mode support.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* add

* add

* add

* add

* add more

* add more

* fix some issues

* fix some issues

* fix bugs

* fix bugs

* only in gpu

* add docs

* fix comments

* fix some

* fix some

* fix comments

* add more

* fix potential problem

* remove unneeded code

* remove unneeded code

* remove unneeded code

* fix bug

* fix bugs

* fix comments

* fix comments

* Update tests/ce/deterministic/test_determinism_verification.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/inter_communicator/test_ipc_signal.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/layers/test_paddle_attention_determinism.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/engine/test_sampling_params_determinism.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/layers/test_paddle_attention_determinism.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/layers/test_paddle_attention_determinism_standalone.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix comments

* fix import error

* fix a bug

* fix bugs

* fix bugs

* fix coverage

* refine codes

* refine code

* fix comments

* fix comments

* fix comments

* remove unneeded code

* fix allreduce large tensor bug

* mv log files

* mv log files

* add files

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
gongweibao
2026-02-27 11:31:51 +08:00
committed by GitHub
parent c34cb2a8c2
commit edd31e8849
24 changed files with 3364 additions and 27 deletions
@@ -6,6 +6,10 @@ from collections import namedtuple
from collections.abc import Callable
from typing import Any, Dict
from fastdeploy.utils import get_logger
logger = get_logger("worker_process", "worker_process.log")
import paddle
import triton
import triton.language as tl
@@ -137,13 +141,13 @@ def get_compute_units():
device_properties = paddle.cuda.get_device_properties(0)
NUM_SMS = device_properties.multi_processor_count
except Exception:
print("Could not get CUDA device properties. Falling back to CPU threads.")
logger.warning("Could not get CUDA device properties. Falling back to CPU threads.")
# TODO(liujundong): Paddle lacks a torch.get_num_threads() equivalent for the *configured* thread count.
# Using os.cpu_count() (total logical cores) as a fallback, which may not be correct.
# Must check downstream logic to determine if this impacts correctness.
NUM_SMS = os.cpu_count()
else:
print("No CUDA device available. Using CPU.")
logger.warning("No CUDA device available. Using CPU.")
# For CPU, use the number of CPU cores
NUM_SMS = os.cpu_count()
@@ -153,7 +157,7 @@ def get_compute_units():
def matmul_persistent(a: paddle.Tensor, b: paddle.Tensor, bias: paddle.Tensor | None = None):
# Check constraints.
assert a.shape[1] == b.shape[0], "Incompatible dimensions"
assert a.dtype == b.dtype, "Incompatible dtypes"
assert a.dtype == b.dtype, f"Incompatible dtypes: a={a.dtype}, b={b.dtype}"
assert bias is None or bias.dim() == 1, "Currently assuming bias is 1D, let Horace know if you run into this"
NUM_SMS = get_compute_units()
@@ -210,9 +214,11 @@ def matmul_persistent(a: paddle.Tensor, b: paddle.Tensor, bias: paddle.Tensor |
c.stride(0),
c.stride(1), #
NUM_SMS=NUM_SMS, #
A_LARGE=int(a.numel() > 2**31),
B_LARGE=int(b.numel() > 2**31),
C_LARGE=int(c.numel() > 2**31),
# Use M*K, K*N, M*N instead of numel() to avoid cudaErrorStreamCaptureImplicit
# during CUDA Graph capture
A_LARGE=int(M * K > 2**31),
B_LARGE=int(K * N > 2**31),
C_LARGE=int(M * N > 2**31),
HAS_BIAS=int(bias is not None),
# The Triton compiler (when used with Paddle) cannot handle these variables as booleans. Explicitly cast to int so the compiler can process them.
**configs[dtype],
@@ -477,6 +483,8 @@ def addmm_batch_invariant(
So we use `alpha * (x @ y) + beta * input = alpha * [ (x @ y) + (beta / alpha) * input ]`
to minimize the effection on performance
"""
if alpha == 0:
return paddle.broadcast_to(beta * input, [x.shape[0], y.shape[1]])
matmul_result = matmul_persistent(a=x, b=y, bias=input * beta / alpha)
result = alpha * matmul_result
return result
@@ -490,7 +498,13 @@ def mean_batch_invariant(
x: paddle.Tensor, axis: list[int] = [], keepdim: bool = False, dtype: paddle.dtype | None = None, out=None
) -> paddle.Tensor:
assert dtype is None or dtype == paddle.float32, f"unsupported dtype: {dtype}"
if type(axis) is int:
if axis is None: # Global mean (no axis specified)
# Avoid x.numel() to prevent cudaErrorStreamCaptureImplicit during CUDA Graph capture
n_elems = 1
for s in x.shape:
n_elems *= s
result = paddle.sum(x, keepdim=keepdim, dtype=paddle.float32) / n_elems
elif type(axis) is int:
result = mean_dim(x, axis, keepdim=keepdim)
elif len(axis) == 1: # axis: int | Sequence[int]
result = mean_dim(x, axis[0], keepdim=keepdim)