Files
FastDeploy/tests/deterministic/test_sampling_determinism.py
T
gongweibao 30f9f33f34 [Feature][BugFix][OP] Enhance Deterministic Inference Mode with Kernel-level Fixes and Batch-invariant BMM (#6610)
* add fa deter

* add ut

* add long sentence

* fix basic

* fix bugs

* fix adn

* fix first

* fix single

* fix single

* fix single test

* refine

* add more test

* refine comments

* add comments of bmm

* fix ci

* remove probe

* add

* remove not need

* refine tests

* fix comments and refine code

* refine code

* refine test

* refine test

* mv 4cards tests

* fix tests

* add

* fix comments

* fix cover

* fix cover

---------

Co-authored-by: gongweibao <gognweibao@baidu.com>
2026-03-09 10:27:53 +08:00

160 lines
5.4 KiB
Python

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Unit test: isolate sampling determinism from model computation.
This test fixes the logits (model output) and runs only the sampling
pipeline multiple times. If the results differ, the bug is in sampling;
if they are always identical, the non-determinism comes from model
computation (logits differ between runs).
Usage:
CUDA_VISIBLE_DEVICES=0 pytest tests/deterministic/test_sampling_determinism.py -v -s
"""
import paddle
import paddle.nn.functional as F
import pytest
# Every test in this module requires a GPU (paddle CUDA kernels).
pytestmark = pytest.mark.gpu

VOCAB_SIZE = 151936  # Qwen2 vocab size
# Single-sequence batch: each test samples exactly one token stream.
BATCH_SIZE = 1
def _make_logits(seed: int = 42):
    """Build reproducible pseudo-random logits resembling real model output."""
    paddle.seed(seed)
    # Gaussian noise gives a realistic, non-uniform logit distribution.
    fake_logits = paddle.randn([BATCH_SIZE, VOCAB_SIZE], dtype="float32")
    # Boost a few token positions so the distribution is mildly peaked.
    for token_id, boost in ((100, 5.0), (200, 4.5), (300, 4.0)):
        fake_logits[0, token_id] += boost
    return fake_logits
def _sample_with_top_p(logits, top_p_val, seed_val):
    """Run the same sampling pipeline as sampler.forward_cuda (non-greedy path)."""
    probabilities = F.softmax(logits, axis=-1)
    top_p_tensor = paddle.to_tensor([top_p_val], dtype="float32")
    per_query_seed = paddle.to_tensor([[seed_val]], dtype="int64")
    # seed=-1 disables the global seed; per-query topp_seed drives the RNG.
    _, sampled_ids = paddle.tensor.top_p_sampling(
        probabilities, top_p_tensor, topp_seed=per_query_seed, seed=-1, mode="truncated"
    )
    return sampled_ids.item()
# ---- Test 1: basic repeated sampling on identical logits ----
def test_sampling_determinism_basic():
    """Same logits + same seed -> must produce same token every time."""
    logits = _make_logits(seed=42)
    results = []
    for _ in range(20):
        results.append(_sample_with_top_p(logits, top_p_val=0.95, seed_val=200))
    assert len(set(results)) == 1, f"Sampling non-deterministic! Got {len(set(results))} distinct values: {results}"
# ---- Test 2: simulate multi-step decode (seed increments like real runner) ----
def test_sampling_determinism_multistep():
    """Simulate 100 decode steps with seed incrementing by 4 each step."""
    logits = _make_logits(seed=42)

    def run_steps():
        # real runner increments seed by 4
        return [
            _sample_with_top_p(logits, top_p_val=0.95, seed_val=200 + step * 4)
            for step in range(100)
        ]

    run1 = run_steps()
    run2 = run_steps()
    assert run1 == run2, _diff_msg(run1, run2)
# ---- Test 3: interleave GPU work between sampling calls ----
def test_sampling_determinism_with_gpu_noise():
    """
    Insert GPU matmul work between sampling calls to check if
    GPU state residuals affect sampling determinism.
    """
    logits = _make_logits(seed=42)

    def run_steps_with_noise():
        tokens = []
        for step in range(50):
            # Simulate GPU model forward between steps
            _ = paddle.matmul(paddle.randn([256, 256]), paddle.randn([256, 256]))
            tokens.append(
                _sample_with_top_p(logits, top_p_val=0.95, seed_val=200 + step * 4)
            )
        return tokens

    run1 = run_steps_with_noise()
    run2 = run_steps_with_noise()
    assert run1 == run2, _diff_msg(run1, run2)
# ---- Test 4: flat distribution (temp=1.0 scenario, hardest case) ----
def test_sampling_determinism_flat_distribution():
    """
    Flat probability distribution (simulating temp=1.0 with no dominant token).
    This is the hardest case for determinism.
    """
    paddle.seed(99)
    # Logits close to zero -> softmax gives nearly uniform distribution
    logits = paddle.randn([BATCH_SIZE, VOCAB_SIZE], dtype="float32") * 0.1
    # NOTE: the original version also accumulated results into a
    # results_per_seed dict that was never read; that dead accumulation
    # has been removed — assertions inside the loop are the real check.
    for seed_val in [100, 200, 300, 400, 500]:
        results = [_sample_with_top_p(logits, top_p_val=0.95, seed_val=seed_val) for _ in range(10)]
        assert len(set(results)) == 1, (
            f"seed={seed_val}: sampling non-deterministic on flat dist! "
            f"Got {len(set(results))} distinct values: {results}"
        )
# ---- Test 5: different top_p values ----
@pytest.mark.parametrize("top_p_val", [0.5, 0.8, 0.95, 1.0])
def test_sampling_determinism_various_top_p(top_p_val):
    """Determinism across different top_p values."""
    logits = _make_logits(seed=42)
    results = []
    for _ in range(10):
        results.append(_sample_with_top_p(logits, top_p_val=top_p_val, seed_val=200))
    assert len(set(results)) == 1, (
        f"top_p={top_p_val}: non-deterministic! " f"Got {len(set(results))} distinct values: {results}"
    )
# ---- Helpers ----
def _diff_msg(run1, run2):
for i, (a, b) in enumerate(zip(run1, run2)):
if a != b:
return f"First diff at step {i}: run1={a}, run2={b}. Total diffs: {sum(1 for x, y in zip(run1, run2) if x != y)}/{len(run1)}"
return "Lengths differ"
# Allow running this file directly (outside an explicit pytest invocation).
if __name__ == "__main__":
    pytest.main(["-sv", __file__])