Files
FastDeploy/tests/deterministic/test_sampling_determinism.py
T
gongweibao 30f9f33f34 [Feature][BugFix][OP] Enhance Deterministic Inference Mode with Kernel-level Fixes and Batch-invariant BMM (#6610)
* add fa deter

* add ut

* add long sentence

* fix basic

* fix bugs

* fix adn

* fix first

* fix single

* fix single

* fix single test

* refine

* add more test

* refine comments

* add comments of bmm

* fix ci

* remove probe

* add

* remove not need

* refine tests

* fix comments and refine code

* refine code

* refine test

* refine test

* mv 4cards tests

* fix tests

* add

* fix comments

* fix cover

* fix cover

---------

Co-authored-by: gongweibao <gognweibao@baidu.com>
2026-03-09 10:27:53 +08:00

160 lines
5.4 KiB
Python

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Unit test: isolate sampling determinism from model computation.
This test fixes the logits (model output) and runs only the sampling
pipeline multiple times. If the results differ, the bug is in sampling;
if they are always identical, the non-determinism comes from model
computation (logits differ between runs).
Usage:
CUDA_VISIBLE_DEVICES=0 pytest tests/deterministic/test_sampling_determinism.py -v -s
"""
import paddle
import paddle.nn.functional as F
import pytest
# Every test in this module requires a GPU (paddle CUDA kernels).
pytestmark = pytest.mark.gpu

VOCAB_SIZE = 151936  # Qwen2 vocab size
# Single-sequence batch: each test samples exactly one token stream.
BATCH_SIZE = 1
def _make_logits(seed: int = 42):
    """Build reproducible pseudo-random logits resembling real model output."""
    paddle.seed(seed)
    # Gaussian noise gives a realistic, non-uniform logit distribution.
    fake_logits = paddle.randn([BATCH_SIZE, VOCAB_SIZE], dtype="float32")
    # Boost a few token positions so the distribution is mildly peaked.
    for token_id, boost in ((100, 5.0), (200, 4.5), (300, 4.0)):
        fake_logits[0, token_id] += boost
    return fake_logits
def _sample_with_top_p(logits, top_p_val, seed_val):
    """Run the same sampling pipeline as sampler.forward_cuda (non-greedy path)."""
    probabilities = F.softmax(logits, axis=-1)
    top_p_tensor = paddle.to_tensor([top_p_val], dtype="float32")
    per_query_seed = paddle.to_tensor([[seed_val]], dtype="int64")
    # seed=-1 disables the global seed; per-query topp_seed drives the RNG.
    _, sampled_ids = paddle.tensor.top_p_sampling(
        probabilities, top_p_tensor, topp_seed=per_query_seed, seed=-1, mode="truncated"
    )
    return sampled_ids.item()
# ---- Test 1: basic repeated sampling on identical logits ----
def test_sampling_determinism_basic():
    """Same logits + same seed -> must produce same token every time."""
    logits = _make_logits(seed=42)
    results = []
    for _ in range(20):
        results.append(_sample_with_top_p(logits, top_p_val=0.95, seed_val=200))
    assert len(set(results)) == 1, f"Sampling non-deterministic! Got {len(set(results))} distinct values: {results}"
# ---- Test 2: simulate multi-step decode (seed increments like real runner) ----
def test_sampling_determinism_multistep():
    """Simulate 100 decode steps with seed incrementing by 4 each step."""
    logits = _make_logits(seed=42)

    def run_steps():
        # real runner increments seed by 4
        return [
            _sample_with_top_p(logits, top_p_val=0.95, seed_val=200 + step * 4)
            for step in range(100)
        ]

    run1 = run_steps()
    run2 = run_steps()
    assert run1 == run2, _diff_msg(run1, run2)
# ---- Test 3: interleave GPU work between sampling calls ----
def test_sampling_determinism_with_gpu_noise():
    """
    Insert GPU matmul work between sampling calls to check if
    GPU state residuals affect sampling determinism.
    """
    logits = _make_logits(seed=42)

    def run_steps_with_noise():
        tokens = []
        for step in range(50):
            # Simulate GPU model forward between steps
            _ = paddle.matmul(paddle.randn([256, 256]), paddle.randn([256, 256]))
            tokens.append(
                _sample_with_top_p(logits, top_p_val=0.95, seed_val=200 + step * 4)
            )
        return tokens

    run1 = run_steps_with_noise()
    run2 = run_steps_with_noise()
    assert run1 == run2, _diff_msg(run1, run2)
# ---- Test 4: flat distribution (temp=1.0 scenario, hardest case) ----
def test_sampling_determinism_flat_distribution():
    """
    Flat probability distribution (simulating temp=1.0 with no dominant token).
    This is the hardest case for determinism.
    """
    paddle.seed(99)
    # Logits close to zero -> softmax gives nearly uniform distribution
    logits = paddle.randn([BATCH_SIZE, VOCAB_SIZE], dtype="float32") * 0.1
    # NOTE: the original version also accumulated results into a
    # results_per_seed dict that was never read; that dead accumulation
    # has been removed — assertions inside the loop are the real check.
    for seed_val in [100, 200, 300, 400, 500]:
        results = [_sample_with_top_p(logits, top_p_val=0.95, seed_val=seed_val) for _ in range(10)]
        assert len(set(results)) == 1, (
            f"seed={seed_val}: sampling non-deterministic on flat dist! "
            f"Got {len(set(results))} distinct values: {results}"
        )
# ---- Test 5: different top_p values ----
@pytest.mark.parametrize("top_p_val", [0.5, 0.8, 0.95, 1.0])
def test_sampling_determinism_various_top_p(top_p_val):
    """Determinism across different top_p values."""
    logits = _make_logits(seed=42)
    results = []
    for _ in range(10):
        results.append(_sample_with_top_p(logits, top_p_val=top_p_val, seed_val=200))
    assert len(set(results)) == 1, (
        f"top_p={top_p_val}: non-deterministic! " f"Got {len(set(results))} distinct values: {results}"
    )
# ---- Helpers ----
def _diff_msg(run1, run2):
for i, (a, b) in enumerate(zip(run1, run2)):
if a != b:
return f"First diff at step {i}: run1={a}, run2={b}. Total diffs: {sum(1 for x, y in zip(run1, run2) if x != y)}/{len(run1)}"
return "Lengths differ"
# Allow running this file directly (outside an explicit pytest invocation).
if __name__ == "__main__":
    pytest.main(["-sv", __file__])