Files
FastDeploy/tests/e2e/4cards_cases/test_determinism_long.py
T
gongweibao 48cfb608aa [FDConfig] Reduce FD_CUSTOM_AR_MAX_SIZE_MB default from 64 to 8 (#6997)
Most single-GPU and small-model deployments do not need 64MB custom
all-reduce buffers. Lowering the default to 8MB reduces unnecessary
shared memory allocation. Tests that require larger buffers now
explicitly set the value.

Co-authored-by: gongweibao <gognweibao@baidu.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-25 17:40:01 +08:00

415 lines
15 KiB
Python

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Long sequence determinism tests.
This test ensures that the deterministic mode works correctly for long sequences
that trigger the partition_kv code path (num_chunks > 1 when KV length > 1024).
Key requirements:
1. Total KV length (prompt_tokens + max_tokens) must exceed 1024 to trigger partition_kv
2. Recommended: KV length >= 2048 to ensure num_chunks >= 2
Usage:
CUDA_VISIBLE_DEVICES=0,1,2,3 pytest tests/e2e/4cards_cases/test_determinism_long.py -v
"""
import gc
import itertools
import os
from contextlib import contextmanager
import pytest
try:
import paddle.device.cuda as _paddle_cuda
except Exception:
_paddle_cuda = None
try:
from fastdeploy.logger.deterministic_logger import (
_read_logits_md5_file,
_reset_logits_md5_file,
)
except Exception:
_read_logits_md5_file = None
_reset_logits_md5_file = None
# Every test in this module requires GPUs (collected under the "gpu" mark).
pytestmark = pytest.mark.gpu
# Default location of model checkpoints; override with the MODEL_PATH env var.
DEFAULT_MODEL_DIR = "./models"
MODEL_NAME = "Qwen2-7B-Instruct"
@contextmanager
def env_override(mapping):
    """Apply *mapping* to ``os.environ`` for the duration of the block.

    On exit, keys that existed before are restored to their previous values
    and keys that were absent are removed again.
    """
    saved = {name: os.environ.get(name) for name in mapping}
    os.environ.update(mapping)
    try:
        yield
    finally:
        for name, previous in saved.items():
            if previous is not None:
                os.environ[name] = previous
            else:
                os.environ.pop(name, None)
@pytest.fixture(scope="module")
def model_path():
    """Resolve the model directory: $MODEL_PATH (or the default) + model name."""
    base_dir = os.getenv("MODEL_PATH", DEFAULT_MODEL_DIR)
    return os.path.join(base_dir, MODEL_NAME)
@pytest.fixture(autouse=True)
def _reset_deterministic_mode():
    """Force deterministic mode ON both before and after every test."""
    flag = "FD_DETERMINISTIC_MODE"
    os.environ[flag] = "1"
    yield
    os.environ[flag] = "1"
def _is_high_performance_gpu():
    """Return True when GPU 0 is at least H800-class.

    Compute capability is used as a performance proxy: H800/H100 are 9.0,
    H200 is 9.0+, B100/B200 are 10.0, so anything >= 9.0 counts as high
    performance.  Any probe failure is treated as "not high-end".
    """
    if _paddle_cuda is None:
        return False
    try:
        device = _paddle_cuda.get_device_properties(0)
        capability = device.major * 1.0 + device.minor * 0.1
        return capability >= 9.0
    except Exception:
        return False
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Use smallest chunk_size (64) to maximize num_chunks and increase
# sensitivity to partition_kv non-determinism. With chunk_size=64:
#   - 1200 tokens -> 19 chunks (vs 2 chunks with default 1024)
#   - More chunks = more merge operations = easier to detect non-determinism
# String on purpose: consumed via the FLAGS_max_partition_size env var below.
_CHUNK_SIZE_FOR_TEST = "64"
# Long prompt to ensure KV length > 1024 (triggers partition_kv path)
# This sentence is ~20 tokens, repeated 40 times = ~800 tokens
_BASE_SENTENCE = (
    "Artificial intelligence has transformed various industries including healthcare, "
    "finance, transportation, and education through machine learning algorithms. "
)
_LONG_PROMPT = _BASE_SENTENCE * 40 + (
    "Based on the above context about AI, please provide a detailed analysis of "
    "the future trends and potential challenges in AI development."
)
# With ~800 token prompt + 512 max_tokens, total KV length ~1312 > 1024
# This ensures num_chunks >= 2, triggering the partition_kv code path
_MAX_TOKENS_LONG = 512
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture(scope="module", autouse=True)
def _module_env():
    """Set env vars BEFORE importing fastdeploy (must happen first).

    fastdeploy reads these variables at import time, so the import is done
    lazily inside the fixture, after env_override has applied the settings.
    """
    overrides = {
        "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", "0,1,2,3"),
        "FD_DETERMINISTIC_MODE": "1",
        "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "64"),
        "FLAGS_max_partition_size": _CHUNK_SIZE_FOR_TEST,
    }
    with env_override(overrides):
        # Bind the lazily imported symbols at module level for the tests.
        global LLM, SamplingParams  # noqa: PLW0603
        from fastdeploy import LLM, SamplingParams
        yield
@pytest.fixture(scope="module")
def llm(model_path, _module_env):
    """Module-scoped LLM engine, garbage-collected after the module finishes."""
    use_cudagraph = os.getenv("USE_CUDAGRAPH", "0") == "1"
    engine = LLM(
        model=model_path,
        tensor_parallel_size=int(os.getenv("TP_SIZE", "4")),
        max_model_len=8192,
        enable_prefix_caching=False,
        graph_optimization_config={"use_cudagraph": use_cudagraph},
    )
    yield engine
    del engine
    gc.collect()
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _generate_text(llm, prompt, sp):
"""Generate once, return (text, token_ids)."""
out = llm.generate([prompt], sp)[0]
return out.outputs.text, list(out.outputs.token_ids)
def _collect_logits_hashes():
    """Read and clear the per-step logits MD5 hashes written by the worker.

    Returns an empty list when the logger hook is unavailable or reading fails.
    """
    reader = _read_logits_md5_file
    if reader is None:
        return []
    try:
        return reader()
    except Exception:
        return []
def _reset_logits_hashes():
    """Best-effort truncation of the logits MD5 hash file before a new run."""
    resetter = _reset_logits_md5_file
    if resetter is not None:
        try:
            resetter()
        except Exception:
            pass
def _report_logits_diff(hashes_list):
    """Compare logits hashes between runs and report first divergence.

    *hashes_list* holds one entry per run; each entry is a list of per-step
    dicts with "logits_md5" and "probs_md5" keys.  Narrows the source of
    non-determinism to model computation, softmax/penalty, or the sampling
    operator.  Diagnostic only: prints, never raises.
    """
    # Need a baseline run plus at least one comparison run, and the baseline
    # must actually contain hashes (hash logging may be disabled).
    if len(hashes_list) < 2 or not hashes_list[0]:
        print("[DIAG-LOGITS] No logits hashes collected (FD_DETERMINISTIC_LOG_MODE=1 ?)")
        return
    baseline = hashes_list[0]
    for run_idx, hashes in enumerate(hashes_list[1:], start=1):
        min_len = min(len(baseline), len(hashes))
        # Pass 1: a raw-logits mismatch means divergence happened inside the
        # model forward pass, before any sampling.
        for step in range(min_len):
            if baseline[step]["logits_md5"] != hashes[step]["logits_md5"]:
                print(f"[DIAG-LOGITS] Run {run_idx}: LOGITS FIRST DIFFER at step {step}")
                print(
                    f"[DIAG-LOGITS] baseline logits_md5={baseline[step]['logits_md5']}, "
                    f"probs_md5={baseline[step]['probs_md5']}"
                )
                print(
                    f"[DIAG-LOGITS] run_{run_idx} logits_md5={hashes[step]['logits_md5']}, "
                    f"probs_md5={hashes[step]['probs_md5']}"
                )
                print("[DIAG-LOGITS] -> Non-determinism is in MODEL COMPUTATION (not sampling)")
                return
        # All logits matched in the shared range; a differing step count still
        # indicates divergence (a run stopped earlier/later).
        if len(baseline) != len(hashes):
            print(
                f"[DIAG-LOGITS] Run {run_idx}: All logits identical "
                f"but length differs ({len(baseline)} vs {len(hashes)})"
            )
            return
        # Pass 2: identical logits but differing probabilities point at the
        # softmax / penalty stage rather than the model.
        for step in range(min_len):
            if baseline[step]["probs_md5"] != hashes[step]["probs_md5"]:
                print(f"[DIAG-LOGITS] Run {run_idx}: logits identical but PROBS DIFFER at step {step}")
                print("[DIAG-LOGITS] -> Non-determinism is in SOFTMAX/PENALTY (not model)")
                return
        # Everything upstream matched; since this helper is only called after
        # token ids diverged, only the sampling operator can be responsible.
        print(f"[DIAG-LOGITS] Run {run_idx}: ALL logits AND probs IDENTICAL across {min_len} steps")
        print("[DIAG-LOGITS] -> Non-determinism is in SAMPLING OPERATOR")
def _report_token_diff(token_ids_list, sp=None):
    """Report detailed token-level diff to diagnose determinism issues.

    For each run after the baseline (run 0), prints the first diverging token
    position with a small surrounding window plus the total mismatch count.
    *sp* (the SamplingParams used) is echoed when provided.  Diagnostic only.
    """
    print("\n" + "=" * 70)
    print("[DIAG] Token-level determinism diagnosis")
    print("=" * 70)
    if sp is not None:
        print(f"[DIAG] SamplingParams: temperature={sp.temperature}, seed={sp.seed}, top_p={sp.top_p}")
    # Summary line per run before the pairwise comparison.
    for i, tids in enumerate(token_ids_list):
        print(f"[DIAG] Run {i}: {len(tids)} tokens, first 10: {tids[:10]}")
    baseline = token_ids_list[0]
    for i, tids in enumerate(token_ids_list[1:], start=1):
        if tids == baseline:
            print(f"[DIAG] Run {i}: IDENTICAL to baseline")
            continue
        min_len = min(len(baseline), len(tids))
        for j in range(min_len):
            if baseline[j] != tids[j]:
                print(f"[DIAG] Run {i}: FIRST DIVERGENCE at token position {j}")
                print(f"[DIAG] baseline[{j}] = {baseline[j]}")
                print(f"[DIAG] run_{i}[{j}] = {tids[j]}")
                # Context window: up to 3 tokens before and after the divergence.
                start = max(0, j - 3)
                end = min(min_len, j + 4)
                print(f"[DIAG] baseline[{start}:{end}] = {baseline[start:end]}")
                print(f"[DIAG] run_{i}[{start}:{end}] = {tids[start:end]}")
                total_diff = sum(1 for a, b in zip(baseline[:min_len], tids[:min_len]) if a != b)
                print(f"[DIAG] Total differing tokens (in shared range): {total_diff}/{min_len}")
                break
        # Runs can also differ only in length (e.g. one stopped at EOS earlier).
        if len(baseline) != len(tids):
            print(f"[DIAG] Length differs: baseline={len(baseline)}, run_{i}={len(tids)}")
    print("=" * 70 + "\n")
def _report_text_diff(texts):
"""Report detailed diff when texts differ."""
for i, text in enumerate(texts[1:], start=1):
if text != texts[0]:
if len(text) != len(texts[0]):
print(f"Run {i}: length differs (baseline={len(texts[0])}, got={len(text)})")
for j, (c1, c2) in enumerate(itertools.zip_longest(texts[0], text, fillvalue="")):
if c1 != c2:
print(f"Run {i}: first diff at pos {j}")
print(f" Baseline: {repr(texts[0][max(0, j-10):j+20])}")
print(f" Run {i}: {repr(text[max(0, j-10):j+20])}")
break
def _assert_deterministic(llm, prompt, sp, runs=2):
    """Generate *runs* times and fail unless every run matches run 0 exactly.

    Token ids are compared first (the stronger signal), then text.  On a
    mismatch, token/logits diagnostics are printed before failing the test.
    Returns the baseline ``(text, token_ids)`` pair on success.
    """
    collected_hashes = []
    outputs = []
    for _ in range(runs):
        _reset_logits_hashes()  # truncate the hash file before each run
        outputs.append(_generate_text(llm, prompt, sp))
        collected_hashes.append(_collect_logits_hashes())
    texts = [text for text, _ in outputs]
    token_ids = [ids for _, ids in outputs]
    if any(ids != token_ids[0] for ids in token_ids):
        _report_token_diff(token_ids, sp)
        _report_logits_diff(collected_hashes)
        pytest.fail("Token IDs differ across runs")
    if any(text != texts[0] for text in texts):
        _report_text_diff(texts)
        pytest.fail("Text outputs differ across runs")
    return texts[0], token_ids[0]
# ===================== Long sequence tests =====================
@pytest.mark.parametrize(
    "temp,seed",
    [
        (0.0, 100),
        (1.0, 200),
    ],
)
def test_deterministic_long_sequence(llm, temp, seed):
    """A 384-token generation must be bit-identical across runs, both greedy
    (temp=0) and at high temperature."""
    question = "Please describe the history of AI in detail, including major milestones and key technical breakthroughs."
    sampling = SamplingParams(temperature=temp, top_p=0.95, max_tokens=384, seed=seed)
    _, ids = _assert_deterministic(llm, question, sampling)
    assert len(ids) >= 100, f"Expected >= 100 tokens, got {len(ids)}"
def test_deterministic_long_prompt(llm):
    """A prefill-heavy request (one sentence repeated 50x) stays deterministic."""
    sentence = "This is a description about natural language processing. "
    prompt = (sentence * 50) + "Please summarize the above."
    sampling = SamplingParams(temperature=0.5, max_tokens=100, seed=2024)
    _assert_deterministic(llm, prompt, sampling)
# ===================== Partition-kv aware tests =====================
def test_long_sequence_determinism_basic(llm):
    """Core partition_kv regression test: long KV length, five repeated runs.

    The long prompt plus 512 generated tokens pushes the total KV length past
    the chunking threshold, so num_chunks > 1 and the partition_kv merge path
    is exercised — the scenario the deterministic-mode fix targets.
    """
    sampling = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=512, seed=170)
    _, ids = _assert_deterministic(llm, _LONG_PROMPT, sampling, runs=5)
    assert len(ids) >= 200, f"Expected >= 200 tokens, got {len(ids)}"
@pytest.mark.skipif(
    not _is_high_performance_gpu(),
    reason="Test only runs on GPUs with performance >= H800 (compute capability >= 9.0)",
)
@pytest.mark.parametrize(
    "max_tokens,min_expected,desc",
    [
        (400, 100, "~1200 total (~19 chunks)"),
        (1280, 200, "~2000 total (~32 chunks)"),
        (2200, 300, "~3000 total (~47 chunks)"),
    ],
    ids=["19_chunks", "32_chunks", "47_chunks"],
)
@pytest.mark.skip(reason="Skipping because the test takes a long time.")
def test_long_sequence_multiple_lengths(llm, max_tokens, min_expected, desc):
    """Determinism at several sequence lengths crossing chunk boundaries.

    With FLAGS_max_partition_size=64 (chunk_size=64) each parametrization
    exercises a different chunk count.  min_expected is deliberately
    conservative because generation may stop early at EOS — the assertion of
    interest is determinism, not exact token count.
    """
    sampling = SamplingParams(
        temperature=0.7,
        top_p=0.95,
        max_tokens=max_tokens,
        seed=42,
    )
    _, ids = _assert_deterministic(llm, _LONG_PROMPT, sampling, runs=5)
    assert len(ids) >= min_expected, f"{desc}: expected >= {min_expected} tokens, got {len(ids)}"
def test_long_sequence_batch_invariance(llm):
    """The long prompt's output must not depend on its position in a batch.

    Verifies the partition_kv fix keeps batch invariance: a single-request
    baseline is compared against the same prompt embedded at different
    positions among short filler requests.
    """
    sampling = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=_MAX_TOKENS_LONG, seed=170)
    ref_text, ref_ids = _generate_text(llm, _LONG_PROMPT, sampling)
    filler = "What is machine learning?"
    layouts = [
        [_LONG_PROMPT, filler],
        [filler, _LONG_PROMPT],
        [filler, _LONG_PROMPT, filler],
    ]
    for cfg_no, batch in enumerate(layouts):
        results = llm.generate(batch, sampling)
        pos = batch.index(_LONG_PROMPT)
        got_text = results[pos].outputs.text
        got_ids = list(results[pos].outputs.token_ids)
        assert got_text == ref_text, f"Batch config {cfg_no} (pos {pos}): text differs"
        assert got_ids == ref_ids, f"Batch config {cfg_no} (pos {pos}): token_ids differ"
# Allow running this file directly: `python test_determinism_long.py`.
if __name__ == "__main__":
    pytest.main(["-sv", __file__])