mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 17:11:21 +08:00
48cfb608aa
Most single-GPU and small-model deployments do not need 64MB custom all-reduce buffers. Lowering the default to 8MB reduces unnecessary shared memory allocation. Tests that require larger buffers now explicitly set the value. Co-authored-by: gongweibao <gognweibao@baidu.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
415 lines
15 KiB
Python
415 lines
15 KiB
Python
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
Long sequence determinism tests.
|
|
|
|
This test ensures that the deterministic mode works correctly for long sequences
|
|
that trigger the partition_kv code path (num_chunks > 1 when KV length > 1024).
|
|
|
|
Key requirements:
|
|
1. Total KV length (prompt_tokens + max_tokens) must exceed 1024 to trigger partition_kv
|
|
2. Recommended: KV length >= 2048 to ensure num_chunks >= 2
|
|
|
|
Usage:
|
|
CUDA_VISIBLE_DEVICES=0,1,2,3 pytest tests/e2e/4cards_cases/test_determinism_long.py -v
|
|
"""
|
|
|
|
import gc
|
|
import itertools
|
|
import os
|
|
from contextlib import contextmanager
|
|
|
|
import pytest
|
|
|
|
try:
|
|
import paddle.device.cuda as _paddle_cuda
|
|
except Exception:
|
|
_paddle_cuda = None
|
|
|
|
try:
|
|
from fastdeploy.logger.deterministic_logger import (
|
|
_read_logits_md5_file,
|
|
_reset_logits_md5_file,
|
|
)
|
|
except Exception:
|
|
_read_logits_md5_file = None
|
|
_reset_logits_md5_file = None
|
|
|
|
# Every test in this module requires GPU hardware.
pytestmark = pytest.mark.gpu

# Fallback model root used when MODEL_PATH is not set in the environment.
DEFAULT_MODEL_DIR = "./models"

# Model subdirectory name, joined onto the model root by the model_path fixture.
MODEL_NAME = "Qwen2-7B-Instruct"
|
|
|
|
|
|
@contextmanager
def env_override(mapping):
    """Apply *mapping* to ``os.environ`` for the duration of the block.

    On exit every variable is restored to its previous value; variables
    that did not exist beforehand are removed again.
    """
    saved = {name: os.environ.get(name) for name in mapping}
    os.environ.update(mapping)
    try:
        yield
    finally:
        for name, previous in saved.items():
            if previous is not None:
                os.environ[name] = previous
            else:
                os.environ.pop(name, None)
|
|
|
|
|
|
@pytest.fixture(scope="module")
def model_path():
    """Resolve the model directory: $MODEL_PATH (or default root) + model name."""
    base_dir = os.getenv("MODEL_PATH", DEFAULT_MODEL_DIR)
    return os.path.join(base_dir, MODEL_NAME)
|
|
|
|
|
|
@pytest.fixture(autouse=True)
def _reset_deterministic_mode():
    """Force deterministic mode ON around every test in this module."""
    key = "FD_DETERMINISTIC_MODE"
    os.environ[key] = "1"
    yield
    # Re-assert after the test as well, in case the test body toggled it.
    os.environ[key] = "1"
|
|
|
|
|
|
def _is_high_performance_gpu():
    """Return True when GPU 0 is at least H800-class.

    Compute capability is used as a performance proxy: H800/H100 report
    9.0 and newer parts (e.g. B100/B200) report 10.0, so any device with
    a capability of 9.0 or above counts as high performance. Returns
    False when paddle's CUDA module is unavailable or the query fails.
    """
    if _paddle_cuda is None:
        return False
    try:
        props = _paddle_cuda.get_device_properties(0)
        capability = props.major * 1.0 + props.minor * 0.1
        return capability >= 9.0
    except Exception:
        return False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Use smallest chunk_size (64) to maximize num_chunks and increase
# sensitivity to partition_kv non-determinism. With chunk_size=64:
# - 1200 tokens -> 19 chunks (vs 2 chunks with default 1024)
# - More chunks = more merge operations = easier to detect non-determinism
# NOTE: kept as a string because it is injected through the
# FLAGS_max_partition_size environment variable in the _module_env fixture.
_CHUNK_SIZE_FOR_TEST = "64"

# Long prompt to ensure KV length > 1024 (triggers partition_kv path)
# This sentence is ~20 tokens, repeated 40 times = ~800 tokens
_BASE_SENTENCE = (
    "Artificial intelligence has transformed various industries including healthcare, "
    "finance, transportation, and education through machine learning algorithms. "
)
# Repeated filler plus a trailing instruction so generation has a real task.
_LONG_PROMPT = _BASE_SENTENCE * 40 + (
    "Based on the above context about AI, please provide a detailed analysis of "
    "the future trends and potential challenges in AI development."
)

# With ~800 token prompt + 512 max_tokens, total KV length ~1312 > 1024
# This ensures num_chunks >= 2, triggering the partition_kv code path
_MAX_TOKENS_LONG = 512
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Fixtures
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@pytest.fixture(scope="module", autouse=True)
def _module_env():
    """Prepare the process environment, then lazily import fastdeploy.

    The environment variables must be in place before ``fastdeploy`` is
    imported, which is why the import happens inside this fixture instead
    of at module top level.
    """
    overrides = {
        "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", "0,1,2,3"),
        "FD_DETERMINISTIC_MODE": "1",
        # Custom all-reduce buffer size in MB; keep any caller-provided value.
        "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "64"),
        "FLAGS_max_partition_size": _CHUNK_SIZE_FOR_TEST,
    }
    with env_override(overrides):
        # Lazy import: env vars must be set before importing fastdeploy
        global LLM, SamplingParams  # noqa: PLW0603
        from fastdeploy import LLM, SamplingParams

        yield
|
|
|
|
|
|
@pytest.fixture(scope="module")
def llm(model_path, _module_env):
    """Build a module-scoped LLM instance and release it after the tests."""
    tp_size = int(os.getenv("TP_SIZE", "4"))
    use_cudagraph = os.getenv("USE_CUDAGRAPH", "0") == "1"
    engine = LLM(
        model=model_path,
        tensor_parallel_size=tp_size,
        max_model_len=8192,
        enable_prefix_caching=False,
        graph_optimization_config={"use_cudagraph": use_cudagraph},
    )
    yield engine
    # Drop our reference and force a collection so GPU resources free up.
    del engine
    gc.collect()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _generate_text(llm, prompt, sp):
    """Run a single generation and return ``(text, token_ids)`` for it."""
    result = llm.generate([prompt], sp)[0]
    output = result.outputs
    return output.text, list(output.token_ids)
|
|
|
|
|
|
def _collect_logits_hashes():
    """Read the per-step logits MD5 hashes written by the worker process.

    Returns an empty list when the logger helpers are unavailable or the
    read fails for any reason.
    """
    if _read_logits_md5_file is None:
        return []
    try:
        hashes = _read_logits_md5_file()
    except Exception:
        return []
    return hashes
|
|
|
|
|
|
def _reset_logits_hashes():
    """Truncate the logits MD5 hash file ahead of a fresh generate run."""
    if _reset_logits_md5_file is None:
        return
    try:
        _reset_logits_md5_file()
    except Exception:
        # Best effort: a missing/locked file must not fail the test run.
        pass
|
|
|
|
|
|
def _report_logits_diff(hashes_list):
    """Compare per-step logits/probs hashes across runs and localize the
    first divergence: model computation vs softmax/penalty vs sampling.

    Stops after reporting the first divergent run; when a run is fully
    identical, moves on to the next one.
    """
    if len(hashes_list) < 2 or not hashes_list[0]:
        print("[DIAG-LOGITS] No logits hashes collected (FD_DETERMINISTIC_LOG_MODE=1 ?)")
        return

    baseline = hashes_list[0]

    def first_mismatch(other, key, upto):
        # Index of the first step whose *key* hash differs, or None.
        for step in range(upto):
            if baseline[step][key] != other[step][key]:
                return step
        return None

    for run_idx, hashes in enumerate(hashes_list[1:], start=1):
        shared = min(len(baseline), len(hashes))

        step = first_mismatch(hashes, "logits_md5", shared)
        if step is not None:
            print(f"[DIAG-LOGITS] Run {run_idx}: LOGITS FIRST DIFFER at step {step}")
            print(
                f"[DIAG-LOGITS] baseline logits_md5={baseline[step]['logits_md5']}, "
                f"probs_md5={baseline[step]['probs_md5']}"
            )
            print(
                f"[DIAG-LOGITS] run_{run_idx} logits_md5={hashes[step]['logits_md5']}, "
                f"probs_md5={hashes[step]['probs_md5']}"
            )
            print("[DIAG-LOGITS] -> Non-determinism is in MODEL COMPUTATION (not sampling)")
            return

        if len(baseline) != len(hashes):
            print(
                f"[DIAG-LOGITS] Run {run_idx}: All logits identical "
                f"but length differs ({len(baseline)} vs {len(hashes)})"
            )
            return

        step = first_mismatch(hashes, "probs_md5", shared)
        if step is not None:
            print(f"[DIAG-LOGITS] Run {run_idx}: logits identical but PROBS DIFFER at step {step}")
            print("[DIAG-LOGITS] -> Non-determinism is in SOFTMAX/PENALTY (not model)")
            return

        print(f"[DIAG-LOGITS] Run {run_idx}: ALL logits AND probs IDENTICAL across {shared} steps")
        print("[DIAG-LOGITS] -> Non-determinism is in SAMPLING OPERATOR")
|
|
|
|
|
|
def _report_token_diff(token_ids_list, sp=None):
    """Print a token-level diff across runs to localize non-determinism.

    Shows the sampling parameters (when given), a per-run summary, the
    first divergent position with surrounding context, the total number
    of differing tokens, and any length mismatch against the baseline.
    """
    print("\n" + "=" * 70)
    print("[DIAG] Token-level determinism diagnosis")
    print("=" * 70)
    if sp is not None:
        print(f"[DIAG] SamplingParams: temperature={sp.temperature}, seed={sp.seed}, top_p={sp.top_p}")
    for run_idx, ids in enumerate(token_ids_list):
        print(f"[DIAG] Run {run_idx}: {len(ids)} tokens, first 10: {ids[:10]}")

    reference = token_ids_list[0]
    for run_idx, ids in enumerate(token_ids_list[1:], start=1):
        if ids == reference:
            print(f"[DIAG] Run {run_idx}: IDENTICAL to baseline")
            continue
        shared = min(len(reference), len(ids))
        for pos in range(shared):
            if reference[pos] == ids[pos]:
                continue
            print(f"[DIAG] Run {run_idx}: FIRST DIVERGENCE at token position {pos}")
            print(f"[DIAG] baseline[{pos}] = {reference[pos]}")
            print(f"[DIAG] run_{run_idx}[{pos}] = {ids[pos]}")
            lo = max(0, pos - 3)
            hi = min(shared, pos + 4)
            print(f"[DIAG] baseline[{lo}:{hi}] = {reference[lo:hi]}")
            print(f"[DIAG] run_{run_idx}[{lo}:{hi}] = {ids[lo:hi]}")
            mismatches = sum(1 for a, b in zip(reference[:shared], ids[:shared]) if a != b)
            print(f"[DIAG] Total differing tokens (in shared range): {mismatches}/{shared}")
            break
        if len(reference) != len(ids):
            print(f"[DIAG] Length differs: baseline={len(reference)}, run_{run_idx}={len(ids)}")
    print("=" * 70 + "\n")
|
|
|
|
|
|
def _report_text_diff(texts):
    """Print where each run's text first deviates from the baseline."""
    reference = texts[0]
    for run_idx, candidate in enumerate(texts[1:], start=1):
        if candidate == reference:
            continue
        if len(candidate) != len(reference):
            print(f"Run {run_idx}: length differs (baseline={len(reference)}, got={len(candidate)})")
        pairs = itertools.zip_longest(reference, candidate, fillvalue="")
        for pos, (base_ch, run_ch) in enumerate(pairs):
            if base_ch == run_ch:
                continue
            window = slice(max(0, pos - 10), pos + 20)
            print(f"Run {run_idx}: first diff at pos {pos}")
            print(f"  Baseline: {repr(reference[window])}")
            print(f"  Run {run_idx}: {repr(candidate[window])}")
            break
|
|
|
|
|
|
def _assert_deterministic(llm, prompt, sp, runs=2):
    """Generate *runs* times and fail unless every run matches exactly.

    Both token IDs and decoded text are compared. On a mismatch, detailed
    token-level and logits-level diagnostics are printed before failing.
    Returns ``(text, token_ids)`` of the first run on success.
    """
    collected_hashes = []
    outputs = []
    for _ in range(runs):
        _reset_logits_hashes()  # truncate file before each run
        outputs.append(_generate_text(llm, prompt, sp))
        collected_hashes.append(_collect_logits_hashes())

    texts = [text for text, _ in outputs]
    token_ids = [ids for _, ids in outputs]

    if any(ids != token_ids[0] for ids in token_ids):
        _report_token_diff(token_ids, sp)
        _report_logits_diff(collected_hashes)
        pytest.fail("Token IDs differ across runs")

    if any(text != texts[0] for text in texts):
        _report_text_diff(texts)
        pytest.fail("Text outputs differ across runs")

    return texts[0], token_ids[0]
|
|
|
|
|
|
# ===================== Long sequence tests =====================
|
|
|
|
|
|
@pytest.mark.parametrize(
    "temp,seed",
    [
        (0.0, 100),
        (1.0, 200),
    ],
)
def test_deterministic_long_sequence(llm, temp, seed):
    """Long generation (512+ tokens) stays deterministic at various temperatures."""
    prompt = "Please describe the history of AI in detail, including major milestones and key technical breakthroughs."
    sampling = SamplingParams(temperature=temp, top_p=0.95, max_tokens=384, seed=seed)

    _, token_ids = _assert_deterministic(llm, prompt, sampling)
    assert len(token_ids) >= 100, f"Expected >= 100 tokens, got {len(token_ids)}"
|
|
|
|
|
|
def test_deterministic_long_prompt(llm):
    """Long input prompt (prefill-heavy) stays deterministic."""
    sentence = "This is a description about natural language processing. "
    prompt = sentence * 50 + "Please summarize the above."
    sampling = SamplingParams(temperature=0.5, max_tokens=100, seed=2024)

    _assert_deterministic(llm, prompt, sampling)
|
|
|
|
|
|
# ===================== Partition-kv aware tests =====================
|
|
|
|
|
|
def test_long_sequence_determinism_basic(llm):
    """
    Core long-sequence determinism check.

    The ~800-token prompt plus up to 512 generated tokens pushes the KV
    length past 1024, so the attention kernel takes the num_chunks > 1
    (partition_kv) path that this module exists to verify.
    """
    sampling = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=512, seed=170)
    _, token_ids = _assert_deterministic(llm, _LONG_PROMPT, sampling, runs=5)

    assert len(token_ids) >= 200, f"Expected >= 200 tokens, got {len(token_ids)}"
|
|
|
|
|
|
@pytest.mark.skipif(
    not _is_high_performance_gpu(),
    reason="Test only runs on GPUs with performance >= H800 (compute capability >= 9.0)",
)
@pytest.mark.parametrize(
    "max_tokens,min_expected,desc",
    [
        (400, 100, "~1200 total (~19 chunks)"),
        (1280, 200, "~2000 total (~32 chunks)"),
        (2200, 300, "~3000 total (~47 chunks)"),
    ],
    ids=["19_chunks", "32_chunks", "47_chunks"],
)
@pytest.mark.skip(reason="Skipping because the test takes a long time.")
def test_long_sequence_multiple_lengths(llm, max_tokens, min_expected, desc):
    """
    Determinism across sequence lengths that cross chunk boundaries.

    With FLAGS_max_partition_size=64 (chunk_size=64), each parametrized
    length yields a different chunk count. min_expected is deliberately
    conservative since the model may emit EOS early; the point of the
    test is determinism, not exact output length.
    """
    sampling = SamplingParams(
        temperature=0.7,
        top_p=0.95,
        max_tokens=max_tokens,
        seed=42,
    )
    _, token_ids = _assert_deterministic(llm, _LONG_PROMPT, sampling, runs=5)
    assert len(token_ids) >= min_expected, f"{desc}: expected >= {min_expected} tokens, got {len(token_ids)}"
|
|
|
|
|
|
def test_long_sequence_batch_invariance(llm):
    """
    Long-sequence output must not depend on batch position.

    The long prompt is first run alone (baseline) and then embedded at
    different positions inside mixed batches; its output must be
    identical in every configuration.
    """
    sampling = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=_MAX_TOKENS_LONG, seed=170)

    baseline_text, baseline_ids = _generate_text(llm, _LONG_PROMPT, sampling)

    filler = "What is machine learning?"
    batch_configs = [
        [_LONG_PROMPT, filler],
        [filler, _LONG_PROMPT],
        [filler, _LONG_PROMPT, filler],
    ]

    for cfg_idx, batch in enumerate(batch_configs):
        results = llm.generate(batch, sampling)
        pos = batch.index(_LONG_PROMPT)
        text = results[pos].outputs.text
        ids = list(results[pos].outputs.token_ids)

        assert text == baseline_text, f"Batch config {cfg_idx} (pos {pos}): text differs"
        assert ids == baseline_ids, f"Batch config {cfg_idx} (pos {pos}): token_ids differ"
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running this file directly (equivalent to `pytest -sv <this file>`).
    pytest.main(["-sv", __file__])