# Source: mirror of https://github.com/PaddlePaddle/FastDeploy.git
# (synced 2026-04-24 01:29:57 +08:00)
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
|
|
Determinism offline inference tests using LLM.generate
|
|
|
|
Test scenarios:
|
|
1. Same-prompt repeatability (FD_DETERMINISTIC_MODE=1)
|
|
2. Different batch sizes consistency
|
|
3. Sampling-parameter combinations (temperature x top_p, parametrized)
|
|
4. Minimal output (max_tokens=1, early stop)
|
|
5. Special characters & multi-language prompts
|
|
6. Multi-turn conversation
|
|
7. State isolation (interleaved / interference prompts)
|
|
8. Non-deterministic validation (proves tests are effective)
|
|
|
|
Long sequence / long prompt / batch invariance tests are in test_determinism_long.py.
|
|
|
|
Usage:
|
|
CUDA_VISIBLE_DEVICES=0,1,2,3 pytest tests/e2e/4cards_cases/test_determinism_offline.py -v
|
|
"""
|
|
|
|
import os
from contextlib import contextmanager

import pytest

# Every test in this module requires GPUs.
pytestmark = pytest.mark.gpu

# Base directory holding model checkpoints; overridable via MODEL_PATH env var.
DEFAULT_MODEL_DIR = "./models"
# Model under test.
MODEL_NAME = "Qwen2-7B-Instruct"
|
@contextmanager
def env_override(mapping):
    """Temporarily set env vars, restoring original values on exit.

    Args:
        mapping: dict of env-var name -> value to set for the duration of
            the ``with`` block.

    Yields:
        None. On exit (normal or via exception) every variable named in
        *mapping* is restored to its prior value, or removed if it was unset.
    """
    old = {k: os.environ.get(k) for k in mapping}
    try:
        # Apply inside the ``try`` so that a failure partway through the
        # update still restores whatever was already changed.
        os.environ.update(mapping)
        yield
    finally:
        for k, v in old.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v
|
@pytest.fixture(scope="module")
def model_path():
    """Return the checkpoint directory for the model under test."""
    base_dir = os.getenv("MODEL_PATH", DEFAULT_MODEL_DIR)
    return os.path.join(base_dir, MODEL_NAME)
|
@pytest.fixture(autouse=True)
def _reset_deterministic_mode():
    """Ensure every test starts with deterministic mode ON."""
    os.environ["FD_DETERMINISTIC_MODE"] = "1"
    yield
    # Re-enable after each test in case the test body disabled it.
    os.environ["FD_DETERMINISTIC_MODE"] = "1"


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
|
@pytest.fixture(scope="module", autouse=True)
def _module_env():
    """Set env vars before importing fastdeploy (must happen first)."""
    overrides = {
        "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", "0"),
        "FD_DETERMINISTIC_MODE": "1",
        "FD_CUSTOM_AR_MAX_SIZE_MB": "64",
    }
    with env_override(overrides):
        # Lazy import: env vars must be set before importing fastdeploy
        global LLM, SamplingParams  # noqa: PLW0603
        from fastdeploy import LLM, SamplingParams

        yield
|
@pytest.fixture(scope="module")
def llm(model_path, _module_env):
    """Build the module-scoped 4-way tensor-parallel LLM under test."""
    use_cudagraph = os.getenv("USE_CUDAGRAPH", "0") == "1"
    return LLM(
        model=model_path,
        tensor_parallel_size=4,
        max_model_len=8192,
        enable_prefix_caching=False,
        graph_optimization_config={"use_cudagraph": use_cudagraph},
    )


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
|
def _generate_text(llm, prompt, sp):
|
|
"""Generate once, return (text, token_ids)."""
|
|
out = llm.generate([prompt], sp)[0]
|
|
return out.outputs.text, list(out.outputs.token_ids)
|
|
|
|
|
|
def _assert_deterministic(llm, prompt, sp, runs=2):
    """Run *runs* times and assert all outputs are identical."""
    texts = []
    token_ids = []
    for _ in range(runs):
        text, ids = _generate_text(llm, prompt, sp)
        texts.append(text)
        token_ids.append(ids)
    assert all(t == texts[0] for t in texts), "Text outputs differ across runs"
    assert all(t == token_ids[0] for t in token_ids), "Token IDs differ across runs"
    return texts[0], token_ids[0]


# ===================== Core determinism tests =====================
|
def test_deterministic_same_prompt(llm):
    """Same prompt + same seed produces identical output across 5 runs."""
    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50, seed=123)
    _assert_deterministic(llm, "Please introduce artificial intelligence in one sentence.", params, runs=5)
|
def test_deterministic_different_batch_sizes(llm):
    """Same prompt is consistent across batch sizes 1 / 2 / 4 / 8."""
    prompt = "What is machine learning?"
    params = SamplingParams(temperature=0.5, max_tokens=30, seed=789)

    # Batch size 1 establishes the reference output.
    baseline, _ = _generate_text(llm, prompt, params)

    for bs in (2, 4, 8):
        batch_out = llm.generate([prompt] * bs, params)
        assert batch_out[0].outputs.text == baseline, f"Batch size {bs} differs from bs=1"
|
def test_deterministic_batch_invariance(llm):
    """Target prompt produces identical output regardless of batch position."""
    prompt = "What kind of programming language is Python?"
    params = SamplingParams(temperature=0.5, max_tokens=40, seed=456)

    # Single-request reference output.
    baseline, _ = _generate_text(llm, prompt, params)

    # Place the target prompt at different positions among filler prompts.
    batch_configs = [
        [prompt, "Filler question 1"],
        ["Filler question 2", prompt, "Filler question 3"],
        ["Filler question 4", "Filler question 5", prompt],
        ["Filler 6", "Filler 7", "Filler 8", prompt],
    ]

    for i, batch in enumerate(batch_configs):
        outputs = llm.generate(batch, params)
        idx = batch.index(prompt)
        got = outputs[idx].outputs.text
        assert got == baseline, f"Batch config {i} (pos {idx}): result differs from single-request baseline"


# ===================== Sampling-parameter combinations =====================
|
@pytest.mark.parametrize(
    "temp,top_p,seed",
    [
        (0.0, 1.0, 300),  # greedy, no top_p filter
        (0.0, 0.0, 301),  # double-greedy
        (0.3, 0.9, 302),  # low temp, moderate top_p
        (0.8, 0.0, 303),  # medium temp, greedy top_p
        (0.8, 1.0, 304),  # medium temp, no top_p filter
        (0.8, 0.5, 305),  # medium temp, strict top_p
        (1.0, 0.95, 306),  # high temp
        (1.5, 0.9, 307),  # very high temp
    ],
)
def test_deterministic_param_combos(llm, temp, top_p, seed):
    """Determinism holds across various (temperature, top_p) combinations."""
    params = SamplingParams(temperature=temp, top_p=top_p, max_tokens=30, seed=seed)
    _assert_deterministic(llm, "What is a neural network?", params)


# ===================== Minimal / boundary output tests =====================
|
def test_deterministic_max_tokens_one(llm):
    """Single-token output is deterministic."""
    params = SamplingParams(temperature=0.1, max_tokens=1, seed=700)

    # Only token IDs matter here; the text itself is unused.
    _, token_ids = _assert_deterministic(llm, "What color is the sky?", params)
    assert len(token_ids) == 1, f"Expected 1 token, got {len(token_ids)}"
|
def test_deterministic_early_stop(llm):
    """Early stopping via stop sequences is deterministic."""
    params = SamplingParams(temperature=0.7, max_tokens=100, stop=["\u3002", "."], seed=800)

    _, token_ids = _assert_deterministic(llm, "Please list three colors:", params)
    # A stop sequence should trigger before the max_tokens budget is used.
    assert len(token_ids) < 100, f"Expected early stop, got {len(token_ids)} tokens"


# ===================== Special input tests =====================
|
@pytest.mark.parametrize(
    "prompt,seed",
    [
        ("What is AI? \U0001f52c\U0001f9e0", 900),  # emoji
        ("Math: E = mc\u00b2", 901),  # superscript
        ("Code: def hello(): return 'world'", 902),  # code
        ("Symbols: @#$%^&*()", 903),  # special symbols
    ],
)
def test_deterministic_special_chars(llm, prompt, seed):
    """Prompts with special characters remain deterministic."""
    params = SamplingParams(temperature=0.5, max_tokens=30, seed=seed)
    _assert_deterministic(llm, prompt, params)
|
|
@pytest.mark.parametrize(
    "lang,prompt,seed",
    [
        ("Chinese", "Please introduce artificial intelligence in one sentence.", 1000),
        ("English", "What is artificial intelligence in one sentence?", 1001),
        (
            "Japanese",
            "\u4eba\u5de5\u77e5\u80fd\u306b\u3064\u3044\u3066\u4e00\u8a00\u3067\u8aac\u660e\u3057\u3066\u304f\u3060\u3055\u3044\u3002",
            1002,
        ),
        ("Spanish", "\u00bfQu\u00e9 es la inteligencia artificial en una frase?", 1003),
    ],
)
def test_deterministic_multi_language(llm, lang, prompt, seed):
    """Prompts in different languages remain deterministic."""
    params = SamplingParams(temperature=0.5, max_tokens=30, seed=seed)
    _assert_deterministic(llm, prompt, params)


# ===================== Multi-turn conversation test =====================
|
def test_deterministic_multi_turn(llm):
    """Multi-turn chat maintains determinism."""
    params = SamplingParams(temperature=0.5, max_tokens=50, seed=1100)

    base_messages = [
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hi! How can I help you?"},
        {"role": "user", "content": "Please introduce yourself."},
    ]

    def run_conversation():
        # Turn 1, then feed the reply back as context for turn 2.
        turn1 = llm.chat(base_messages, params)[0].outputs.text
        follow_up = base_messages + [
            {"role": "assistant", "content": turn1},
            {"role": "user", "content": "What can you do?"},
        ]
        turn2 = llm.chat(follow_up, params)[0].outputs.text
        return turn1, turn2

    # Run the identical two-turn conversation twice with the same seed.
    r1_turn1, r1_turn2 = run_conversation()
    r2_turn1, r2_turn2 = run_conversation()

    assert r1_turn1 == r2_turn1, "Multi-turn: turn-1 outputs differ"
    assert r1_turn2 == r2_turn2, "Multi-turn: turn-2 outputs differ"


# ===================== State isolation test =====================
|
def test_deterministic_state_isolation(llm):
    """Interference prompts and interleaving do not break determinism."""
    prompt_a = "What is Python?"
    prompt_b = "What is JavaScript?"
    params_a = SamplingParams(temperature=0.5, max_tokens=30, seed=1200)
    params_b = SamplingParams(temperature=0.5, max_tokens=30, seed=1201)

    # Round 1
    a1, _ = _generate_text(llm, prompt_a, params_a)
    b1, _ = _generate_text(llm, prompt_b, params_b)

    # Run unrelated interference
    interference = ["Explain reinforcement learning.", "What is NLP?", "List 3 fruits."]
    for p in interference:
        llm.generate([p], SamplingParams(temperature=0.7, max_tokens=20, seed=999))

    # Round 2
    a2, _ = _generate_text(llm, prompt_a, params_a)
    b2, _ = _generate_text(llm, prompt_b, params_b)

    assert a1 == a2, "Prompt A: output changed after interference"
    assert b1 == b2, "Prompt B: output changed after interference"


# ===================== Non-deterministic validation =====================
|
def test_non_deterministic_validation(llm):
    """
    Prove that tests are effective:
    - Without seed + without mode: outputs vary
    - With explicit seed: outputs are consistent
    """
    prompt = "Please explain deep learning in one sentence."

    # Part 1: no mode, no seed -> outputs should differ
    os.environ.pop("FD_DETERMINISTIC_MODE", None)
    unseeded = []
    for _ in range(5):
        params = SamplingParams(temperature=0.7, max_tokens=30)
        unseeded.append(llm.generate([prompt], params)[0].outputs.text)

    # Probabilistic, skip if all outputs are the same
    if len(set(unseeded)) == 1:
        pytest.skip("Sampling produced identical outputs (probabilistic case)")

    # Part 2: explicit seed -> outputs must be consistent
    seeded_params = SamplingParams(temperature=0.7, max_tokens=30, seed=999)
    seeded = [llm.generate([prompt], seeded_params)[0].outputs.text for _ in range(5)]
    assert len(set(seeded)) == 1, "With explicit seed: expected consistent outputs"
|
|
if __name__ == "__main__":
    # Allow running this file directly without invoking pytest externally.
    pytest.main(["-sv", __file__])