# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Long sequence determinism tests.

This test ensures that the deterministic mode works correctly for long
sequences that trigger the partition_kv code path (num_chunks > 1 when
KV length > 1024).

Key requirements:
1. Total KV length (prompt_tokens + max_tokens) must exceed 1024 to trigger partition_kv
2. Recommended: KV length >= 2048 to ensure num_chunks >= 2

Usage:
    CUDA_VISIBLE_DEVICES=0,1,2,3 pytest tests/e2e/4cards_cases/test_determinism_long.py -v
"""

import gc
import itertools
import os
from contextlib import contextmanager

import pytest

# paddle is optional at collection time; GPU capability checks degrade gracefully.
try:
    import paddle.device.cuda as _paddle_cuda
except Exception:
    _paddle_cuda = None

# Diagnostic hooks written by the worker process; optional as well.
try:
    from fastdeploy.logger.deterministic_logger import (
        _read_logits_md5_file,
        _reset_logits_md5_file,
    )
except Exception:
    _read_logits_md5_file = None
    _reset_logits_md5_file = None

pytestmark = pytest.mark.gpu

DEFAULT_MODEL_DIR = "./models"
MODEL_NAME = "Qwen2-7B-Instruct"


@contextmanager
def env_override(mapping):
    """Temporarily set env vars, restoring original values on exit."""
    old = {k: os.environ.get(k) for k in mapping}
    os.environ.update(mapping)
    try:
        yield
    finally:
        for k, v in old.items():
            if v is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = v


@pytest.fixture(scope="module")
def model_path():
    """Resolve the model directory from MODEL_PATH (defaults to ./models)."""
    model_dir = os.getenv("MODEL_PATH", DEFAULT_MODEL_DIR)
    return os.path.join(model_dir, MODEL_NAME)


@pytest.fixture(autouse=True)
def _reset_deterministic_mode():
    """Ensure every test starts with deterministic mode ON."""
    os.environ["FD_DETERMINISTIC_MODE"] = "1"
    yield
    # Re-assert after the test as well, in case the test body flipped it.
    os.environ["FD_DETERMINISTIC_MODE"] = "1"


def _is_high_performance_gpu():
    """
    Check if current GPU has performance >= H800.

    Uses compute capability as proxy for performance.
    H800 has compute capability 9.0, so GPUs with 9.0 or higher
    are considered high performance.
    """
    if _paddle_cuda is None:
        return False
    try:
        props = _paddle_cuda.get_device_properties(0)
        # Compute capability comparison
        # H800: 9.0, H100: 9.0, H200: 9.0+, B100/B200: 10.0
        # Consider GPUs with compute capability >= 9.0 as high performance.
        # Compare (major, minor) as an integer tuple — exact, and safe even
        # for minor versions >= 10 (float arithmetic would misorder those).
        return (props.major, props.minor) >= (9, 0)
    except Exception:
        return False


# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Use smallest chunk_size (64) to maximize num_chunks and increase
# sensitivity to partition_kv non-determinism. With chunk_size=64:
# - 1200 tokens -> 19 chunks (vs 2 chunks with default 1024)
# - More chunks = more merge operations = easier to detect non-determinism
_CHUNK_SIZE_FOR_TEST = "64"

# Long prompt to ensure KV length > 1024 (triggers partition_kv path)
# This sentence is ~20 tokens, repeated 40 times = ~800 tokens
_BASE_SENTENCE = (
    "Artificial intelligence has transformed various industries including healthcare, "
    "finance, transportation, and education through machine learning algorithms. "
)
_LONG_PROMPT = _BASE_SENTENCE * 40 + (
    "Based on the above context about AI, please provide a detailed analysis of "
    "the future trends and potential challenges in AI development."
)

# With ~800 token prompt + 512 max_tokens, total KV length ~1312 > 1024
# This ensures num_chunks >= 2, triggering the partition_kv code path
_MAX_TOKENS_LONG = 512


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


@pytest.fixture(scope="module", autouse=True)
def _module_env():
    """Set env vars BEFORE importing fastdeploy (must happen first)."""
    with env_override(
        {
            "CUDA_VISIBLE_DEVICES": os.environ.get("CUDA_VISIBLE_DEVICES", "0,1,2,3"),
            "FD_DETERMINISTIC_MODE": "1",
            "FD_CUSTOM_AR_MAX_SIZE_MB": os.environ.get("FD_CUSTOM_AR_MAX_SIZE_MB", "64"),
            "FLAGS_max_partition_size": _CHUNK_SIZE_FOR_TEST,
        }
    ):
        # Lazy import: env vars must be set before importing fastdeploy
        global LLM, SamplingParams  # noqa: PLW0603
        from fastdeploy import LLM, SamplingParams

        yield


@pytest.fixture(scope="module")
def llm(model_path, _module_env):
    """Build one shared LLM instance per module; torn down after all tests."""
    instance = LLM(
        model=model_path,
        tensor_parallel_size=int(os.getenv("TP_SIZE", "4")),
        max_model_len=8192,
        enable_prefix_caching=False,
        graph_optimization_config={"use_cudagraph": os.getenv("USE_CUDAGRAPH", "0") == "1"},
    )
    yield instance
    del instance
    gc.collect()


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _generate_text(llm, prompt, sp):
    """Generate once, return (text, token_ids)."""
    out = llm.generate([prompt], sp)[0]
    return out.outputs.text, list(out.outputs.token_ids)


def _collect_logits_hashes():
    """Read and clear the per-step logits MD5 hashes written by the worker process."""
    if _read_logits_md5_file is None:
        return []
    try:
        return _read_logits_md5_file()
    except Exception:
        return []


def _reset_logits_hashes():
    """Reset the logits MD5 hash file before a new generate run."""
    if _reset_logits_md5_file is None:
        return
    try:
        _reset_logits_md5_file()
    except Exception:
        pass


def _report_logits_diff(hashes_list):
    """Compare logits hashes between runs and report first divergence."""
    if len(hashes_list) < 2 or not hashes_list[0]:
        print("[DIAG-LOGITS] No logits hashes collected (FD_DETERMINISTIC_LOG_MODE=1 ?)")
        return
    baseline = hashes_list[0]
    for run_idx, hashes in enumerate(hashes_list[1:], start=1):
        min_len = min(len(baseline), len(hashes))
        # 1) Logits divergence -> non-determinism is in the model forward pass.
        for step in range(min_len):
            if baseline[step]["logits_md5"] != hashes[step]["logits_md5"]:
                print(f"[DIAG-LOGITS] Run {run_idx}: LOGITS FIRST DIFFER at step {step}")
                print(
                    f"[DIAG-LOGITS] baseline logits_md5={baseline[step]['logits_md5']}, "
                    f"probs_md5={baseline[step]['probs_md5']}"
                )
                print(
                    f"[DIAG-LOGITS] run_{run_idx} logits_md5={hashes[step]['logits_md5']}, "
                    f"probs_md5={hashes[step]['probs_md5']}"
                )
                print("[DIAG-LOGITS] -> Non-determinism is in MODEL COMPUTATION (not sampling)")
                return
        # 2) Same logits but different number of steps.
        if len(baseline) != len(hashes):
            print(
                f"[DIAG-LOGITS] Run {run_idx}: All logits identical "
                f"but length differs ({len(baseline)} vs {len(hashes)})"
            )
            return
        # 3) Logits identical; check probabilities (softmax / penalty stage).
        for step in range(min_len):
            if baseline[step]["probs_md5"] != hashes[step]["probs_md5"]:
                print(f"[DIAG-LOGITS] Run {run_idx}: logits identical but PROBS DIFFER at step {step}")
                print("[DIAG-LOGITS] -> Non-determinism is in SOFTMAX/PENALTY (not model)")
                return
        # 4) Everything upstream matches -> blame the sampling operator.
        print(f"[DIAG-LOGITS] Run {run_idx}: ALL logits AND probs IDENTICAL across {min_len} steps")
        print("[DIAG-LOGITS] -> Non-determinism is in SAMPLING OPERATOR")


def _report_token_diff(token_ids_list, sp=None):
    """Report detailed token-level diff to diagnose determinism issues."""
    print("\n" + "=" * 70)
    print("[DIAG] Token-level determinism diagnosis")
    print("=" * 70)
    if sp is not None:
        print(f"[DIAG] SamplingParams: temperature={sp.temperature}, seed={sp.seed}, top_p={sp.top_p}")
    for i, tids in enumerate(token_ids_list):
        print(f"[DIAG] Run {i}: {len(tids)} tokens, first 10: {tids[:10]}")
    baseline = token_ids_list[0]
    for i, tids in enumerate(token_ids_list[1:], start=1):
        if tids == baseline:
            print(f"[DIAG] Run {i}: IDENTICAL to baseline")
            continue
        min_len = min(len(baseline), len(tids))
        for j in range(min_len):
            if baseline[j] != tids[j]:
                print(f"[DIAG] Run {i}: FIRST DIVERGENCE at token position {j}")
                print(f"[DIAG] baseline[{j}] = {baseline[j]}")
                print(f"[DIAG] run_{i}[{j}] = {tids[j]}")
                # Show a small context window around the first divergence.
                start = max(0, j - 3)
                end = min(min_len, j + 4)
                print(f"[DIAG] baseline[{start}:{end}] = {baseline[start:end]}")
                print(f"[DIAG] run_{i}[{start}:{end}] = {tids[start:end]}")
                total_diff = sum(1 for a, b in zip(baseline[:min_len], tids[:min_len]) if a != b)
                print(f"[DIAG] Total differing tokens (in shared range): {total_diff}/{min_len}")
                break
        if len(baseline) != len(tids):
            print(f"[DIAG] Length differs: baseline={len(baseline)}, run_{i}={len(tids)}")
    print("=" * 70 + "\n")


def _report_text_diff(texts):
    """Report detailed diff when texts differ."""
    for i, text in enumerate(texts[1:], start=1):
        if text != texts[0]:
            if len(text) != len(texts[0]):
                print(f"Run {i}: length differs (baseline={len(texts[0])}, got={len(text)})")
            for j, (c1, c2) in enumerate(itertools.zip_longest(texts[0], text, fillvalue="")):
                if c1 != c2:
                    print(f"Run {i}: first diff at pos {j}")
                    print(f" Baseline: {repr(texts[0][max(0, j-10):j+20])}")
                    print(f" Run {i}: {repr(text[max(0, j-10):j+20])}")
                    break


def _assert_deterministic(llm, prompt, sp, runs=2):
    """Run *runs* times and assert all outputs are identical (text AND token_ids)."""
    all_hashes = []
    results = []
    for _ in range(runs):
        _reset_logits_hashes()  # truncate file before each run
        results.append(_generate_text(llm, prompt, sp))
        all_hashes.append(_collect_logits_hashes())
    texts = [r[0] for r in results]
    token_ids = [r[1] for r in results]
    if not all(t == token_ids[0] for t in token_ids):
        _report_token_diff(token_ids, sp)
        _report_logits_diff(all_hashes)
        pytest.fail("Token IDs differ across runs")
    if not all(t == texts[0] for t in texts):
        _report_text_diff(texts)
        pytest.fail("Text outputs differ across runs")
    return texts[0], token_ids[0]


# ===================== Long sequence tests =====================


@pytest.mark.parametrize(
    "temp,seed",
    [
        (0.0, 100),
        (1.0, 200),
    ],
)
def test_deterministic_long_sequence(llm, temp, seed):
    """Long generation (512+ tokens) stays deterministic at various temperatures."""
    prompt = "Please describe the history of AI in detail, including major milestones and key technical breakthroughs."
    sp = SamplingParams(temperature=temp, top_p=0.95, max_tokens=384, seed=seed)
    text, token_ids = _assert_deterministic(llm, prompt, sp)
    assert len(token_ids) >= 100, f"Expected >= 100 tokens, got {len(token_ids)}"


def test_deterministic_long_prompt(llm):
    """Long input prompt (prefill-heavy) stays deterministic."""
    base = "This is a description about natural language processing. "
    long_prompt = (base * 50) + "Please summarize the above."
    sp = SamplingParams(temperature=0.5, max_tokens=100, seed=2024)
    _assert_deterministic(llm, long_prompt, sp)


# ===================== Partition-kv aware tests =====================


def test_long_sequence_determinism_basic(llm):
    """
    Basic long sequence test: KV length > 2048 to trigger partition_kv.

    This is the core test that verifies the deterministic mode fix works
    for long sequences that would normally trigger num_chunks > 1.
    """
    sp = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=512, seed=170)
    _, token_ids = _assert_deterministic(llm, _LONG_PROMPT, sp, runs=5)
    assert len(token_ids) >= 200, f"Expected >= 200 tokens, got {len(token_ids)}"


@pytest.mark.skipif(
    not _is_high_performance_gpu(),
    reason="Test only runs on GPUs with performance >= H800 (compute capability >= 9.0)",
)
@pytest.mark.parametrize(
    "max_tokens,min_expected,desc",
    [
        (400, 100, "~1200 total (~19 chunks)"),
        (1280, 200, "~2000 total (~32 chunks)"),
        (2200, 300, "~3000 total (~47 chunks)"),
    ],
    ids=["19_chunks", "32_chunks", "47_chunks"],
)
@pytest.mark.skip(reason="Skipping because the test takes a long time.")
def test_long_sequence_multiple_lengths(llm, max_tokens, min_expected, desc):
    """
    Test determinism across sequence lengths that cross the chunk boundary.

    With FLAGS_max_partition_size=64 (chunk_size=64), we test various chunk counts.

    Note: min_expected is set conservatively because the model may stop early
    due to EOS. The key test is determinism, not exact token count.
    """
    sp = SamplingParams(
        temperature=0.7,
        top_p=0.95,
        max_tokens=max_tokens,
        seed=42,
    )
    _, token_ids = _assert_deterministic(llm, _LONG_PROMPT, sp, runs=5)
    assert len(token_ids) >= min_expected, f"{desc}: expected >= {min_expected} tokens, got {len(token_ids)}"


def test_long_sequence_batch_invariance(llm):
    """
    Long sequence output should be identical regardless of batch position.

    This tests that the partition_kv fix maintains batch invariance.
    """
    sp = SamplingParams(temperature=0.7, top_p=0.95, max_tokens=_MAX_TOKENS_LONG, seed=170)
    baseline_text, baseline_ids = _generate_text(llm, _LONG_PROMPT, sp)

    filler = "What is machine learning?"
    batch_configs = [
        [_LONG_PROMPT, filler],
        [filler, _LONG_PROMPT],
        [filler, _LONG_PROMPT, filler],
    ]
    for i, batch in enumerate(batch_configs):
        outputs = llm.generate(batch, sp)
        idx = batch.index(_LONG_PROMPT)
        result_text = outputs[idx].outputs.text
        result_ids = list(outputs[idx].outputs.token_ids)
        assert result_text == baseline_text, f"Batch config {i} (pos {idx}): text differs"
        assert result_ids == baseline_ids, f"Batch config {i} (pos {idx}): token_ids differ"


if __name__ == "__main__":
    pytest.main(["-sv", __file__])