mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Feature] Add Deterministic Inference Support (#6476)
* add * [tests] Add Paddle attention determinism tests and refactor resource manager Add comprehensive determinism tests for Paddle attention layer and refactor resource manager for deterministic mode support. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * add * add * add * add * add more * add more * fixsome * fixsome * fix bugs * fix bugs * only in gpu * add docs * fix comments * fix some * fix some * fix comments * add more * fix potential problem * remove not need * remove not need * remove no need * fix bug * fix bugs * fix comments * fix comments * Update tests/ce/deterministic/test_determinism_verification.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/inter_communicator/test_ipc_signal.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/layers/test_paddle_attention_determinism.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/engine/test_sampling_params_determinism.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/layers/test_paddle_attention_determinism.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/layers/test_paddle_attention_determinism_standalone.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * fix comments * fix import error * fix a bug * fix bugs * fix bugs * fix coverage * refine codes * refine code * fix comments * fix comments * fix comments * rm not need * fix allreduce large tensor bug * mv log files * mv log files * add files --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -0,0 +1,472 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Chunked Prefill Determinism Tests
|
||||
|
||||
Test _get_num_new_tokens alignment behavior in ResourceManagerV1:
|
||||
1. Deterministic disabled (no alignment)
|
||||
2. Deterministic enabled (split_kv_size boundary alignment)
|
||||
3. Boundary cases
|
||||
4. Continuous chunk consistency
|
||||
5. Multimodal inputs (image / video / audio)
|
||||
6. Real batch scheduling scenarios
|
||||
7. Corner cases (empty request, invalid state, large split, dynamic switch, etc.)
|
||||
"""
|
||||
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from fastdeploy.engine.request import Request
|
||||
from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Minimal config stubs -- only fields accessed by ResourceManagerV1.__init__
|
||||
# and _get_num_new_tokens are kept.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class ModelConfig:
    """Stub model config: only the fields ResourceManagerV1 reads."""

    def __init__(self):
        # Text-only, causal-attention model by default.
        self.causal = True
        self.enable_mm = False
|
||||
|
||||
|
||||
class CacheConfig:
    """Stub cache config with generous block counts so tests never stall on allocation."""

    def __init__(self):
        # KV-cache geometry.
        self.block_size = 16
        self.total_block_num = 10000
        self.prefill_kvcache_block_num = 10000
        self.num_cpu_blocks = 0
        # Optional features / backends are disabled for these unit tests.
        self.enable_prefix_caching = False
        self.kvcache_storage_backend = None
        self.write_policy = None
        self.max_encoder_cache = 0
        self.max_processor_cache = 0
        # Per-token KV byte cost; expression kept verbatim from the original sizing.
        self.bytes_per_token_per_layer = 32 * 32 * 128 * 2
|
||||
|
||||
|
||||
class ParallelConfig:
    """Stub parallel config: single tensor-parallel rank, no worker queue port."""

    def __init__(self):
        self.tensor_parallel_size = 1
        self.local_engine_worker_queue_port = None
|
||||
|
||||
|
||||
class SpeculativeConfig:
    """Stub speculative-decoding config: speculation fully disabled."""

    def __init__(self):
        self.method = None
        self.model_type = None
        self.num_speculative_tokens = 0
|
||||
|
||||
|
||||
class StubConfig:
    """Assembles the minimal sub-configs needed by ResourceManagerV1."""

    def __init__(self):
        # Each attribute is a lightweight stub class defined above.
        for attr_name, factory in (
            ("model_config", ModelConfig),
            ("cache_config", CacheConfig),
            ("parallel_config", ParallelConfig),
            ("speculative_config", SpeculativeConfig),
        ):
            setattr(self, attr_name, factory())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _create_request(request_id, prompt_token_ids, num_computed_tokens=0, multimodal_inputs=None):
    """Build a real Request carrying the given prompt and prefill progress."""
    request_kwargs = {
        "request_id": request_id,
        "prompt_token_ids": prompt_token_ids,
        # Length is derived here so callers never pass an inconsistent value.
        "prompt_token_ids_len": len(prompt_token_ids),
        "num_computed_tokens": num_computed_tokens,
        "multimodal_inputs": multimodal_inputs,
    }
    return Request(**request_kwargs)
|
||||
|
||||
|
||||
def _build_mm_inputs(prompt_len, text_len, modal_id, extra=None):
|
||||
"""Build a multimodal_inputs dict for a single-modality request."""
|
||||
mm_len = prompt_len - text_len
|
||||
patch_idx_val = modal_id # 1=image, 2=video, 3=audio
|
||||
inputs = {
|
||||
"image_patch_id": prompt_len + 1,
|
||||
"image_end_id": prompt_len + 2,
|
||||
"video_patch_id": prompt_len + 3,
|
||||
"video_end_id": prompt_len + 4,
|
||||
"audio_patch_id": prompt_len + 5,
|
||||
"audio_end_id": prompt_len + 6,
|
||||
"patch_idx": [0] * text_len + [patch_idx_val] * mm_len,
|
||||
"patch_map": [
|
||||
{"modal_id": 0, "end_idx": text_len, "image_num": 0, "video_num": 0},
|
||||
{
|
||||
"modal_id": modal_id,
|
||||
"end_idx": prompt_len,
|
||||
"image_num": 1 if modal_id == 1 else 0,
|
||||
"video_num": 1 if modal_id == 2 else 0,
|
||||
},
|
||||
],
|
||||
"tts": False,
|
||||
}
|
||||
if extra:
|
||||
inputs.update(extra)
|
||||
return inputs
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test class
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestChunkedPrefillDeterminism(unittest.TestCase):
    """Test _get_num_new_tokens alignment in deterministic mode.

    NOTE(review): these tests assume ResourceManagerV1._get_num_new_tokens
    reads FD_DETERMINISTIC_MODE / FD_DETERMINISTIC_SPLIT_KV_SIZE from the
    environment on each call (not cached at construction) — the manager is
    built once in setUp and the env is flipped per test. Confirm against
    the ResourceManagerV1 implementation.
    """

    def setUp(self):
        # Snapshot the deterministic-mode env vars so tearDown can restore
        # the process environment exactly, even across failing tests.
        self._saved_env = {}
        for key in ("FD_DETERMINISTIC_MODE", "FD_DETERMINISTIC_SPLIT_KV_SIZE"):
            self._saved_env[key] = os.environ.get(key)
        self.config = StubConfig()
        self.rm = self._create_resource_manager(self.config)

    def tearDown(self):
        # Restore each saved var; None means it was originally unset.
        for key, value in self._saved_env.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value

    # -- env helpers --

    def _enable_deterministic(self, split_kv_size=16):
        # Turn on deterministic mode with the given split-KV chunk size.
        os.environ["FD_DETERMINISTIC_MODE"] = "1"
        os.environ["FD_DETERMINISTIC_SPLIT_KV_SIZE"] = str(split_kv_size)

    def _disable_deterministic(self):
        # Remove both env vars so the manager falls back to default behavior.
        os.environ.pop("FD_DETERMINISTIC_MODE", None)
        os.environ.pop("FD_DETERMINISTIC_SPLIT_KV_SIZE", None)

    def _create_resource_manager(self, config):
        # Minimal single-rank, mixed-role manager backed by the stub config.
        return ResourceManagerV1(
            max_num_seqs=32,
            config=config,
            tensor_parallel_size=1,
            splitwise_role="mixed",
            local_data_parallel_id=0,
        )

    def _create_mm_resource_manager(self):
        # Fresh config with multimodal enabled; the setUp manager stays text-only.
        config = StubConfig()
        config.model_config.enable_mm = True
        return self._create_resource_manager(config)

    # ==================== 1. Deterministic disabled ====================

    def test_get_num_new_tokens_deterministic_disabled(self):
        """No alignment when deterministic mode is off; budget=0 returns 0."""
        self._disable_deterministic()

        test_cases = [
            # (prompt_tokens, num_computed, token_budget, expected)
            (list(range(100)), 0, 50, 50),
            (list(range(100)), 50, 30, 30),
            # Only 10 tokens remain, so the budget of 20 is capped.
            (list(range(100)), 90, 20, 10),
            (list(range(32)), 0, 15, 15),
            # budget=0 -> 0
            (list(range(100)), 0, 0, 0),
        ]
        for prompt_ids, num_computed, budget, expected in test_cases:
            with self.subTest(prompt_len=len(prompt_ids), computed=num_computed, budget=budget):
                req = _create_request("req", prompt_ids, num_computed)
                result = self.rm._get_num_new_tokens(req, budget)
                self.assertEqual(result, expected)

    # ==================== 2. Deterministic enabled alignment ====================

    def test_get_num_new_tokens_deterministic_enabled_alignment(self):
        """Results must align to split_kv_size boundary."""
        split_kv_size = 16
        self._enable_deterministic(split_kv_size)

        test_cases = [
            # (prompt_tokens, num_computed, token_budget, expected)
            # Expected values are the budget rounded down so that
            # num_computed + result lands on a split_kv_size multiple.
            (list(range(100)), 0, 20, 16),
            (list(range(100)), 0, 32, 32),
            (list(range(100)), 0, 40, 32),
            (list(range(100)), 0, 50, 48),
            (list(range(100)), 8, 20, 8),
            (list(range(100)), 8, 30, 24),
            (list(range(100)), 16, 20, 16),
            (list(range(100)), 16, 25, 16),
        ]
        for prompt_ids, num_computed, budget, expected in test_cases:
            with self.subTest(computed=num_computed, budget=budget):
                req = _create_request("req", prompt_ids, num_computed)
                result = self.rm._get_num_new_tokens(req, budget)
                self.assertEqual(result, expected)
                # Verify alignment
                if result > 0:
                    final_pos = num_computed + result
                    self.assertEqual(final_pos % split_kv_size, 0)

    # ==================== 3. Boundary cases ====================

    def test_get_num_new_tokens_boundary_cases(self):
        """Boundary conditions including large budget."""
        split_kv_size = 16
        self._enable_deterministic(split_kv_size)

        test_cases = [
            (list(range(100)), 0, 5, "budget < split_kv_size, start at 0"),
            (list(range(100)), 0, 1, "budget = 1, start at 0"),
            (list(range(100)), 10, 5, "budget < split_kv_size, start at 10"),
            (list(range(100)), 15, 5, "budget < split_kv_size, near boundary"),
            (list(range(16)), 0, 16, "exactly split_kv_size tokens needed"),
            (list(range(16)), 0, 32, "budget > needed"),
            # Very large budget (overflow guard)
            (list(range(100)), 0, 1000000, "very large budget"),
        ]
        # Only invariants are checked here (0 <= result <= remaining/budget),
        # since exact values depend on the alignment policy per case.
        for prompt_ids, num_computed, budget, desc in test_cases:
            with self.subTest(desc=desc):
                req = _create_request("req", prompt_ids, num_computed)
                result = self.rm._get_num_new_tokens(req, budget)
                max_possible = min(len(prompt_ids) - num_computed, budget)
                self.assertGreaterEqual(result, 0)
                self.assertLessEqual(result, max_possible)

    # ==================== 4. Chunk consistency ====================

    def test_get_num_new_tokens_consistency_across_chunks(self):
        """All chunk boundaries must align to split_kv_size."""
        split_kv_size = 16
        self._enable_deterministic(split_kv_size)

        # 112 = 7 * 16, so the prompt length itself is boundary-aligned.
        prompt_ids = list(range(112))
        budget = 50
        num_computed = 0
        chunk_sizes = []

        # Simulate sequential prefill: each iteration consumes one chunk.
        while num_computed < len(prompt_ids):
            req = _create_request("req", prompt_ids, num_computed)
            result = self.rm._get_num_new_tokens(req, budget)
            if result == 0:
                break
            chunk_sizes.append(result)
            num_computed += result

        # Every intermediate boundary must be aligned; final position may equal seq length
        position = 0
        for chunk_size in chunk_sizes:
            position += chunk_size
            is_ok = (position % split_kv_size == 0) or (position == len(prompt_ids))
            self.assertTrue(is_ok, f"position {position} not aligned to {split_kv_size}")

        self.assertEqual(num_computed, len(prompt_ids))

    # ==================== 5. Multimodal (parameterized) ====================

    # Each case describes a single-modality prompt; "extra" carries optional
    # fields merged into the multimodal_inputs dict (e.g. video split points).
    _MULTIMODAL_CASES = [
        {"name": "image", "prompt_len": 150, "text_len": 50, "modal_id": 1, "budget": 60, "extra": {}},
        {
            "name": "video",
            "prompt_len": 200,
            "text_len": 80,
            "modal_id": 2,
            "budget": 50,
            "extra": {"can_split_idx_list": [96, 112, 128, 144, 160, 176, 192]},
        },
        {"name": "audio", "prompt_len": 120, "text_len": 60, "modal_id": 3, "budget": 40, "extra": {}},
    ]

    def test_multimodal_input_single_modality(self):
        """Token allocation for image / video / audio multimodal requests."""
        self._enable_deterministic(16)
        rm = self._create_mm_resource_manager()

        for case in self._MULTIMODAL_CASES:
            with self.subTest(modality=case["name"]):
                prompt_ids = list(range(case["prompt_len"]))
                mm_inputs = _build_mm_inputs(case["prompt_len"], case["text_len"], case["modal_id"], case["extra"])
                req = _create_request(f"mm_{case['name']}", prompt_ids, 0, mm_inputs)
                result = rm._get_num_new_tokens(req, case["budget"])
                self.assertGreaterEqual(result, 0)
                self.assertLessEqual(result, case["budget"])

    # ==================== 6. Real batch scheduling ====================

    def test_real_batch_scheduling_concurrent_requests(self):
        """Multiple requests competing for budget, all must respect alignment."""
        split_kv_size = 16
        self._enable_deterministic(split_kv_size)
        budget = 50

        # (request_id, prompt_token_ids, num_computed_tokens)
        batch = [
            ("req1", list(range(27)), 0),
            ("req2", list(range(63)), 0),
            ("req3", list(range(128)), 0),
            ("req4", list(range(60)), 10),
            ("req5", list(range(47)), 7),
        ]
        for rid, prompt_ids, computed in batch:
            with self.subTest(request=rid):
                req = _create_request(rid, prompt_ids, computed)
                result = self.rm._get_num_new_tokens(req, budget)
                final_pos = computed + result
                max_possible = min(len(prompt_ids) - computed, budget)
                self.assertLessEqual(result, max_possible)
                # Ending exactly at the prompt length is also acceptable.
                if result > 0:
                    is_ok = (final_pos % split_kv_size == 0) or (final_pos == len(prompt_ids))
                    self.assertTrue(is_ok, f"{rid}: final_pos={final_pos} not aligned")

    def test_real_batch_scheduling_continuous_prefill(self):
        """Continuous prefill: all chunks fully consume a 47-token prompt."""
        split_kv_size = 16
        self._enable_deterministic(split_kv_size)

        prompt_ids = list(range(47))
        budget = 50
        num_computed = 0
        iterations = 0

        # The iteration cap guards against an infinite loop if the manager
        # ever returns 0 before the prompt is fully consumed.
        while num_computed < len(prompt_ids) and iterations < 10:
            req = _create_request("cont", prompt_ids, num_computed)
            result = self.rm._get_num_new_tokens(req, budget)
            self.assertGreater(result, 0, f"stuck at {num_computed}")
            final_pos = num_computed + result
            is_ok = (final_pos % split_kv_size == 0) or (final_pos == len(prompt_ids))
            self.assertTrue(is_ok, f"chunk ending at {final_pos} not aligned")
            num_computed += result
            iterations += 1

        self.assertEqual(num_computed, len(prompt_ids))

    def test_real_batch_scheduling_with_multimodal_requests(self):
        """Mixed batch: text-only + image requests."""
        self._enable_deterministic(16)
        rm = self._create_mm_resource_manager()
        budget = 30

        # Text-only request
        req_text = _create_request("text_only", list(range(100)), 0)
        r1 = rm._get_num_new_tokens(req_text, budget)
        self.assertGreaterEqual(r1, 0)
        self.assertLessEqual(r1, budget)

        # Image request
        mm_inputs = _build_mm_inputs(80, 40, modal_id=1)
        req_img = _create_request("with_image", list(range(80)), 0, mm_inputs)
        r2 = rm._get_num_new_tokens(req_img, budget)
        self.assertGreaterEqual(r2, 0)
        self.assertLessEqual(r2, budget)

    # ==================== 7. Corner cases ====================

    def test_corner_case_invalid_request_states(self):
        """Empty prompt, completed prefill, and num_computed > need_prefill must assert."""
        # NOTE(review): relies on _get_num_new_tokens using assert statements
        # for these invariants — would not raise under `python -O`.
        self._enable_deterministic(16)

        # Empty prompt
        with self.subTest(case="empty prompt"):
            with self.assertRaises(AssertionError):
                self.rm._get_num_new_tokens(_create_request("e", [], 0), 50)

        # Already completed
        with self.subTest(case="completed prefill"):
            with self.assertRaises(AssertionError):
                self.rm._get_num_new_tokens(_create_request("c", list(range(100)), 100), 50)

        # Inconsistent state
        with self.subTest(case="num_computed > need_prefill"):
            with self.assertRaises(AssertionError):
                self.rm._get_num_new_tokens(_create_request("i", list(range(50)), 100), 50)

        # Zero budget (legitimate, returns 0)
        with self.subTest(case="zero budget"):
            result = self.rm._get_num_new_tokens(_create_request("z", list(range(100)), 0), 0)
            self.assertEqual(result, 0)

    def test_corner_case_minimum_split_size(self):
        """split_kv_size=1: every position is aligned, so max allocation is allowed."""
        self._enable_deterministic(1)

        for prompt_ids, computed, budget, expected in [
            (list(range(100)), 0, 20, 20),
            (list(range(100)), 10, 15, 15),
            (list(range(100)), 50, 10, 10),
        ]:
            with self.subTest(computed=computed, budget=budget):
                req = _create_request("min", prompt_ids, computed)
                result = self.rm._get_num_new_tokens(req, budget)
                self.assertEqual(result, expected)

    def test_corner_case_large_split_size(self):
        """split_kv_size >> budget or sequence length."""
        test_cases = [
            # (split_kv_size, prompt_ids, num_computed, budget, description)
            (128, list(range(100)), 0, 10, "split >> budget: budget=10"),
            (128, list(range(100)), 0, 1, "split >> budget: budget=1"),
            (128, list(range(100)), 64, 20, "split >> budget: near boundary"),
            (256, list(range(50)), 0, 100, "split >> seq_len"),
        ]
        for split_kv_size, prompt_ids, computed, budget, desc in test_cases:
            with self.subTest(desc=desc):
                # Re-enable per case: each row uses a different split size.
                self._enable_deterministic(split_kv_size)
                req = _create_request("lg", prompt_ids, computed)
                result = self.rm._get_num_new_tokens(req, budget)
                max_possible = min(len(prompt_ids) - computed, budget)
                self.assertGreaterEqual(result, 0)
                self.assertLessEqual(result, max_possible)

    def test_corner_case_dynamic_config_switch(self):
        """Switching from non-deterministic to deterministic mid-stream."""
        # Phase 1: non-deterministic
        self._disable_deterministic()
        req1 = _create_request("sw1", list(range(100)), 0)
        result1 = self.rm._get_num_new_tokens(req1, 30)

        # Phase 2: enable deterministic, continue from result1
        split_kv_size = 16
        self._enable_deterministic(split_kv_size)
        req2 = _create_request("sw2", list(range(100)), result1)
        result2 = self.rm._get_num_new_tokens(req2, 30)

        # Alignment is only checked when the second phase made progress;
        # result2 == 0 is a legal outcome near a boundary.
        if result2 > 0:
            final_pos = result1 + result2
            is_aligned = (final_pos % split_kv_size == 0) or (final_pos == 100)
            self.assertTrue(is_aligned, f"final_pos={final_pos} not aligned after switch")

    def test_deterministic_return_zero_budget_below_boundary(self):
        """Returns 0 when budget cannot reach the next alignment boundary."""
        split_kv_size = 16
        self._enable_deterministic(split_kv_size)

        test_cases = [
            # (prompt_ids, num_computed, budget)
            # pos=10, next_boundary=16, need 6, budget=5
            (list(range(100)), 10, 5),
            # pos=1, next_boundary=16, need 15, budget=3
            (list(range(100)), 1, 3),
            # pos=17, next_boundary=32, need 15, budget=14
            (list(range(100)), 17, 14),
            # budget=0 (deterministic)
            (list(range(100)), 0, 0),
        ]
        for prompt_ids, computed, budget in test_cases:
            with self.subTest(computed=computed, budget=budget):
                req = _create_request("det0", prompt_ids, computed)
                result = self.rm._get_num_new_tokens(req, budget)
                self.assertEqual(result, 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Verbose mode prints each test name and result when run as a script.
    unittest.main(verbosity=2)
|
||||
Reference in New Issue
Block a user