[Feature] Add Deterministic Inference Support (#6476)

* add

* [tests] Add Paddle attention determinism tests and refactor resource manager

Add comprehensive determinism tests for Paddle attention layer and refactor
resource manager for deterministic mode support.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* add

* add

* add

* add

* add more

* add more

* fix some

* fix some

* fix bugs

* fix bugs

* only in gpu

* add docs

* fix comments

* fix some

* fix some

* fix comments

* add more

* fix potential problem

* remove unneeded code

* remove unneeded code

* remove unneeded code

* fix bug

* fix bugs

* fix comments

* fix comments

* Update tests/ce/deterministic/test_determinism_verification.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/inter_communicator/test_ipc_signal.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/layers/test_paddle_attention_determinism.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/engine/test_sampling_params_determinism.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/layers/test_paddle_attention_determinism.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/layers/test_paddle_attention_determinism_standalone.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix comments

* fix import error

* fix a bug

* fix bugs

* fix bugs

* fix coverage

* refine codes

* refine code

* fix comments

* fix comments

* fix comments

* remove unneeded code

* fix allreduce large tensor bug

* mv log files

* mv log files

* add files

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
gongweibao
2026-02-27 11:31:51 +08:00
committed by GitHub
parent c34cb2a8c2
commit edd31e8849
24 changed files with 3364 additions and 27 deletions
+57 -2
View File
@@ -453,7 +453,42 @@ class ResourceManagerV1(ResourceManager):
def _get_num_new_tokens(self, request, token_budget):
# TODO: set condition to new _get_num_new_tokens
num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens
assert num_new_tokens > 0, (
f"Request {request.request_id} has no remaining tokens: "
f"need_prefill={request.need_prefill_tokens}, computed={request.num_computed_tokens}"
)
num_new_tokens = min(num_new_tokens, token_budget)
# Deterministic mode: align chunk boundaries to split_kv_size
# This ensures batch-invariant attention by making each chunk
# a multiple of the split-KV block size (default 16)
if envs.FD_DETERMINISTIC_MODE:
split_kv_size = envs.FD_DETERMINISTIC_SPLIT_KV_SIZE
current_pos = request.num_computed_tokens
remaining_tokens = request.need_prefill_tokens - current_pos
# Case 1: Final chunk - no alignment needed
if remaining_tokens < split_kv_size:
aligned_end = current_pos + remaining_tokens
else:
# Case 2: Need to align to split_kv_size boundary
# Calculate next boundary position
next_boundary = ((current_pos + split_kv_size - 1) // split_kv_size) * split_kv_size
tokens_to_boundary = next_boundary - current_pos
# Not enough budget to reach the next boundary: defer to next iteration
if token_budget < tokens_to_boundary:
return 0
# Align to as many full boundaries as budget allows
aligned_end = ((current_pos + token_budget) // split_kv_size) * split_kv_size
num_new_tokens = aligned_end - current_pos
# Don't exceed the original budget or remaining tokens
num_new_tokens = min(
num_new_tokens, token_budget, request.need_prefill_tokens - request.num_computed_tokens
)
if (
current_platform.is_intel_hpu()
and request.need_prefill_tokens - request.num_computed_tokens > token_budget
@@ -466,7 +501,11 @@ class ResourceManagerV1(ResourceManager):
return num_new_tokens
inputs = request.multimodal_inputs
if inputs.get("patch_idx", None) is not None and inputs.get("patch_map", None) is not None:
if (
inputs is not None
and inputs.get("patch_idx", None) is not None
and inputs.get("patch_map", None) is not None
):
pre_end_idx = request.num_computed_tokens
new_end_idx = pre_end_idx + num_new_tokens
@@ -541,7 +580,8 @@ class ResourceManagerV1(ResourceManager):
request.video_end = end_patch_map["video_num"]
request.audio_end = _compute_audio_prefix_count(new_end_idx, end_patch_idx)
elif (
inputs.get("images", None) is not None
inputs is not None
and inputs.get("images", None) is not None
and inputs.get("image_patch_id", None) is not None
and inputs.get("grid_thw", None) is not None
):
@@ -790,6 +830,9 @@ class ResourceManagerV1(ResourceManager):
req_index += 1
continue
num_new_tokens = self._get_num_new_tokens(request, token_budget)
if num_new_tokens == 0:
req_index += 1
continue
num_new_block = self.get_new_block_nums(request, num_new_tokens)
# Allocate blocks to prefill
if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
@@ -863,6 +906,12 @@ class ResourceManagerV1(ResourceManager):
continue
# Allocate blocks for the tokens that does not hit cache
num_new_tokens = self._get_num_new_tokens(request, token_budget)
if num_new_tokens == 0:
if self.config.cache_config.enable_prefix_caching:
self._free_blocks(request)
skip_requests.append(request)
self.waiting.popleft()
continue
num_new_block = self.get_new_block_nums(request, num_new_tokens)
can_schedule_block_num_threshold = self._get_can_schedule_prefill_threshold_block(
request, num_new_block
@@ -916,6 +965,12 @@ class ResourceManagerV1(ResourceManager):
# Allocate blocks for the tokens that does not hit cache
num_new_tokens = self._get_num_new_tokens(request, token_budget)
if num_new_tokens == 0:
if self.config.cache_config.enable_prefix_caching:
self._free_blocks(request)
skip_requests.append(request)
self.waiting.popleft()
continue
num_new_block = self.get_new_block_nums(request, num_new_tokens)
can_schedule_block_num_threshold = self._get_can_schedule_prefill_threshold_block(
request, num_new_block