[Feature] Add Deterministic Inference Support (#6476)

* add

* [tests] Add Paddle attention determinism tests and refactor resource manager

Add comprehensive determinism tests for Paddle attention layer and refactor
resource manager for deterministic mode support.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* add

* add

* add

* add

* add more

* add more

* fix some

* fix some

* fix bugs

* fix bugs

* only in gpu

* add docs

* fix comments

* fix some

* fix some

* fix comments

* add more

* fix potential problem

* remove unneeded code

* remove unneeded code

* remove unneeded code

* fix bug

* fix bugs

* fix comments

* fix comments

* Update tests/ce/deterministic/test_determinism_verification.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/inter_communicator/test_ipc_signal.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/layers/test_paddle_attention_determinism.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/engine/test_sampling_params_determinism.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/layers/test_paddle_attention_determinism.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/layers/test_paddle_attention_determinism_standalone.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix comments

* fix import error

* fix a bug

* fix bugs

* fix bugs

* fix coverage

* refine codes

* refine code

* fix comments

* fix comments

* fix comments

* remove unneeded code

* fix allreduce large tensor bug

* mv log files

* mv log files

* add files

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
gongweibao
2026-02-27 11:31:51 +08:00
committed by GitHub
parent c34cb2a8c2
commit edd31e8849
24 changed files with 3364 additions and 27 deletions
+57 -2
View File
@@ -453,7 +453,42 @@ class ResourceManagerV1(ResourceManager):
def _get_num_new_tokens(self, request, token_budget):
# TODO: set condition to new _get_num_new_tokens
num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens
assert num_new_tokens > 0, (
f"Request {request.request_id} has no remaining tokens: "
f"need_prefill={request.need_prefill_tokens}, computed={request.num_computed_tokens}"
)
num_new_tokens = min(num_new_tokens, token_budget)
# Deterministic mode: align chunk boundaries to split_kv_size
# This ensures batch-invariant attention by making each chunk
# a multiple of the split-KV block size (default 16)
if envs.FD_DETERMINISTIC_MODE:
split_kv_size = envs.FD_DETERMINISTIC_SPLIT_KV_SIZE
current_pos = request.num_computed_tokens
remaining_tokens = request.need_prefill_tokens - current_pos
# Case 1: Final chunk - no alignment needed
if remaining_tokens < split_kv_size:
aligned_end = current_pos + remaining_tokens
else:
# Case 2: Need to align to split_kv_size boundary
# Calculate next boundary position
next_boundary = ((current_pos + split_kv_size - 1) // split_kv_size) * split_kv_size
tokens_to_boundary = next_boundary - current_pos
# Not enough budget to reach the next boundary: defer to next iteration
if token_budget < tokens_to_boundary:
return 0
# Align to as many full boundaries as budget allows
aligned_end = ((current_pos + token_budget) // split_kv_size) * split_kv_size
num_new_tokens = aligned_end - current_pos
# Don't exceed the original budget or remaining tokens
num_new_tokens = min(
num_new_tokens, token_budget, request.need_prefill_tokens - request.num_computed_tokens
)
if (
current_platform.is_intel_hpu()
and request.need_prefill_tokens - request.num_computed_tokens > token_budget
@@ -466,7 +501,11 @@ class ResourceManagerV1(ResourceManager):
return num_new_tokens
inputs = request.multimodal_inputs
if inputs.get("patch_idx", None) is not None and inputs.get("patch_map", None) is not None:
if (
inputs is not None
and inputs.get("patch_idx", None) is not None
and inputs.get("patch_map", None) is not None
):
pre_end_idx = request.num_computed_tokens
new_end_idx = pre_end_idx + num_new_tokens
@@ -541,7 +580,8 @@ class ResourceManagerV1(ResourceManager):
request.video_end = end_patch_map["video_num"]
request.audio_end = _compute_audio_prefix_count(new_end_idx, end_patch_idx)
elif (
inputs.get("images", None) is not None
inputs is not None
and inputs.get("images", None) is not None
and inputs.get("image_patch_id", None) is not None
and inputs.get("grid_thw", None) is not None
):
@@ -790,6 +830,9 @@ class ResourceManagerV1(ResourceManager):
req_index += 1
continue
num_new_tokens = self._get_num_new_tokens(request, token_budget)
if num_new_tokens == 0:
req_index += 1
continue
num_new_block = self.get_new_block_nums(request, num_new_tokens)
# Allocate blocks to prefill
if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
@@ -863,6 +906,12 @@ class ResourceManagerV1(ResourceManager):
continue
# Allocate blocks for the tokens that does not hit cache
num_new_tokens = self._get_num_new_tokens(request, token_budget)
if num_new_tokens == 0:
if self.config.cache_config.enable_prefix_caching:
self._free_blocks(request)
skip_requests.append(request)
self.waiting.popleft()
continue
num_new_block = self.get_new_block_nums(request, num_new_tokens)
can_schedule_block_num_threshold = self._get_can_schedule_prefill_threshold_block(
request, num_new_block
@@ -916,6 +965,12 @@ class ResourceManagerV1(ResourceManager):
# Allocate blocks for the tokens that does not hit cache
num_new_tokens = self._get_num_new_tokens(request, token_budget)
if num_new_tokens == 0:
if self.config.cache_config.enable_prefix_caching:
self._free_blocks(request)
skip_requests.append(request)
self.waiting.popleft()
continue
num_new_block = self.get_new_block_nums(request, num_new_tokens)
can_schedule_block_num_threshold = self._get_can_schedule_prefill_threshold_block(
request, num_new_block