mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[Feature] Add Deterministic Inference Support (#6476)
* add * [tests] Add Paddle attention determinism tests and refactor resource manager Add comprehensive determinism tests for Paddle attention layer and refactor resource manager for deterministic mode support. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * add * add * add * add * add more * add more * fixsome * fixsome * fix bugs * fix bugs * only in gpu * add docs * fix comments * fix some * fix some * fix comments * add more * fix potential problem * remove not need * remove not need * remove no need * fix bug * fix bugs * fix comments * fix comments * Update tests/ce/deterministic/test_determinism_verification.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/inter_communicator/test_ipc_signal.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/layers/test_paddle_attention_determinism.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/engine/test_sampling_params_determinism.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/layers/test_paddle_attention_determinism.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/layers/test_paddle_attention_determinism_standalone.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * fix comments * fix import error * fix a bug * fix bugs * fix bugs * fix coverage * refine codes * refine code * fix comments * fix comments * fix comments * rm not need * fix allreduce large tensor bug * mv log files * mv log files * add files --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -58,7 +58,7 @@ void decode_alltoall_transpose(paddle::Tensor& inp,
|
||||
auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
|
||||
auto stream = inp.stream();
|
||||
|
||||
auto input_size = inp.numel() * 2;
|
||||
auto input_size = inp.numel() * phi::SizeOf(inp.dtype());
|
||||
auto token_num = inp.shape()[0];
|
||||
auto hidden_size = inp.shape()[1];
|
||||
auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
|
||||
@@ -121,7 +121,7 @@ void all_reduce(paddle::Tensor& inp,
|
||||
auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
|
||||
auto stream = inp.stream();
|
||||
|
||||
auto input_size = inp.numel() * 2;
|
||||
auto input_size = inp.numel() * phi::SizeOf(inp.dtype());
|
||||
auto reg_buffer = reinterpret_cast<void*>(_reg_buffer);
|
||||
if (reg_buffer) {
|
||||
cudaMemcpyAsync(
|
||||
|
||||
@@ -20,8 +20,24 @@ import paddle
|
||||
import paddle.distributed as dist
|
||||
from paddle.distributed import fleet
|
||||
|
||||
import fastdeploy.envs as envs
|
||||
from fastdeploy.utils import register_custom_python_op
|
||||
|
||||
# Constants
|
||||
SUPPORTED_DTYPES = (paddle.float32, paddle.float16, paddle.bfloat16)
|
||||
|
||||
|
||||
def tensor_byte_size(tensor: paddle.Tensor) -> int:
|
||||
"""Compute tensor size in bytes from .shape to avoid numel() which
|
||||
triggers cudaErrorStreamCaptureImplicit during CUDA Graph capture."""
|
||||
size = 1
|
||||
for s in tensor.shape:
|
||||
size *= s
|
||||
size *= tensor.element_size()
|
||||
return size
|
||||
|
||||
|
||||
# Global custom all-reduce instance
|
||||
_TP_AR = None
|
||||
|
||||
|
||||
@@ -36,8 +52,11 @@ def capture_custom_allreduce():
|
||||
|
||||
|
||||
def use_custom_allreduce(
|
||||
tp_group: paddle.distributed.communication.group.Group = None, custom_all_reduce_max_bytes: int = 8192 * 1024
|
||||
):
|
||||
tp_group: paddle.distributed.communication.group.Group = None,
|
||||
custom_all_reduce_max_bytes: int = None,
|
||||
) -> None:
|
||||
if custom_all_reduce_max_bytes is None:
|
||||
custom_all_reduce_max_bytes = envs.FD_CUSTOM_AR_MAX_SIZE_MB * 1024 * 1024
|
||||
if tp_group is None:
|
||||
hcg = fleet.get_hybrid_communicate_group()
|
||||
tp_group = hcg.get_model_parallel_group()
|
||||
@@ -53,17 +72,71 @@ def custom_ar_clear_ipc_handles():
|
||||
_TP_AR.clear_ipc_handles()
|
||||
|
||||
|
||||
def _ensure_deterministic_ready(input_: paddle.Tensor) -> None:
|
||||
"""Validate all preconditions for deterministic all-reduce."""
|
||||
global _TP_AR
|
||||
# Lazy initialization of custom all-reduce
|
||||
if _TP_AR is None:
|
||||
try:
|
||||
hcg = fleet.get_hybrid_communicate_group()
|
||||
tp_group = hcg.get_model_parallel_group()
|
||||
if tp_group is not None and tp_group.nranks > 1:
|
||||
use_custom_allreduce(tp_group)
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
"DETERMINISTIC_MODE is enabled but cannot auto-initialize custom all-reduce. "
|
||||
"TP all-reduce would use NCCL which may produce non-deterministic results "
|
||||
"due to floating-point accumulation order. "
|
||||
"Ensure fleet is initialized before any TP operations, "
|
||||
"or explicitly call use_custom_allreduce() beforehand."
|
||||
) from e
|
||||
|
||||
if _TP_AR is None:
|
||||
raise RuntimeError(
|
||||
"DETERMINISTIC_MODE is enabled but custom all-reduce is not available. "
|
||||
"Falling back to NCCL would produce non-deterministic results. "
|
||||
"Ensure custom all-reduce is properly initialized via use_custom_allreduce()."
|
||||
)
|
||||
|
||||
if input_.dtype not in SUPPORTED_DTYPES:
|
||||
raise AssertionError(
|
||||
f"DETERMINISTIC_MODE is enabled but input tensor dtype={input_.dtype} is not supported. "
|
||||
f"Custom all-reduce only supports: {', '.join(str(d) for d in SUPPORTED_DTYPES)}. "
|
||||
f"Input tensor shape: {input_.shape}, dtype: {input_.dtype}."
|
||||
)
|
||||
|
||||
# Compute size from .shape to avoid numel() which triggers
|
||||
# cudaErrorStreamCaptureImplicit during CUDA Graph capture
|
||||
inp_size = tensor_byte_size(input_)
|
||||
|
||||
if inp_size % 16 != 0:
|
||||
raise RuntimeError(
|
||||
f"DETERMINISTIC_MODE is enabled but input tensor size ({inp_size} bytes) "
|
||||
f"is not a multiple of 16. Custom all-reduce requires 16-byte aligned tensors. "
|
||||
f"Input tensor shape: {input_.shape}, element_size: {input_.element_size()} bytes, "
|
||||
f"total size: {inp_size} bytes."
|
||||
)
|
||||
|
||||
if inp_size > _TP_AR.max_size:
|
||||
raise RuntimeError(
|
||||
f"DETERMINISTIC_MODE: input tensor ({inp_size} bytes) exceeds "
|
||||
f"custom all-reduce max_size ({_TP_AR.max_size} bytes). "
|
||||
f"Increase buffer size via: export FD_CUSTOM_AR_MAX_SIZE_MB="
|
||||
f"{(inp_size // (1024 * 1024)) + 1}"
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
|
||||
def tensor_model_parallel_all_reduce_infer_meta(x: "paddle.static.MetaTensor", group_) -> paddle.static.MetaTensor:
|
||||
def tensor_model_parallel_all_reduce_infer_meta(
|
||||
x: "paddle.static.MetaTensor", group_: paddle.distributed.communication.group.Group
|
||||
) -> paddle.static.MetaTensor:
|
||||
return paddle.static.MetaTensor(shape=x.shape, dtype=x.dtype)
|
||||
|
||||
@register_custom_python_op(
|
||||
name="tensor_model_parallel_all_reduce",
|
||||
infer_meta=tensor_model_parallel_all_reduce_infer_meta,
|
||||
input_names=[
|
||||
"input_",
|
||||
],
|
||||
input_names=["input_"],
|
||||
output_names=["out"],
|
||||
inplace_map={},
|
||||
)
|
||||
@@ -72,13 +145,20 @@ try:
|
||||
group_: paddle.distributed.communication.group.Group = None,
|
||||
) -> paddle.Tensor:
|
||||
"""All-reduce the input tensor across model parallel group."""
|
||||
global _TP_AR
|
||||
if input_.shape[0] == 0:
|
||||
return input_
|
||||
global _TP_AR
|
||||
|
||||
if envs.FD_DETERMINISTIC_MODE:
|
||||
_ensure_deterministic_ready(input_)
|
||||
return _TP_AR.custom_all_reduce(input_)
|
||||
|
||||
# for performance, use custom all-reduce if possible
|
||||
if _TP_AR is not None and _TP_AR.should_custom_ar(input_):
|
||||
# TODO: supports different_group custom allreduce
|
||||
input_ = _TP_AR.custom_all_reduce(input_)
|
||||
elif paddle.in_dynamic_mode():
|
||||
return _TP_AR.custom_all_reduce(input_)
|
||||
|
||||
if paddle.in_dynamic_mode():
|
||||
if group_ is not None:
|
||||
dist.all_reduce(input_, group=group_)
|
||||
else:
|
||||
|
||||
@@ -22,6 +22,7 @@ import paddle
|
||||
import paddle.distributed as dist
|
||||
from paddle.distributed.communication.group import Group
|
||||
|
||||
from fastdeploy.distributed.communication import tensor_byte_size
|
||||
from fastdeploy.distributed.custom_all_reduce import cuda_wrapper
|
||||
from fastdeploy.model_executor.ops.gpu import (
|
||||
all_reduce,
|
||||
@@ -133,16 +134,22 @@ class CustomAllreduce:
|
||||
lib.cudaFree(ctypes.c_void_p(pointers[rank]))
|
||||
|
||||
def should_custom_ar(self, inp: paddle.Tensor):
|
||||
if self.capturing:
|
||||
return True
|
||||
inp_size = inp.shape[0] * inp.shape[1] * inp.element_size()
|
||||
inp_size = tensor_byte_size(inp)
|
||||
if inp_size > self.max_size:
|
||||
return False
|
||||
|
||||
# custom allreduce requires input byte size to be multiples of 16
|
||||
if inp_size % 16 != 0:
|
||||
return False
|
||||
|
||||
# for 4 or more non NVLink-capable GPUs, custom allreduce provides
|
||||
# little performance improvement over NCCL.
|
||||
if self.world_size == 2 or self.full_nvlink:
|
||||
return inp_size < self.max_size
|
||||
return True
|
||||
|
||||
if self.capturing:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def all_reduce(
|
||||
|
||||
@@ -276,7 +276,11 @@ class SamplingParams:
|
||||
|
||||
def __post_init__(self):
|
||||
if self.seed is None:
|
||||
self.seed = random.randint(0, 922337203685477580)
|
||||
# Deterministic mode: use fixed seed
|
||||
if envs.FD_DETERMINISTIC_MODE:
|
||||
self.seed = 42
|
||||
else:
|
||||
self.seed = random.randint(0, 922337203685477580)
|
||||
self._verify_args()
|
||||
|
||||
def _verify_args(self) -> None:
|
||||
|
||||
@@ -453,7 +453,42 @@ class ResourceManagerV1(ResourceManager):
|
||||
def _get_num_new_tokens(self, request, token_budget):
|
||||
# TODO: set condition to new _get_num_new_tokens
|
||||
num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens
|
||||
assert num_new_tokens > 0, (
|
||||
f"Request {request.request_id} has no remaining tokens: "
|
||||
f"need_prefill={request.need_prefill_tokens}, computed={request.num_computed_tokens}"
|
||||
)
|
||||
num_new_tokens = min(num_new_tokens, token_budget)
|
||||
|
||||
# Deterministic mode: align chunk boundaries to split_kv_size
|
||||
# This ensures batch-invariant attention by making each chunk
|
||||
# a multiple of the split-KV block size (default 16)
|
||||
if envs.FD_DETERMINISTIC_MODE:
|
||||
split_kv_size = envs.FD_DETERMINISTIC_SPLIT_KV_SIZE
|
||||
current_pos = request.num_computed_tokens
|
||||
remaining_tokens = request.need_prefill_tokens - current_pos
|
||||
|
||||
# Case 1: Final chunk - no alignment needed
|
||||
if remaining_tokens < split_kv_size:
|
||||
aligned_end = current_pos + remaining_tokens
|
||||
else:
|
||||
# Case 2: Need to align to split_kv_size boundary
|
||||
# Calculate next boundary position
|
||||
next_boundary = ((current_pos + split_kv_size - 1) // split_kv_size) * split_kv_size
|
||||
tokens_to_boundary = next_boundary - current_pos
|
||||
|
||||
# Not enough budget to reach the next boundary: defer to next iteration
|
||||
if token_budget < tokens_to_boundary:
|
||||
return 0
|
||||
|
||||
# Align to as many full boundaries as budget allows
|
||||
aligned_end = ((current_pos + token_budget) // split_kv_size) * split_kv_size
|
||||
|
||||
num_new_tokens = aligned_end - current_pos
|
||||
# Don't exceed the original budget or remaining tokens
|
||||
num_new_tokens = min(
|
||||
num_new_tokens, token_budget, request.need_prefill_tokens - request.num_computed_tokens
|
||||
)
|
||||
|
||||
if (
|
||||
current_platform.is_intel_hpu()
|
||||
and request.need_prefill_tokens - request.num_computed_tokens > token_budget
|
||||
@@ -466,7 +501,11 @@ class ResourceManagerV1(ResourceManager):
|
||||
return num_new_tokens
|
||||
|
||||
inputs = request.multimodal_inputs
|
||||
if inputs.get("patch_idx", None) is not None and inputs.get("patch_map", None) is not None:
|
||||
if (
|
||||
inputs is not None
|
||||
and inputs.get("patch_idx", None) is not None
|
||||
and inputs.get("patch_map", None) is not None
|
||||
):
|
||||
pre_end_idx = request.num_computed_tokens
|
||||
new_end_idx = pre_end_idx + num_new_tokens
|
||||
|
||||
@@ -541,7 +580,8 @@ class ResourceManagerV1(ResourceManager):
|
||||
request.video_end = end_patch_map["video_num"]
|
||||
request.audio_end = _compute_audio_prefix_count(new_end_idx, end_patch_idx)
|
||||
elif (
|
||||
inputs.get("images", None) is not None
|
||||
inputs is not None
|
||||
and inputs.get("images", None) is not None
|
||||
and inputs.get("image_patch_id", None) is not None
|
||||
and inputs.get("grid_thw", None) is not None
|
||||
):
|
||||
@@ -790,6 +830,9 @@ class ResourceManagerV1(ResourceManager):
|
||||
req_index += 1
|
||||
continue
|
||||
num_new_tokens = self._get_num_new_tokens(request, token_budget)
|
||||
if num_new_tokens == 0:
|
||||
req_index += 1
|
||||
continue
|
||||
num_new_block = self.get_new_block_nums(request, num_new_tokens)
|
||||
# Allocate blocks to prefill
|
||||
if self.cache_manager.can_allocate_gpu_blocks(num_new_block):
|
||||
@@ -863,6 +906,12 @@ class ResourceManagerV1(ResourceManager):
|
||||
continue
|
||||
# Allocate blocks for the tokens that does not hit cache
|
||||
num_new_tokens = self._get_num_new_tokens(request, token_budget)
|
||||
if num_new_tokens == 0:
|
||||
if self.config.cache_config.enable_prefix_caching:
|
||||
self._free_blocks(request)
|
||||
skip_requests.append(request)
|
||||
self.waiting.popleft()
|
||||
continue
|
||||
num_new_block = self.get_new_block_nums(request, num_new_tokens)
|
||||
can_schedule_block_num_threshold = self._get_can_schedule_prefill_threshold_block(
|
||||
request, num_new_block
|
||||
@@ -916,6 +965,12 @@ class ResourceManagerV1(ResourceManager):
|
||||
|
||||
# Allocate blocks for the tokens that does not hit cache
|
||||
num_new_tokens = self._get_num_new_tokens(request, token_budget)
|
||||
if num_new_tokens == 0:
|
||||
if self.config.cache_config.enable_prefix_caching:
|
||||
self._free_blocks(request)
|
||||
skip_requests.append(request)
|
||||
self.waiting.popleft()
|
||||
continue
|
||||
num_new_block = self.get_new_block_nums(request, num_new_tokens)
|
||||
can_schedule_block_num_threshold = self._get_can_schedule_prefill_threshold_block(
|
||||
request, num_new_block
|
||||
|
||||
@@ -18,6 +18,14 @@ Environment variables used by FastDeploy.
|
||||
import os
|
||||
from typing import Any, Callable
|
||||
|
||||
|
||||
def _validate_split_kv_size(value: int) -> int:
|
||||
"""Validate FD_DETERMINISTIC_SPLIT_KV_SIZE is a positive power of 2."""
|
||||
if value <= 0 or (value & (value - 1)) != 0:
|
||||
raise ValueError(f"FD_DETERMINISTIC_SPLIT_KV_SIZE must be a positive power of 2, got {value}.")
|
||||
return value
|
||||
|
||||
|
||||
environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# Whether to use BF16 on CPU.
|
||||
"FD_CPU_USE_BF16": lambda: os.getenv("FD_CPU_USE_BF16", "False"),
|
||||
@@ -206,6 +214,18 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")),
|
||||
# File path for file storage backend
|
||||
"FILE_BACKEND_STORAGE_DIR": lambda: str(os.getenv("FILE_BACKEND_STORAGE_DIR", "/tmp/fastdeploy")),
|
||||
# Custom all-reduce max buffer size in MB (default 8MB).
|
||||
# Increase this to avoid NCCL fallback for large tensors in deterministic mode.
|
||||
# E.g. FD_CUSTOM_AR_MAX_SIZE_MB=128 for 128MB.
|
||||
"FD_CUSTOM_AR_MAX_SIZE_MB": lambda: int(os.getenv("FD_CUSTOM_AR_MAX_SIZE_MB", "8")),
|
||||
# Enable deterministic inference mode for chunked prefill alignment
|
||||
"FD_DETERMINISTIC_MODE": lambda: bool(int(os.getenv("FD_DETERMINISTIC_MODE", "0"))),
|
||||
# Split KV block size for deterministic alignment (must be power of 2 and > 0, default 16)
|
||||
"FD_DETERMINISTIC_SPLIT_KV_SIZE": lambda: _validate_split_kv_size(
|
||||
int(os.getenv("FD_DETERMINISTIC_SPLIT_KV_SIZE", "16"))
|
||||
),
|
||||
# Enable determinism logging (print MD5 hashes and debug info)
|
||||
"FD_DETERMINISTIC_LOG_MODE": lambda: bool(int(os.getenv("FD_DETERMINISTIC_LOG_MODE", "0"))),
|
||||
# Whether to use PD REORDER, can set 0 or 1
|
||||
"FD_PD_REORDER": lambda: int(os.getenv("FD_PD_REORDER", "0")),
|
||||
}
|
||||
|
||||
@@ -0,0 +1,220 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
det_logger = logging.getLogger("fastdeploy.deterministic")
|
||||
|
||||
|
||||
class DeterministicLogger:
|
||||
"""Helper for logging tensor MD5 hashes and input details to assist determinism debugging."""
|
||||
|
||||
def __init__(self, share_inputs):
|
||||
self.share_inputs = share_inputs
|
||||
self._current_run_id = None
|
||||
self._batch_counter = 0
|
||||
|
||||
def log_batch_start(self, model_forward_batch):
|
||||
"""Log batch start with run_id tracking and batch counting."""
|
||||
current_run_id = None
|
||||
for req in model_forward_batch or []:
|
||||
if req is not None:
|
||||
parts = req.request_id.split("_")
|
||||
if len(parts) > 1:
|
||||
current_run_id = parts[-1]
|
||||
break
|
||||
if current_run_id is not None and current_run_id != self._current_run_id:
|
||||
self._current_run_id = current_run_id
|
||||
self._batch_counter = 0
|
||||
|
||||
self._batch_counter += 1
|
||||
|
||||
det_logger.info(f"\n{'='*80}")
|
||||
det_logger.info(f"[BATCH-START] Run_{self._current_run_id} Batch_{self._batch_counter}")
|
||||
det_logger.info(f"{'='*80}\n")
|
||||
|
||||
@staticmethod
|
||||
def _compute_tensor_md5(tensor, name="tensor", prefix=""):
|
||||
"""Compute MD5 hash of tensor for comparison"""
|
||||
if tensor is None:
|
||||
return f"{name}_md5=None"
|
||||
|
||||
# Copy tensor to CPU and convert to numpy array
|
||||
try:
|
||||
tensor_cpu = tensor.cpu().numpy().tobytes()
|
||||
except Exception:
|
||||
# For data types that don't support direct tobytes (e.g., float16), convert first
|
||||
tensor_cpu = tensor.cpu().numpy().astype(np.float32).tobytes()
|
||||
|
||||
md5_hash = hashlib.md5(tensor_cpu).hexdigest()
|
||||
return f"{prefix}{name}_md5={md5_hash[:16]}" # Print only first 16 chars to reduce log length
|
||||
|
||||
def log_tensor_md5s(self, tensor_dict, forward_batch_reqs_list=None, stage="forward"):
|
||||
"""Log MD5 hash values for multiple tensors, including per-request MD5
|
||||
|
||||
Args:
|
||||
tensor_dict: {name: tensor} dictionary
|
||||
forward_batch_reqs_list: forward_batch_reqs_list list (may contain None)
|
||||
stage: Stage identifier (e.g., "prefill", "decode", "forward")
|
||||
"""
|
||||
# Get batch size from first valid tensor
|
||||
batch_size = self._get_batch_size(tensor_dict)
|
||||
if batch_size is None:
|
||||
return
|
||||
|
||||
# Get prefill/decode counts
|
||||
prefill_count, decode_count, seq_lens_encoder = self._get_stage_counts(batch_size)
|
||||
|
||||
# Build stage information
|
||||
stage_info = stage
|
||||
if prefill_count > 0 or decode_count > 0:
|
||||
stage_info += f" (prefill={prefill_count}, decode={decode_count})"
|
||||
|
||||
# Compute and log batch MD5
|
||||
batch_md5_info = [
|
||||
self._compute_tensor_md5(tensor, name, prefix="batch_")
|
||||
for name, tensor in tensor_dict.items()
|
||||
if tensor is not None
|
||||
]
|
||||
|
||||
# Log overall batch MD5
|
||||
req_id_str = self._build_req_id_str(forward_batch_reqs_list)
|
||||
det_logger.info(
|
||||
f"[DETERMINISM-MD5] stage={stage_info} | batch_size={batch_size} | "
|
||||
+ (f"requests: {req_id_str} | " if req_id_str else "")
|
||||
+ " | ".join(batch_md5_info)
|
||||
)
|
||||
|
||||
# Log per-request MD5 for decode requests
|
||||
self._log_per_request_md5s(
|
||||
tensor_dict, forward_batch_reqs_list, batch_size, prefill_count, decode_count, seq_lens_encoder
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_batch_size(tensor_dict):
|
||||
"""Get batch size from first tensor with a shape."""
|
||||
for name, tensor in tensor_dict.items():
|
||||
if tensor is not None and hasattr(tensor, "shape"):
|
||||
return tensor.shape[0]
|
||||
return None
|
||||
|
||||
def _get_stage_counts(self, batch_size):
|
||||
"""Get prefill/decode counts and seq_lens_encoder."""
|
||||
prefill_count = 0
|
||||
decode_count = 0
|
||||
seq_lens_encoder = None
|
||||
|
||||
if self.share_inputs is not None and "seq_lens_encoder" in self.share_inputs:
|
||||
seq_lens_encoder = self.share_inputs["seq_lens_encoder"].cpu().numpy()
|
||||
prefill_count = int((seq_lens_encoder > 0).sum())
|
||||
decode_count = int(batch_size - prefill_count)
|
||||
|
||||
return prefill_count, decode_count, seq_lens_encoder
|
||||
|
||||
@staticmethod
|
||||
def _build_req_id_str(forward_batch_reqs_list):
|
||||
"""Build request ID string from forward_batch_reqs_list."""
|
||||
if forward_batch_reqs_list is None:
|
||||
return ""
|
||||
req_info = [f"[{i}]{req.request_id}" for i, req in enumerate(forward_batch_reqs_list) if req is not None]
|
||||
return ", ".join(req_info)
|
||||
|
||||
def _log_per_request_md5s(
|
||||
self, tensor_dict, forward_batch_reqs_list, batch_size, prefill_count, decode_count, seq_lens_encoder
|
||||
):
|
||||
"""Log per-request MD5 for decode requests.
|
||||
|
||||
In decode phase, tensor shape is [batch_size, hidden_dim] or [batch_size, vocab_size].
|
||||
Can split by batch dimension directly.
|
||||
"""
|
||||
if decode_count == 0 or forward_batch_reqs_list is None:
|
||||
return
|
||||
|
||||
for i, req in enumerate(forward_batch_reqs_list):
|
||||
if req is None or i >= batch_size:
|
||||
continue
|
||||
|
||||
# Check if this is a decode request
|
||||
if seq_lens_encoder is not None:
|
||||
if i >= len(seq_lens_encoder) or int(seq_lens_encoder[i]) != 0:
|
||||
continue # Skip prefill requests
|
||||
elif prefill_count > 0:
|
||||
continue # Mixed batch without seq_lens_encoder, skip all
|
||||
|
||||
req_id = req.request_id
|
||||
req_md5_info = [
|
||||
self._compute_tensor_md5(tensor[i : i + 1], name)
|
||||
for name, tensor in tensor_dict.items()
|
||||
if tensor is not None and hasattr(tensor, "shape") and len(tensor.shape) >= 2
|
||||
]
|
||||
|
||||
if req_md5_info:
|
||||
det_logger.info(f"[DETERMINISM-MD5-REQ] {req_id} | decode | " + " | ".join(req_md5_info))
|
||||
|
||||
def log_prefill_input(self, request_id, idx, prefill_start_index, prefill_end_index, input_ids):
|
||||
"""Log prefill input details for determinism verification."""
|
||||
det_logger.info(
|
||||
f"[DETERMINISM] Prefill input - request_id: {request_id}, "
|
||||
f"idx: {idx}, prefill_start_index: {prefill_start_index}, "
|
||||
f"prefill_end_index: {prefill_end_index}, "
|
||||
f"input_ids: {input_ids}"
|
||||
)
|
||||
|
||||
def log_deterministic_input(self, forward_meta):
|
||||
"""Log determinism inference input information, supports multiple batch requests"""
|
||||
ids = forward_meta.ids_remove_padding
|
||||
req_ids = self.share_inputs.get("req_ids", None)
|
||||
seq_lens_this_time = self.share_inputs.get("seq_lens_this_time", None)
|
||||
seq_lens_encoder = self.share_inputs.get("seq_lens_encoder", None)
|
||||
seq_lens_decoder = self.share_inputs.get("seq_lens_decoder", None)
|
||||
|
||||
# Get batch size
|
||||
num_requests = len(seq_lens_this_time) if seq_lens_this_time is not None else 0
|
||||
|
||||
det_logger.info(f"[DETERMINISM-INPUT] time={time.time():.6f} | batch_size={num_requests}")
|
||||
|
||||
if num_requests == 0 or ids is None:
|
||||
det_logger.info("[DETERMINISM-INPUT] No input data")
|
||||
return
|
||||
|
||||
# Split ids for each request
|
||||
ids_list = ids.cpu().numpy().tolist()
|
||||
offset = 0
|
||||
|
||||
for i in range(num_requests):
|
||||
# Get current request information
|
||||
req_id = req_ids[i] if req_ids is not None and i < len(req_ids) else f"idx_{i}"
|
||||
seq_len = int(seq_lens_this_time[i])
|
||||
seq_len_enc = int(seq_lens_encoder[i]) if seq_lens_encoder is not None and i < len(seq_lens_encoder) else 0
|
||||
seq_len_dec = int(seq_lens_decoder[i]) if seq_lens_decoder is not None and i < len(seq_lens_decoder) else 0
|
||||
|
||||
# Get current request's tokens
|
||||
if seq_len > 0:
|
||||
request_tokens = ids_list[offset : offset + seq_len]
|
||||
else:
|
||||
request_tokens = []
|
||||
|
||||
offset += seq_len
|
||||
|
||||
# Print one line log
|
||||
det_logger.info(
|
||||
f"[DETERMINISM-INPUT] req_id={req_id} | tokens={request_tokens} | "
|
||||
f"len={seq_len} | seq_len_enc={seq_len_enc} | seq_len_dec={seq_len_dec}"
|
||||
)
|
||||
@@ -6,6 +6,10 @@ from collections import namedtuple
|
||||
from collections.abc import Callable
|
||||
from typing import Any, Dict
|
||||
|
||||
from fastdeploy.utils import get_logger
|
||||
|
||||
logger = get_logger("worker_process", "worker_process.log")
|
||||
|
||||
import paddle
|
||||
import triton
|
||||
import triton.language as tl
|
||||
@@ -137,13 +141,13 @@ def get_compute_units():
|
||||
device_properties = paddle.cuda.get_device_properties(0)
|
||||
NUM_SMS = device_properties.multi_processor_count
|
||||
except Exception:
|
||||
print("Could not get CUDA device properties. Falling back to CPU threads.")
|
||||
logger.warning("Could not get CUDA device properties. Falling back to CPU threads.")
|
||||
# TODO(liujundong): Paddle lacks a torch.get_num_threads() equivalent for the *configured* thread count.
|
||||
# Using os.cpu_count() (total logical cores) as a fallback, which may not be correct.
|
||||
# Must check downstream logic to determine if this impacts correctness.
|
||||
NUM_SMS = os.cpu_count()
|
||||
else:
|
||||
print("No CUDA device available. Using CPU.")
|
||||
logger.warning("No CUDA device available. Using CPU.")
|
||||
# For CPU, use the number of CPU cores
|
||||
NUM_SMS = os.cpu_count()
|
||||
|
||||
@@ -153,7 +157,7 @@ def get_compute_units():
|
||||
def matmul_persistent(a: paddle.Tensor, b: paddle.Tensor, bias: paddle.Tensor | None = None):
|
||||
# Check constraints.
|
||||
assert a.shape[1] == b.shape[0], "Incompatible dimensions"
|
||||
assert a.dtype == b.dtype, "Incompatible dtypes"
|
||||
assert a.dtype == b.dtype, f"Incompatible dtypes: a={a.dtype}, b={b.dtype}"
|
||||
assert bias is None or bias.dim() == 1, "Currently assuming bias is 1D, let Horace know if you run into this"
|
||||
|
||||
NUM_SMS = get_compute_units()
|
||||
@@ -210,9 +214,11 @@ def matmul_persistent(a: paddle.Tensor, b: paddle.Tensor, bias: paddle.Tensor |
|
||||
c.stride(0),
|
||||
c.stride(1), #
|
||||
NUM_SMS=NUM_SMS, #
|
||||
A_LARGE=int(a.numel() > 2**31),
|
||||
B_LARGE=int(b.numel() > 2**31),
|
||||
C_LARGE=int(c.numel() > 2**31),
|
||||
# Use M*K, K*N, M*N instead of numel() to avoid cudaErrorStreamCaptureImplicit
|
||||
# during CUDA Graph capture
|
||||
A_LARGE=int(M * K > 2**31),
|
||||
B_LARGE=int(K * N > 2**31),
|
||||
C_LARGE=int(M * N > 2**31),
|
||||
HAS_BIAS=int(bias is not None),
|
||||
# The Triton compiler (when used with Paddle) cannot handle these variables as booleans. Explicitly cast to int so the compiler can process them.
|
||||
**configs[dtype],
|
||||
@@ -477,6 +483,8 @@ def addmm_batch_invariant(
|
||||
So we use `alpha * (x @ y) + beta * input = alpha * [ (x @ y) + (beta / alpha) * input ]`
|
||||
to minimize the effection on performance
|
||||
"""
|
||||
if alpha == 0:
|
||||
return paddle.broadcast_to(beta * input, [x.shape[0], y.shape[1]])
|
||||
matmul_result = matmul_persistent(a=x, b=y, bias=input * beta / alpha)
|
||||
result = alpha * matmul_result
|
||||
return result
|
||||
@@ -490,7 +498,13 @@ def mean_batch_invariant(
|
||||
x: paddle.Tensor, axis: list[int] = [], keepdim: bool = False, dtype: paddle.dtype | None = None, out=None
|
||||
) -> paddle.Tensor:
|
||||
assert dtype is None or dtype == paddle.float32, f"unsupported dtype: {dtype}"
|
||||
if type(axis) is int:
|
||||
if axis is None: # Global mean (no axis specified)
|
||||
# Avoid x.numel() to prevent cudaErrorStreamCaptureImplicit during CUDA Graph capture
|
||||
n_elems = 1
|
||||
for s in x.shape:
|
||||
n_elems *= s
|
||||
result = paddle.sum(x, keepdim=keepdim, dtype=paddle.float32) / n_elems
|
||||
elif type(axis) is int:
|
||||
result = mean_dim(x, axis, keepdim=keepdim)
|
||||
elif len(axis) == 1: # axis: int | Sequence[int]
|
||||
result = mean_dim(x, axis[0], keepdim=keepdim)
|
||||
|
||||
@@ -99,6 +99,7 @@ from fastdeploy import envs
|
||||
from fastdeploy.engine.tasks import PoolingTask
|
||||
from fastdeploy.input.ernie4_5_vl_processor import DataProcessor
|
||||
from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient
|
||||
from fastdeploy.logger.deterministic_logger import DeterministicLogger
|
||||
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||
from fastdeploy.model_executor.layers.pool.metadata import PoolingMetadata
|
||||
from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import ScatterOp
|
||||
@@ -211,6 +212,13 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
|
||||
self.restore_chunked_prefill_request = dict()
|
||||
|
||||
# Initialize deterministic logger (only when deterministic debugging is enabled)
|
||||
self.deterministic_logger = (
|
||||
DeterministicLogger(self.share_inputs)
|
||||
if envs.FD_DETERMINISTIC_MODE and envs.FD_DETERMINISTIC_LOG_MODE
|
||||
else None
|
||||
)
|
||||
|
||||
# Initialize attention Backend
|
||||
# NOTE(gonshaotian): Currently, all attention layers share one attention backend instance.
|
||||
# In the future, we will expand it as a list.
|
||||
@@ -262,6 +270,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self.last_sampler_output = None
|
||||
self.last_post_process_event = None
|
||||
self.last_token_num = -1
|
||||
|
||||
self.enable_overlap_schedule = fd_config.scheduler_config.enable_overlap_schedule and (
|
||||
not self.speculative_decoding
|
||||
)
|
||||
@@ -777,6 +786,14 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
prompt_token_ids = request.prompt_token_ids
|
||||
input_ids = prompt_token_ids + request.output_token_ids
|
||||
prompt_len = len(prompt_token_ids)
|
||||
|
||||
# Log complete input_ids for input determinism verification
|
||||
# Note: Only current request info is logged here; batch info is logged during forward
|
||||
if self.deterministic_logger is not None:
|
||||
self.deterministic_logger.log_prefill_input(
|
||||
request.request_id, idx, prefill_start_index, prefill_end_index, input_ids
|
||||
)
|
||||
|
||||
self.share_inputs["prompt_ids"][idx : idx + 1, :prompt_len] = np.array(prompt_token_ids, dtype="int64")
|
||||
logger.debug(
|
||||
f"Handle prefill request {request} at idx {idx}, "
|
||||
@@ -1653,6 +1670,10 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
encoder_block_shape_q = 64
|
||||
decoder_block_shape_q = 16
|
||||
|
||||
# Deterministic mode: use deterministic_split_kv_size to ensure batch-invariant attention
|
||||
if envs.FD_DETERMINISTIC_MODE:
|
||||
decoder_block_shape_q = envs.FD_DETERMINISTIC_SPLIT_KV_SIZE
|
||||
|
||||
res_buffer = allocate_launch_related_buffer(
|
||||
max_batch_size=self.scheduler_config.max_num_seqs,
|
||||
max_model_len=self.model_config.max_model_len,
|
||||
@@ -2299,6 +2320,9 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
num_running_requests: int = None,
|
||||
last_token_num: int = -1,
|
||||
) -> None:
|
||||
if self.deterministic_logger is not None:
|
||||
self.deterministic_logger.log_batch_start(model_forward_batch)
|
||||
|
||||
# 1. Prepare inputs of model and sampler.
|
||||
p_done_idxs = self._get_p_done_idxs_gd(model_forward_batch, num_running_requests)
|
||||
|
||||
@@ -2423,8 +2447,22 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
)
|
||||
|
||||
# 4. Compute logits, Sample
|
||||
if self.deterministic_logger is not None:
|
||||
# Log MD5 of hidden_states (model output)
|
||||
self.deterministic_logger.log_tensor_md5s(
|
||||
{"hidden_states": hidden_states},
|
||||
forward_batch_reqs_list=self.forward_batch_reqs_list,
|
||||
stage="hidden_states",
|
||||
)
|
||||
|
||||
logits = self.model.compute_logits(hidden_states)
|
||||
|
||||
if self.deterministic_logger is not None:
|
||||
# Log MD5 of logits (before sampling)
|
||||
self.deterministic_logger.log_tensor_md5s(
|
||||
{"logits": logits}, forward_batch_reqs_list=self.forward_batch_reqs_list, stage="logits"
|
||||
)
|
||||
|
||||
if not self.speculative_decoding:
|
||||
set_value_by_flags_and_idx(
|
||||
self.share_inputs["pre_ids"],
|
||||
@@ -2441,6 +2479,14 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
p_done_idxs,
|
||||
)
|
||||
|
||||
if self.deterministic_logger is not None:
|
||||
# Log MD5 of sampling results
|
||||
self.deterministic_logger.log_tensor_md5s(
|
||||
{"sampled_token_ids": sampler_output.sampled_token_ids},
|
||||
forward_batch_reqs_list=self.forward_batch_reqs_list,
|
||||
stage="sampled_tokens",
|
||||
)
|
||||
|
||||
if (
|
||||
self.enable_logprob
|
||||
and not envs.FD_USE_GET_SAVE_OUTPUT_V1
|
||||
|
||||
@@ -43,6 +43,11 @@ class InputBatch:
|
||||
for key, value in values.items():
|
||||
setattr(self, key, value)
|
||||
|
||||
def get(self, key, default=None):
|
||||
if hasattr(self, key):
|
||||
return getattr(self, key)
|
||||
return default
|
||||
|
||||
def pop(self, key, default=None):
|
||||
"""
|
||||
Pop an attribute, similar to dict's pop method
|
||||
|
||||
@@ -1194,6 +1194,21 @@ def run_worker_proc() -> None:
|
||||
worker_proc = PaddleDisWorkerProc(fd_config, ranks, local_rank)
|
||||
worker_proc.init_control()
|
||||
|
||||
# Enable batch-invariant mode for deterministic inference.
|
||||
# This must happen AFTER worker creation but BEFORE model loading,
|
||||
# because enable_batch_invariant_mode() calls paddle.compat.enable_torch_proxy()
|
||||
# which makes torch appear available via proxy. If called before worker creation,
|
||||
# the gpu_model_runner import chain (ernie4_5_vl_processor → paddleformers →
|
||||
# transformers) will fail when transformers tries to query torch metadata.
|
||||
if envs.FD_DETERMINISTIC_MODE:
|
||||
from fastdeploy.model_executor.layers.batch_invariant_ops import (
|
||||
enable_batch_invariant_mode,
|
||||
is_batch_invariant_mode_enabled,
|
||||
)
|
||||
|
||||
if not is_batch_invariant_mode_enabled():
|
||||
enable_batch_invariant_mode()
|
||||
|
||||
# Initialize device and create model runner
|
||||
worker_proc.init_device()
|
||||
|
||||
|
||||
@@ -7,6 +7,9 @@ import paddle
|
||||
from fastdeploy.model_executor.layers.batch_invariant_ops import (
|
||||
set_batch_invariant_mode,
|
||||
)
|
||||
from fastdeploy.model_executor.layers.batch_invariant_ops.batch_invariant_ops import (
|
||||
addmm_batch_invariant,
|
||||
)
|
||||
|
||||
|
||||
class TestBatchInvariantForAddmm(unittest.TestCase):
|
||||
@@ -45,6 +48,23 @@ class TestBatchInvariantForAddmm(unittest.TestCase):
|
||||
if ass:
|
||||
assert max(difflist) == 0
|
||||
|
||||
def test_alpha_zero(self):
|
||||
"""alpha == 0: result should be beta * input broadcast to [M, N]"""
|
||||
M, N, K = 32, 64, 128
|
||||
for dtype in [paddle.float32, paddle.bfloat16]:
|
||||
x = paddle.randn([M, K], dtype=dtype)
|
||||
y = paddle.randn([K, N], dtype=dtype)
|
||||
bias = paddle.randn([N], dtype=dtype)
|
||||
|
||||
for beta in [0.0, 1.0, 2.5]:
|
||||
out = addmm_batch_invariant(bias, x, y, beta=beta, alpha=0.0)
|
||||
expected = (beta * bias).expand([M, N])
|
||||
# shape must be [M, N]
|
||||
assert out.shape == [M, N], f"Expected shape [{M}, {N}], got {out.shape}"
|
||||
# cast to float32 for comparison (bfloat16 not supported by isclose)
|
||||
diff = (out.cast(paddle.float32) - expected.cast(paddle.float32)).abs().max()
|
||||
assert diff.item() == 0, f"dtype={dtype}, beta={beta}, max diff={diff.item()}"
|
||||
|
||||
def test_case(self):
|
||||
# Test with standard Paddle (likely to show differences)
|
||||
print("Standard Paddle:")
|
||||
|
||||
@@ -0,0 +1,29 @@
|
||||
export FD_MODEL_SOURCE=HUGGINGFACE
|
||||
export FD_MODEL_CACHE=./models
|
||||
|
||||
export CUDA_VISIBLE_DEVICES=0
|
||||
export ENABLE_V1_KVCACHE_SCHEDULER=1
|
||||
|
||||
# FD_DETERMINISTIC_MODE: Toggle deterministic mode
|
||||
# 0: Disable deterministic mode (non-deterministic)
|
||||
# 1: Enable deterministic mode (default)
|
||||
# FD_DETERMINISTIC_LOG_MODE: Toggle determinism logging
|
||||
# 0: Disable logging (high performance, recommended for production)
|
||||
# 1: Enable logging with MD5 hashes (debug mode)
|
||||
# Usage: bash start_fd.sh [deterministic_mode] [log_mode]
|
||||
# Example:
|
||||
# bash start_fd.sh 1 0 # Deterministic mode without logging (fast)
|
||||
# bash start_fd.sh 1 1 # Deterministic mode with logging (debug)
|
||||
export FD_DETERMINISTIC_MODE=${1:-1}
|
||||
export FD_DETERMINISTIC_LOG_MODE=${2:-0}
|
||||
|
||||
|
||||
python -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model ./models/Qwen/Qwen2.5-7B \
|
||||
--port 8188 \
|
||||
--tensor-parallel-size 1 \
|
||||
--max-model-len 32768 \
|
||||
--enable-logprob \
|
||||
--graph-optimization-config '{"use_cudagraph":true}' \
|
||||
--no-enable-prefix-caching \
|
||||
--no-enable-output-caching
|
||||
@@ -0,0 +1,470 @@
|
||||
"""
|
||||
Determinism Feature Verification Test
|
||||
|
||||
Reference: test_batch_invariant.py. Verifies whether determinism works correctly.
|
||||
|
||||
Usage:
|
||||
# Step 1: Start server with determinism disabled
|
||||
bash ./tests/ce/deterministic/start_fd.sh 0
|
||||
|
||||
# Step 2: Run non-deterministic test (expected: results differ)
|
||||
python ./tests/ce/deterministic/test_determinism_verification.py --phase non-deterministic
|
||||
|
||||
# Step 3: Stop server
|
||||
bash fastdeploy/stop.sh
|
||||
|
||||
# Step 4: Start server with determinism enabled and logging ON
|
||||
bash ./tests/ce/deterministic/start_fd.sh 1 1
|
||||
|
||||
# Step 5: Run deterministic test (expected: results consistent)
|
||||
python ./tests/ce/deterministic/test_determinism_verification.py --phase deterministic
|
||||
|
||||
Arguments:
|
||||
--phase {deterministic,non-deterministic}
|
||||
Test mode
|
||||
- deterministic: determinism enabled with logging, expected MD5 consistency
|
||||
- non-deterministic: determinism disabled, expected different outputs
|
||||
--api-url API endpoint URL (default: http://localhost:8188/v1/chat/completions)
|
||||
--model Model name (default: Qwen/Qwen2.5-7B)
|
||||
--log-file Server log file path (default: log/workerlog.0)
|
||||
--repeat Number of repeat rounds for non-deterministic phase (default: 3)
|
||||
|
||||
Note: The deterministic test requires FD_DETERMINISTIC_LOG_MODE=1 to extract MD5 values
|
||||
from logs for verification.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import hashlib
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import re
|
||||
import sys
|
||||
|
||||
import aiohttp
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
|
||||
# Defaults (overridable via CLI args or env vars)
|
||||
DEFAULT_API_URL = "http://localhost:8188/v1/chat/completions"
|
||||
DEFAULT_MODEL_NAME = "Qwen/Qwen2.5-7B"
|
||||
DEFAULT_LOG_FILE = "log/workerlog.0"
|
||||
DEFAULT_NON_DET_REPEAT = 3
|
||||
|
||||
# Target prompt (we care about its determinism)
|
||||
TARGET_PROMPT = "你好,请简单介绍一下自己。"
|
||||
|
||||
# Distractor prompts (different content, used to create batch interference)
|
||||
DISTRACTOR_PROMPTS = [
|
||||
"今天天气怎么样?",
|
||||
"什么是人工智能?",
|
||||
"如何学习编程?",
|
||||
"什么是机器学习?",
|
||||
"Python 是什么?",
|
||||
]
|
||||
|
||||
# Generation length for target prompt (fixed, longer)
|
||||
TARGET_MAX_TOKENS = 128
|
||||
|
||||
# Generation length range for distractor prompts
|
||||
DISTRACTOR_MAX_TOKENS_RANGE = (8, 32)
|
||||
|
||||
# Health check settings
|
||||
HEALTH_CHECK_INTERVAL = 5
|
||||
HEALTH_CHECK_TIMEOUT = 300
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Determinism feature verification test")
|
||||
parser.add_argument(
|
||||
"--phase",
|
||||
choices=["deterministic", "non-deterministic"],
|
||||
required=True,
|
||||
help="Test mode: deterministic (enabled) or non-deterministic (disabled)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--api-url",
|
||||
default=os.environ.get("FD_TEST_API_URL", DEFAULT_API_URL),
|
||||
help=f"API endpoint URL (default: {DEFAULT_API_URL})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
default=os.environ.get("FD_TEST_MODEL", DEFAULT_MODEL_NAME),
|
||||
help=f"Model name (default: {DEFAULT_MODEL_NAME})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log-file",
|
||||
default=os.environ.get("FD_TEST_LOG_FILE", DEFAULT_LOG_FILE),
|
||||
help=f"Server log file path (default: {DEFAULT_LOG_FILE})",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--repeat",
|
||||
type=int,
|
||||
default=int(os.environ.get("FD_TEST_REPEAT", DEFAULT_NON_DET_REPEAT)),
|
||||
help=f"Number of repeat rounds for non-deterministic phase (default: {DEFAULT_NON_DET_REPEAT})",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def extract_md5_from_log(log_file: str, request_id: str) -> list[str]:
|
||||
"""Extract all decode step MD5 values for the specified request from log file."""
|
||||
md5_values = []
|
||||
try:
|
||||
with open(log_file, "r", encoding="utf-8", errors="ignore") as f:
|
||||
pattern = rf"\[DETERMINISM-MD5-REQ\] {re.escape(request_id)} \| decode"
|
||||
for line in f:
|
||||
if re.search(pattern, line):
|
||||
match = re.search(r"hidden_states_md5=([a-f0-9]+)", line)
|
||||
if match:
|
||||
md5_values.append(match.group(1))
|
||||
except FileNotFoundError:
|
||||
logger.warning("Log file not found: %s", log_file)
|
||||
return md5_values
|
||||
|
||||
|
||||
async def wait_for_server(api_url: str) -> None:
|
||||
"""Wait for the server to be ready by polling the API endpoint."""
|
||||
base_url = api_url.rsplit("/v1/", 1)[0]
|
||||
health_url = f"{base_url}/v1/models"
|
||||
timeout = aiohttp.ClientTimeout(total=10)
|
||||
|
||||
logger.info("Waiting for server to be ready at %s ...", base_url)
|
||||
elapsed = 0
|
||||
while elapsed < HEALTH_CHECK_TIMEOUT:
|
||||
try:
|
||||
async with aiohttp.ClientSession(timeout=timeout) as session:
|
||||
async with session.get(health_url) as resp:
|
||||
if resp.status == 200:
|
||||
logger.info("Server is ready.")
|
||||
return
|
||||
except (aiohttp.ClientError, asyncio.TimeoutError):
|
||||
pass
|
||||
await asyncio.sleep(HEALTH_CHECK_INTERVAL)
|
||||
elapsed += HEALTH_CHECK_INTERVAL
|
||||
logger.info(" Still waiting... (%ds/%ds)", elapsed, HEALTH_CHECK_TIMEOUT)
|
||||
|
||||
raise RuntimeError(
|
||||
f"Server not ready after {HEALTH_CHECK_TIMEOUT}s. "
|
||||
f"Check that the server is running and accessible at {base_url}"
|
||||
)
|
||||
|
||||
|
||||
async def send_request(
|
||||
session: aiohttp.ClientSession, api_url: str, prompt: str, request_id: str, max_tokens: int, model: str
|
||||
) -> str:
|
||||
"""Send request and return response content."""
|
||||
request = {
|
||||
"model": model,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"temperature": 0.8,
|
||||
"top_p": 0.9,
|
||||
"max_tokens": max_tokens,
|
||||
"request_id": request_id,
|
||||
}
|
||||
timeout = aiohttp.ClientTimeout(total=300)
|
||||
async with session.post(api_url, json=request, timeout=timeout) as response:
|
||||
response.raise_for_status()
|
||||
result = await response.json()
|
||||
return result["choices"][0]["message"]["content"]
|
||||
|
||||
|
||||
async def run_test_case(
|
||||
session: aiohttp.ClientSession,
|
||||
api_url: str,
|
||||
test_name: str,
|
||||
test_plan: list[tuple[str, str, bool]],
|
||||
model: str,
|
||||
) -> list[tuple[str, str]]:
|
||||
"""
|
||||
Run a test case.
|
||||
|
||||
Args:
|
||||
api_url: API endpoint URL.
|
||||
test_plan: List of (request_id, prompt, is_target) tuples.
|
||||
model: Model name to use for the request.
|
||||
|
||||
Returns:
|
||||
List of (request_id, result) tuples for target requests only.
|
||||
"""
|
||||
target_count = sum(1 for _, _, t in test_plan if t)
|
||||
distractor_count = len(test_plan) - target_count
|
||||
logger.info(
|
||||
"[Test %s] %d requests (target=%d, distractor=%d)", test_name, len(test_plan), target_count, distractor_count
|
||||
)
|
||||
|
||||
tasks = []
|
||||
for req_id, prompt, is_target in test_plan:
|
||||
max_tokens = TARGET_MAX_TOKENS if is_target else random.randint(*DISTRACTOR_MAX_TOKENS_RANGE)
|
||||
tasks.append(send_request(session, api_url, prompt, req_id, max_tokens, model))
|
||||
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
target_outputs = []
|
||||
for (req_id, _, is_target), result in zip(test_plan, results):
|
||||
marker = "[Target]" if is_target else "[Distractor]"
|
||||
logger.info(" %s %s: %s...", marker, req_id, result[:50])
|
||||
if is_target:
|
||||
target_outputs.append((req_id, result))
|
||||
|
||||
return target_outputs
|
||||
|
||||
|
||||
def _print_section(title: str) -> None:
|
||||
"""Print a section banner."""
|
||||
print("\n" + "=" * 80)
|
||||
print(title)
|
||||
print("=" * 80)
|
||||
|
||||
|
||||
def _check_consistency(
|
||||
items: dict[str, list[str]],
|
||||
label: str,
|
||||
expect_consistent: bool,
|
||||
detail_formatter=None,
|
||||
) -> bool:
|
||||
"""
|
||||
Unified consistency check logic.
|
||||
|
||||
Args:
|
||||
items: Dict mapping unique_key -> list of request_ids sharing that key.
|
||||
label: Description label (e.g. "Text", "MD5 Step 1").
|
||||
expect_consistent: True expects all keys identical, False expects differences.
|
||||
detail_formatter: Optional callable(key) -> str for displaying details on mismatch.
|
||||
|
||||
Returns:
|
||||
True if result matches expectation, False otherwise.
|
||||
"""
|
||||
expected_desc = "consistent" if expect_consistent else "inconsistent"
|
||||
_print_section(f"{label} Consistency Check (Expected: {expected_desc})")
|
||||
|
||||
if not items:
|
||||
logger.warning("No %s values found!", label)
|
||||
return False
|
||||
|
||||
is_consistent = len(items) == 1
|
||||
|
||||
print(f"\n Unique values: {len(items)}")
|
||||
if is_consistent:
|
||||
key = next(iter(items))
|
||||
reqs = items[key]
|
||||
print(f" All {len(reqs)} requests share the same value")
|
||||
else:
|
||||
for i, (key, reqs) in enumerate(items.items(), 1):
|
||||
detail = f" ({detail_formatter(key)})" if detail_formatter else ""
|
||||
print(f" Group {i}: {', '.join(reqs)}{detail}")
|
||||
|
||||
print("-" * 80)
|
||||
|
||||
passed = is_consistent == expect_consistent
|
||||
actual_desc = "consistent" if is_consistent else "inconsistent"
|
||||
status = "PASS" if passed else "FAIL"
|
||||
print(f" {status}: expected {expected_desc}, actual {actual_desc}")
|
||||
print("=" * 80)
|
||||
|
||||
return passed
|
||||
|
||||
|
||||
def compare_text_consistency(target_results: list[tuple[str, str]], expect_consistent: bool = True) -> bool:
|
||||
"""Compare target request text content against expected consistency."""
|
||||
unique_texts: dict[str, list[str]] = {}
|
||||
text_map: dict[str, str] = {}
|
||||
for req_id, text in target_results:
|
||||
text_md5 = hashlib.md5(text.encode("utf-8")).hexdigest()
|
||||
unique_texts.setdefault(text_md5, []).append(req_id)
|
||||
if text_md5 not in text_map:
|
||||
text_map[text_md5] = text
|
||||
|
||||
return _check_consistency(
|
||||
unique_texts,
|
||||
label="Text",
|
||||
expect_consistent=expect_consistent,
|
||||
detail_formatter=lambda key: repr(text_map[key][:50]),
|
||||
)
|
||||
|
||||
|
||||
def compare_md5_consistency(all_md5: dict[str, list[str]], expect_consistent: bool = True) -> bool:
|
||||
"""
|
||||
Compare MD5 results across ALL decode steps and verify against expected consistency.
|
||||
|
||||
For each decode step, checks that all target requests produced identical hidden_states_md5.
|
||||
All steps must be consistent for the overall check to pass.
|
||||
"""
|
||||
if not all_md5:
|
||||
logger.warning("No MD5 values found!")
|
||||
return False
|
||||
|
||||
# Find the minimum number of decode steps across all requests
|
||||
min_steps = min(len(md5s) for md5s in all_md5.values())
|
||||
if min_steps == 0:
|
||||
logger.warning("Some requests have no decode step MD5 values!")
|
||||
return False
|
||||
|
||||
req_ids = list(all_md5.keys())
|
||||
logger.info("Checking MD5 consistency across %d decode steps for %d requests", min_steps, len(req_ids))
|
||||
|
||||
failed_steps = []
|
||||
|
||||
for step in range(min_steps):
|
||||
step_md5s: dict[str, list[str]] = {}
|
||||
for req_id in req_ids:
|
||||
md5_val = all_md5[req_id][step]
|
||||
step_md5s.setdefault(md5_val, []).append(req_id)
|
||||
|
||||
step_consistent = len(step_md5s) == 1
|
||||
|
||||
if not step_consistent:
|
||||
failed_steps.append(step)
|
||||
|
||||
# Print per-step result
|
||||
if step_consistent:
|
||||
md5_val = next(iter(step_md5s))
|
||||
logger.info(" Decode step %d: CONSISTENT (md5=%s)", step + 1, md5_val)
|
||||
else:
|
||||
logger.warning(" Decode step %d: INCONSISTENT (%d different values)", step + 1, len(step_md5s))
|
||||
for md5_val, reqs in step_md5s.items():
|
||||
logger.warning(" md5=%s: %s", md5_val, ", ".join(reqs))
|
||||
|
||||
is_consistent = len(failed_steps) == 0
|
||||
|
||||
_print_section(f"MD5 Consistency Check (all {min_steps} decode steps)")
|
||||
if is_consistent:
|
||||
print(f" All {min_steps} decode steps are consistent across {len(req_ids)} requests")
|
||||
else:
|
||||
print(f" {len(failed_steps)}/{min_steps} decode steps are INCONSISTENT")
|
||||
print(f" Failed steps: {[s + 1 for s in failed_steps]}")
|
||||
print("-" * 80)
|
||||
|
||||
passed = is_consistent == expect_consistent
|
||||
expected_desc = "consistent" if expect_consistent else "inconsistent"
|
||||
actual_desc = "consistent" if is_consistent else "inconsistent"
|
||||
status = "PASS" if passed else "FAIL"
|
||||
print(f" {status}: expected {expected_desc}, actual {actual_desc}")
|
||||
print("=" * 80)
|
||||
|
||||
return passed
|
||||
|
||||
|
||||
# Test cases: (name, plan) where plan is [(request_id, prompt, is_target)]
|
||||
TEST_CASES = [
|
||||
(
|
||||
"case1: Single request (target only)",
|
||||
[
|
||||
("case1-target", TARGET_PROMPT, True),
|
||||
],
|
||||
),
|
||||
(
|
||||
"case2: Two requests (1 target + 1 distractor)",
|
||||
[
|
||||
("case2-distract-a", DISTRACTOR_PROMPTS[0], False),
|
||||
("case2-target", TARGET_PROMPT, True),
|
||||
],
|
||||
),
|
||||
(
|
||||
"case3: Four requests (1 target + 3 distractors)",
|
||||
[
|
||||
("case3-distract-a", DISTRACTOR_PROMPTS[0], False),
|
||||
("case3-distract-b", DISTRACTOR_PROMPTS[1], False),
|
||||
("case3-target", TARGET_PROMPT, True),
|
||||
("case3-distract-c", DISTRACTOR_PROMPTS[2], False),
|
||||
],
|
||||
),
|
||||
(
|
||||
"case4: Six requests (1 target + 5 distractors)",
|
||||
[
|
||||
("case4-distract-a", DISTRACTOR_PROMPTS[0], False),
|
||||
("case4-distract-b", DISTRACTOR_PROMPTS[1], False),
|
||||
("case4-distract-c", DISTRACTOR_PROMPTS[2], False),
|
||||
("case4-distract-d", DISTRACTOR_PROMPTS[3], False),
|
||||
("case4-target", TARGET_PROMPT, True),
|
||||
("case4-distract-e", DISTRACTOR_PROMPTS[4], False),
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def _build_test_plan(test_cases, repeat: int = 1):
|
||||
"""
|
||||
Build test plan with optional repetition.
|
||||
|
||||
For repeat > 1, each test case is duplicated with round-suffixed request_ids.
|
||||
This increases sample size for non-deterministic testing.
|
||||
"""
|
||||
if repeat <= 1:
|
||||
return test_cases
|
||||
|
||||
expanded = []
|
||||
for case_name, plan in test_cases:
|
||||
for r in range(repeat):
|
||||
round_name = f"{case_name} (round {r + 1})"
|
||||
round_plan = [(f"{req_id}-r{r + 1}", prompt, is_target) for req_id, prompt, is_target in plan]
|
||||
expanded.append((round_name, round_plan))
|
||||
return expanded
|
||||
|
||||
|
||||
async def main() -> int:
|
||||
args = parse_args()
|
||||
is_deterministic = args.phase == "deterministic"
|
||||
|
||||
_print_section("Determinism Feature Verification Test")
|
||||
print(f"\n Test mode: {args.phase}")
|
||||
print(f" API URL: {args.api_url}")
|
||||
print(f" Model: {args.model}")
|
||||
print(f" Log file: {args.log_file}")
|
||||
if is_deterministic:
|
||||
print(" Expected: All target requests have consistent MD5 values")
|
||||
else:
|
||||
print(f" Expected: Target requests produce different outputs (repeat={args.repeat})")
|
||||
print("=" * 80)
|
||||
|
||||
# Wait for server to be ready
|
||||
await wait_for_server(args.api_url)
|
||||
|
||||
# Build test plan (repeat for non-deterministic to reduce flaky probability)
|
||||
repeat = args.repeat if not is_deterministic else 1
|
||||
test_plan = _build_test_plan(TEST_CASES, repeat=repeat)
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
all_target_results: list[tuple[str, str]] = []
|
||||
for test_name, plan in test_plan:
|
||||
target_outputs = await run_test_case(session, args.api_url, test_name, plan, args.model)
|
||||
all_target_results.extend(target_outputs)
|
||||
await asyncio.sleep(1)
|
||||
|
||||
target_request_ids = [req_id for req_id, _ in all_target_results]
|
||||
|
||||
_print_section("All tests completed, starting verification...")
|
||||
|
||||
if is_deterministic:
|
||||
# Deterministic mode: compare MD5 across all decode steps
|
||||
all_md5 = {}
|
||||
for req_id in target_request_ids:
|
||||
md5_values = extract_md5_from_log(args.log_file, req_id)
|
||||
if md5_values:
|
||||
all_md5[req_id] = md5_values
|
||||
logger.info("%s: %d decode steps found", req_id, len(md5_values))
|
||||
else:
|
||||
logger.warning("%s: No MD5 logs found", req_id)
|
||||
|
||||
if all_md5:
|
||||
passed = compare_md5_consistency(all_md5, expect_consistent=True)
|
||||
else:
|
||||
logger.warning("No MD5 logs found, fallback to text consistency check")
|
||||
passed = compare_text_consistency(all_target_results, expect_consistent=True)
|
||||
else:
|
||||
# Non-deterministic mode: compare text content
|
||||
passed = compare_text_consistency(all_target_results, expect_consistent=False)
|
||||
|
||||
_print_section("Final Result")
|
||||
if passed:
|
||||
print(f" PASS: {args.phase} mode verified successfully")
|
||||
else:
|
||||
print(f" FAIL: {args.phase} mode verification failed")
|
||||
print("=" * 80)
|
||||
|
||||
return 0 if passed else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(asyncio.run(main()))
|
||||
+30
-2
@@ -11,11 +11,39 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_configure(config):
|
||||
config.addinivalue_line("markers", "gpu: mark test as requiring GPU platform")
|
||||
|
||||
|
||||
def pytest_collection_modifyitems(config, items):
|
||||
"""Skip GPU-marked tests when not on a GPU platform.
|
||||
|
||||
IMPORTANT: Do NOT import paddle or fastdeploy here. This function runs
|
||||
during pytest collection (before fork). Importing paddle initializes the
|
||||
CUDA runtime, which makes forked child processes unable to re-initialize
|
||||
CUDA (OSError: CUDA error(3), initialization error).
|
||||
"""
|
||||
import glob
|
||||
|
||||
has_gpu = len(glob.glob("/dev/nvidia[0-9]*")) > 0
|
||||
|
||||
if has_gpu:
|
||||
return
|
||||
|
||||
skip_marker = pytest.mark.skip(reason="Test requires GPU platform, skipping on non-GPU")
|
||||
for item in items:
|
||||
if "gpu" in item.keywords:
|
||||
item.add_marker(skip_marker)
|
||||
|
||||
|
||||
import time
|
||||
from typing import Any, Union
|
||||
|
||||
import pytest
|
||||
from e2e.utils.serving_utils import (
|
||||
from e2e.utils.serving_utils import ( # noqa: E402
|
||||
FD_API_PORT,
|
||||
FD_CACHE_QUEUE_PORT,
|
||||
FD_ENGINE_QUEUE_PORT,
|
||||
|
||||
@@ -0,0 +1,357 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Determinism offline inference tests using LLM.generate
|
||||
|
||||
Test scenarios:
|
||||
1. Same-prompt repeatability (FD_DETERMINISTIC_MODE=1)
|
||||
2. Batch invariance (single vs. batch, different positions)
|
||||
3. Different batch sizes consistency
|
||||
4. Sampling-parameter combinations (temperature x top_p, parametrized)
|
||||
5. Long sequence generation (512-1024 tokens)
|
||||
6. Long input prompt handling
|
||||
7. Minimal output (max_tokens=1, early stop)
|
||||
8. Special characters & multi-language prompts
|
||||
9. Multi-turn conversation
|
||||
10. State isolation (interleaved / interference prompts)
|
||||
11. Non-deterministic validation (proves tests are effective)
|
||||
|
||||
Usage:
|
||||
CUDA_VISIBLE_DEVICES=0 pytest tests/deterministic/test_determinism_offline.py -v
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
pytestmark = pytest.mark.gpu
|
||||
|
||||
DEFAULT_MODEL_DIR = "./models"
|
||||
MODEL_NAME = "Qwen2-7B-Instruct"
|
||||
|
||||
_ENV_CUDA_VISIBLE_DEVICES = "CUDA_VISIBLE_DEVICES"
|
||||
_ENV_FD_DETERMINISTIC_MODE = "FD_DETERMINISTIC_MODE"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(scope="module", autouse=True)
|
||||
def _module_env():
|
||||
"""Set env vars before importing fastdeploy (must happen first)."""
|
||||
old_cuda = os.environ.get(_ENV_CUDA_VISIBLE_DEVICES)
|
||||
old_det = os.environ.get(_ENV_FD_DETERMINISTIC_MODE)
|
||||
|
||||
os.environ[_ENV_CUDA_VISIBLE_DEVICES] = os.environ.get(_ENV_CUDA_VISIBLE_DEVICES, "0")
|
||||
os.environ[_ENV_FD_DETERMINISTIC_MODE] = "1"
|
||||
|
||||
global LLM, SamplingParams # noqa: PLW0603
|
||||
from fastdeploy import LLM, SamplingParams
|
||||
|
||||
yield
|
||||
|
||||
if old_cuda is None:
|
||||
os.environ.pop(_ENV_CUDA_VISIBLE_DEVICES, None)
|
||||
else:
|
||||
os.environ[_ENV_CUDA_VISIBLE_DEVICES] = old_cuda
|
||||
if old_det is None:
|
||||
os.environ.pop(_ENV_FD_DETERMINISTIC_MODE, None)
|
||||
else:
|
||||
os.environ[_ENV_FD_DETERMINISTIC_MODE] = old_det
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _reset_deterministic_mode():
|
||||
"""Ensure every test starts with deterministic mode ON."""
|
||||
os.environ[_ENV_FD_DETERMINISTIC_MODE] = "1"
|
||||
yield
|
||||
os.environ[_ENV_FD_DETERMINISTIC_MODE] = "1"
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def model_path():
|
||||
model_dir = os.getenv("MODEL_PATH", DEFAULT_MODEL_DIR)
|
||||
return os.path.join(model_dir, MODEL_NAME)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def llm(model_path, _module_env):
|
||||
return LLM(
|
||||
model=model_path,
|
||||
tensor_parallel_size=1,
|
||||
max_model_len=8192,
|
||||
enable_prefix_caching=False,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _generate_text(llm, prompt, sp):
|
||||
"""Generate once, return (text, token_ids)."""
|
||||
out = llm.generate([prompt], sp)[0]
|
||||
return out.outputs.text, out.outputs.token_ids
|
||||
|
||||
|
||||
def _assert_deterministic(llm, prompt, sp, runs=2):
|
||||
"""Run *runs* times and assert all outputs are identical."""
|
||||
results = [_generate_text(llm, prompt, sp) for _ in range(runs)]
|
||||
texts = [r[0] for r in results]
|
||||
token_ids = [r[1] for r in results]
|
||||
assert all(t == texts[0] for t in texts), "Text outputs differ across runs"
|
||||
assert all(t == token_ids[0] for t in token_ids), "Token IDs differ across runs"
|
||||
return texts[0], token_ids[0]
|
||||
|
||||
|
||||
# ===================== Core determinism tests =====================
|
||||
|
||||
|
||||
def test_deterministic_same_prompt(llm):
|
||||
"""Same prompt + same seed produces identical output across 5 runs."""
|
||||
sp = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50, seed=123)
|
||||
_assert_deterministic(llm, "Please introduce artificial intelligence in one sentence.", sp, runs=5)
|
||||
|
||||
|
||||
def test_deterministic_batch_invariance(llm):
|
||||
"""Target prompt produces identical output regardless of batch position."""
|
||||
prompt = "What kind of programming language is Python?"
|
||||
sp = SamplingParams(temperature=0.5, max_tokens=40, seed=456)
|
||||
|
||||
baseline, _ = _generate_text(llm, prompt, sp)
|
||||
|
||||
batch_configs = [
|
||||
[prompt, "Filler question 1"],
|
||||
["Filler question 2", prompt, "Filler question 3"],
|
||||
["Filler question 4", "Filler question 5", prompt],
|
||||
["Filler 6", "Filler 7", "Filler 8", prompt],
|
||||
]
|
||||
|
||||
for i, batch in enumerate(batch_configs):
|
||||
outputs = llm.generate(batch, sp)
|
||||
idx = batch.index(prompt)
|
||||
assert (
|
||||
outputs[idx].outputs.text == baseline
|
||||
), f"Batch config {i} (pos {idx}): result differs from single-request baseline"
|
||||
|
||||
|
||||
def test_deterministic_different_batch_sizes(llm):
|
||||
"""Same prompt is consistent across batch sizes 1 / 2 / 4 / 8."""
|
||||
prompt = "What is machine learning?"
|
||||
sp = SamplingParams(temperature=0.5, max_tokens=30, seed=789)
|
||||
|
||||
baseline, _ = _generate_text(llm, prompt, sp)
|
||||
|
||||
for bs in [2, 4, 8]:
|
||||
outputs = llm.generate([prompt] * bs, sp)
|
||||
assert outputs[0].outputs.text == baseline, f"Batch size {bs} differs from bs=1"
|
||||
|
||||
|
||||
# ===================== Sampling-parameter combinations =====================
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "temp,top_p,seed",
    [
        (0.0, 1.0, 300),  # greedy, no top_p filter
        (0.0, 0.0, 301),  # double-greedy
        (0.3, 0.9, 302),  # low temp, moderate top_p
        (0.8, 0.0, 303),  # medium temp, greedy top_p
        (0.8, 1.0, 304),  # medium temp, no top_p filter
        (0.8, 0.5, 305),  # medium temp, strict top_p
        (1.0, 0.95, 306),  # high temp
        (1.5, 0.9, 307),  # very high temp
    ],
)
def test_deterministic_param_combos(llm, temp, top_p, seed):
    """Determinism holds across various (temperature, top_p) combinations."""
    # Distinct seeds per case avoid accidental cross-case output reuse.
    sp = SamplingParams(temperature=temp, top_p=top_p, max_tokens=30, seed=seed)
    _assert_deterministic(llm, "What is a neural network?", sp)
|
||||
# ===================== Long sequence tests =====================
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "temp,seed",
    [
        (0.0, 100),
        (0.3, 130),
        (0.5, 150),
        (0.7, 170),
    ],
)
def test_deterministic_long_sequence(llm, temp, seed):
    """Long generation (512+ tokens) stays deterministic at various temperatures."""
    prompt = "Please describe the history of AI in detail, including major milestones and key technical breakthroughs."
    sp = SamplingParams(temperature=temp, top_p=0.95, max_tokens=512, seed=seed)

    text, token_ids = _assert_deterministic(llm, prompt, sp)
    # Sanity check: ensure the model actually produced a long sequence, so the
    # determinism claim covers many decode steps, not a short early stop.
    assert len(token_ids) >= 100, f"Expected >= 100 tokens, got {len(token_ids)}"
|
||||
def test_deterministic_long_prompt(llm):
    """Long input prompt (prefill-heavy) stays deterministic."""
    fragment = "This is a description about natural language processing. "
    # Repeat the fragment 50x so the prefill phase dominates the request.
    long_input = (fragment * 50) + "Please summarize the above."
    params = SamplingParams(temperature=0.5, max_tokens=100, seed=2024)

    _assert_deterministic(llm, long_input, params)
|
||||
# ===================== Minimal / boundary output tests =====================
|
||||
|
||||
|
||||
def test_deterministic_max_tokens_one(llm):
    """Single-token output is deterministic."""
    sp = SamplingParams(temperature=0.1, max_tokens=1, seed=700)

    text, token_ids = _assert_deterministic(llm, "What color is the sky?", sp)
    # Exactly one token must be emitted when max_tokens=1.
    assert len(token_ids) == 1, f"Expected 1 token, got {len(token_ids)}"
|
||||
def test_deterministic_early_stop(llm):
    """Early stopping via stop sequences is deterministic."""
    # Stop on either a Chinese full stop or an ASCII period.
    sp = SamplingParams(temperature=0.7, max_tokens=100, stop=["\u3002", "."], seed=800)

    text, token_ids = _assert_deterministic(llm, "Please list three colors:", sp)
    # If a stop sequence triggered, we must finish before the token budget.
    assert len(token_ids) < 100, f"Expected early stop, got {len(token_ids)} tokens"
|
||||
# ===================== Special input tests =====================
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "prompt,seed",
    [
        ("What is AI? \U0001f52c\U0001f9e0", 900),  # emoji
        ("Math: E = mc\u00b2", 901),  # superscript
        ("Code: def hello(): return 'world'", 902),  # code
        ("Symbols: @#$%^&*()", 903),  # special symbols
    ],
)
def test_deterministic_special_chars(llm, prompt, seed):
    """Prompts containing non-ASCII / special characters remain deterministic."""
    sp = SamplingParams(temperature=0.5, max_tokens=30, seed=seed)
    _assert_deterministic(llm, prompt, sp)
||||
|
||||
@pytest.mark.parametrize(
    "lang,prompt,seed",
    [
        ("Chinese", "Please introduce artificial intelligence in one sentence.", 1000),
        ("English", "What is artificial intelligence in one sentence?", 1001),
        (
            "Japanese",
            "\u4eba\u5de5\u77e5\u80fd\u306b\u3064\u3044\u3066\u4e00\u8a00\u3067\u8aac\u660e\u3057\u3066\u304f\u3060\u3055\u3044\u3002",
            1002,
        ),
        ("Spanish", "\u00bfQu\u00e9 es la inteligencia artificial en una frase?", 1003),
    ],
)
def test_deterministic_multi_language(llm, lang, prompt, seed):
    """Prompts in different languages/scripts remain deterministic."""
    # `lang` is only a human-readable label for the parametrized case id.
    sp = SamplingParams(temperature=0.5, max_tokens=30, seed=seed)
    _assert_deterministic(llm, prompt, sp)
||||
# ===================== Multi-turn conversation test =====================
|
||||
|
||||
|
||||
def test_deterministic_multi_turn(llm):
    """Multi-turn chat maintains determinism."""
    sp = SamplingParams(temperature=0.5, max_tokens=50, seed=1100)

    messages1 = [
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hi! How can I help you?"},
        {"role": "user", "content": "Please introduce yourself."},
    ]

    # First full conversation: turn 1, then feed its reply back as context
    # for turn 2 (mirrors a real chat session).
    r1_turn1 = llm.chat(messages1, sp)[0].outputs.text
    msgs2 = messages1 + [
        {"role": "assistant", "content": r1_turn1},
        {"role": "user", "content": "What can you do?"},
    ]
    r1_turn2 = llm.chat(msgs2, sp)[0].outputs.text

    # Second full conversation (same seed). Because turn-1 replies must match,
    # the turn-2 contexts are identical and turn-2 replies must match as well.
    r2_turn1 = llm.chat(messages1, sp)[0].outputs.text
    msgs2_repeat = messages1 + [
        {"role": "assistant", "content": r2_turn1},
        {"role": "user", "content": "What can you do?"},
    ]
    r2_turn2 = llm.chat(msgs2_repeat, sp)[0].outputs.text

    assert r1_turn1 == r2_turn1, "Multi-turn: turn-1 outputs differ"
    assert r1_turn2 == r2_turn2, "Multi-turn: turn-2 outputs differ"
||||
# ===================== State isolation test =====================
|
||||
|
||||
|
||||
def test_deterministic_state_isolation(llm):
    """Interference prompts and interleaving do not break determinism."""
    prompt_a = "What is Python?"
    prompt_b = "What is JavaScript?"
    sp_a = SamplingParams(temperature=0.5, max_tokens=30, seed=1200)
    sp_b = SamplingParams(temperature=0.5, max_tokens=30, seed=1201)

    # Round 1: record reference outputs for both prompts.
    a1, _ = _generate_text(llm, prompt_a, sp_a)
    b1, _ = _generate_text(llm, prompt_b, sp_b)

    # Run unrelated interference requests that could perturb engine state
    # (caches, batching queues) between the two measurement rounds.
    for p in ["Explain reinforcement learning.", "What is NLP?", "List 3 fruits."]:
        llm.generate([p], SamplingParams(temperature=0.7, max_tokens=20, seed=999))

    # Round 2: outputs must be unchanged despite the interleaved traffic.
    a2, _ = _generate_text(llm, prompt_a, sp_a)
    b2, _ = _generate_text(llm, prompt_b, sp_b)

    assert a1 == a2, "Prompt A: output changed after interference"
    assert b1 == b2, "Prompt B: output changed after interference"
||||
# ===================== Non-deterministic validation =====================
|
||||
|
||||
|
||||
def test_non_deterministic_validation(llm):
    """
    Prove that tests are effective:
    - Without seed + without mode: outputs vary
    - With explicit seed: outputs are consistent
    """
    prompt = "Please explain deep learning in one sentence."

    # Part 1: no mode, no seed -> outputs should differ.
    # Save the env var before popping it so this test cannot leak
    # "deterministic mode disabled" state into later tests in the session.
    saved_mode = os.environ.pop("FD_DETERMINISTIC_MODE", None)
    try:
        results_no_seed = []
        for _ in range(5):
            # A fresh SamplingParams per call so each request draws its own seed.
            sp = SamplingParams(temperature=0.7, max_tokens=30)
            results_no_seed.append(llm.generate([prompt], sp)[0].outputs.text)

        assert len(set(results_no_seed)) > 1, "Without seed/mode: expected varied outputs, got all identical"

        # Part 2: explicit seed -> outputs must be consistent.
        sp_seeded = SamplingParams(temperature=0.7, max_tokens=30, seed=999)
        results_seeded = [llm.generate([prompt], sp_seeded)[0].outputs.text for _ in range(5)]
        assert len(set(results_seeded)) == 1, "With explicit seed: expected consistent outputs"
    finally:
        # Restore the original env var (if it was set) for subsequent tests.
        if saved_mode is not None:
            os.environ["FD_DETERMINISTIC_MODE"] = saved_mode
|
||||
if __name__ == "__main__":
|
||||
pytest.main(["-sv", __file__])
|
||||
@@ -0,0 +1,296 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Determinism unit tests (lightweight, no model loading required)
|
||||
|
||||
Test scenarios:
|
||||
1. SamplingParams seed behavior in deterministic / non-deterministic mode
|
||||
2. Environment variable handling (FD_DETERMINISTIC_MODE, SPLIT_KV_SIZE, LOG_MODE)
|
||||
3. Token allocation alignment logic (_get_num_new_tokens)
|
||||
4. Cross-mode behavior validation
|
||||
|
||||
Usage:
|
||||
pytest tests/deterministic/test_determinism_standalone.py -v
|
||||
"""
|
||||
|
||||
import importlib
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _reload_sp():
    """Reload envs + sampling_params so env-var changes take effect.

    Returns:
        tuple: (sampling_params module, envs module), both freshly reloaded.
    """
    import fastdeploy.engine.sampling_params as sp_module
    import fastdeploy.envs as envs_module

    # envs must be reloaded FIRST: sampling_params reads values from it,
    # so reloading in the other order would see stale env snapshots.
    importlib.reload(envs_module)
    importlib.reload(sp_module)
    return sp_module, envs_module
|
||||
@dataclass
class _FakeRequest:
    """Minimal stand-in for a scheduler request object."""

    # Total prompt tokens that still require prefill.
    need_prefill_tokens: int
    # Tokens whose KV cache has already been computed.
    num_computed_tokens: int
    # Identifier matching the scheduler's request-id field.
    request_id: str = "fake-0"
    # Optional prompt token ids (None when not needed by the test).
    prompt_token_ids: Optional[list] = None
    # Optional multimodal payload dict (None for text-only requests).
    multimodal_inputs: Optional[dict] = None
    # Whether the request carries image inputs.
    with_image: bool = False
|
||||
def _align_tokens(current_pos, remaining, budget, split_kv_size):
|
||||
"""
|
||||
Pure-function replica of the alignment logic in
|
||||
ResourceManagerV1._get_num_new_tokens (deterministic branch).
|
||||
|
||||
Returns the number of new tokens to allocate.
|
||||
"""
|
||||
if remaining < split_kv_size:
|
||||
# Final chunk - no alignment needed
|
||||
return min(remaining, budget)
|
||||
|
||||
# Next split_kv_size boundary from current_pos
|
||||
next_boundary = ((current_pos + split_kv_size - 1) // split_kv_size) * split_kv_size
|
||||
tokens_to_boundary = next_boundary - current_pos
|
||||
|
||||
if budget < tokens_to_boundary:
|
||||
return 0 # defer
|
||||
|
||||
aligned_end = ((current_pos + budget) // split_kv_size) * split_kv_size
|
||||
num_new = aligned_end - current_pos
|
||||
return min(num_new, budget, remaining)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _clean_env():
    """Save and restore determinism-related env vars around every test."""
    keys = [
        "FD_DETERMINISTIC_MODE",
        "FD_DETERMINISTIC_SPLIT_KV_SIZE",
        "FD_DETERMINISTIC_LOG_MODE",
    ]
    # Snapshot current values (None means "was not set").
    saved = {k: os.environ.get(k) for k in keys}
    yield
    # Restore the snapshot exactly: unset keys that were absent before.
    for k, v in saved.items():
        if v is None:
            os.environ.pop(k, None)
        else:
            os.environ[k] = v
|
||||
def _set_env(key, value):
|
||||
if value is None:
|
||||
os.environ.pop(key, None)
|
||||
else:
|
||||
os.environ[key] = value
|
||||
|
||||
|
||||
# ===================== SamplingParams seed tests =====================
|
||||
|
||||
|
||||
class TestSamplingParamsSeed:
    """Verify seed assignment in SamplingParams under different modes.

    Each test reloads the modules via _reload_sp() so the env var set
    through _set_env() is actually observed by SamplingParams.
    """

    def test_non_deterministic_uses_random_seed(self):
        """Without FD_DETERMINISTIC_MODE, each SamplingParams gets a random seed."""
        _set_env("FD_DETERMINISTIC_MODE", None)
        sp_mod, _ = _reload_sp()

        # 10 draws from a random seed source should yield >1 distinct value.
        seeds = {sp_mod.SamplingParams().seed for _ in range(10)}
        assert len(seeds) > 1, "Non-deterministic mode should produce different random seeds"

    def test_deterministic_uses_fixed_seed(self):
        """With FD_DETERMINISTIC_MODE=1, default seed is always 42."""
        _set_env("FD_DETERMINISTIC_MODE", "1")
        sp_mod, _ = _reload_sp()

        seeds = {sp_mod.SamplingParams().seed for _ in range(10)}
        assert seeds == {42}, f"Deterministic mode should always use seed=42, got {seeds}"

    def test_explicit_seed_overrides_mode(self):
        """User-supplied seed takes precedence over deterministic default."""
        _set_env("FD_DETERMINISTIC_MODE", "1")
        sp_mod, _ = _reload_sp()

        assert sp_mod.SamplingParams(seed=123).seed == 123

    def test_seed_zero_is_valid(self):
        """seed=0 must not be confused with 'unset'."""
        _set_env("FD_DETERMINISTIC_MODE", "1")
        sp_mod, _ = _reload_sp()

        assert sp_mod.SamplingParams(seed=0).seed == 0

    def test_seed_max_value(self):
        """Upper-bound seed accepted by _verify_args."""
        _set_env("FD_DETERMINISTIC_MODE", "1")
        sp_mod, _ = _reload_sp()

        max_seed = 922337203685477580
        assert sp_mod.SamplingParams(seed=max_seed).seed == max_seed

    def test_explicit_seed_works_in_both_modes(self):
        """Same explicit seed yields same value regardless of mode."""
        explicit_seed = 12345
        for mode in ("0", "1"):
            _set_env("FD_DETERMINISTIC_MODE", mode)
            sp_mod, _ = _reload_sp()
            assert sp_mod.SamplingParams(seed=explicit_seed).seed == explicit_seed
|
||||
# ===================== Environment variable tests =====================
|
||||
|
||||
|
||||
class TestDeterminismEnvVars:
    """Verify env-var parsing in fastdeploy.envs.

    Env vars are set via _set_env(), then the envs module is reloaded so the
    parsed module-level constants reflect the new values.
    """

    @pytest.mark.parametrize(
        "raw,expected",
        [
            (None, False),  # unset -> defaults to disabled
            ("0", False),
            ("1", True),
        ],
    )
    def test_deterministic_mode(self, raw, expected):
        _set_env("FD_DETERMINISTIC_MODE", raw)
        _, envs_mod = _reload_sp()
        assert envs_mod.FD_DETERMINISTIC_MODE is expected

    def test_split_kv_size_default(self):
        # Unset -> documented default of 16.
        _set_env("FD_DETERMINISTIC_SPLIT_KV_SIZE", None)
        _, envs_mod = _reload_sp()
        assert envs_mod.FD_DETERMINISTIC_SPLIT_KV_SIZE == 16

    def test_split_kv_size_custom(self):
        # A custom string value must be parsed to int.
        _set_env("FD_DETERMINISTIC_SPLIT_KV_SIZE", "32")
        _, envs_mod = _reload_sp()
        assert envs_mod.FD_DETERMINISTIC_SPLIT_KV_SIZE == 32

    @pytest.mark.parametrize(
        "raw,expected",
        [
            (None, False),
            ("1", True),
        ],
    )
    def test_log_mode(self, raw, expected):
        _set_env("FD_DETERMINISTIC_LOG_MODE", raw)
        _, envs_mod = _reload_sp()
        assert envs_mod.FD_DETERMINISTIC_LOG_MODE is expected
|
||||
# ===================== Token alignment logic tests =====================
|
||||
|
||||
|
||||
class TestTokenAlignment:
    """
    Verify the deterministic token-alignment algorithm.

    The alignment logic ensures chunk boundaries fall on split_kv_size
    multiples so that attention computation is batch-invariant.
    """

    @pytest.mark.parametrize(
        "cur,remaining,budget,kv,expected",
        [
            # --- basic cases (cur=0) ---
            (0, 100, 5, 16, 0),  # budget < kv_size, defer
            (0, 100, 16, 16, 16),  # budget == kv_size
            (0, 100, 32, 16, 32),  # budget == 2*kv_size
            (0, 100, 50, 16, 48),  # round-down to 48
            # --- non-zero current_pos ---
            (10, 90, 20, 16, 6),  # next boundary=16, then end=16, alloc=6
            (8, 92, 20, 16, 8),  # next boundary=16, aligned_end=16, alloc=8
            (16, 84, 32, 16, 32),  # already on boundary
            (15, 85, 1, 16, 1),  # exactly 1 token to next boundary
            (17, 83, 2, 16, 0),  # 15 tokens to boundary=32, budget=2 => defer
            # --- final-chunk (remaining < kv_size) ---
            (96, 4, 10, 16, 4),  # final chunk, no alignment
            (96, 4, 2, 16, 2),  # final chunk, budget < remaining
            # --- large kv_size ---
            (0, 200, 100, 64, 64),  # kv=64, 100//64*64=64
            (0, 200, 128, 64, 128),  # kv=64, 128//64*64=128
        ],
    )
    def test_align_tokens(self, cur, remaining, budget, kv, expected):
        """Table-driven check of _align_tokens against hand-computed values."""
        result = _align_tokens(cur, remaining, budget, kv)
        assert result == expected, (
            f"align_tokens(cur={cur}, remaining={remaining}, budget={budget}, kv={kv}): "
            f"expected {expected}, got {result}"
        )

    def test_alignment_vs_non_deterministic(self):
        """Deterministic mode allocates fewer tokens due to alignment."""
        budget, kv = 50, 16
        det_result = _align_tokens(0, 100, budget, kv)  # 48
        non_det_result = min(100, budget)  # 50 (plain min, no alignment)
        assert det_result < non_det_result
        assert det_result == 48
        assert non_det_result == 50

    def test_result_always_on_boundary_or_final_allocation(self):
        """After allocation, (current_pos + result) sits on a kv boundary
        unless this allocation exhausts all remaining tokens."""
        kv = 16
        # Sweep a grid of positions, remainders, and budgets; any non-zero,
        # non-final allocation must land exactly on a kv-multiple boundary.
        for cur in range(0, 80, 7):
            for remaining in [5, 10, 30, 60, 100]:
                for budget in [1, 8, 16, 32, 64]:
                    result = _align_tokens(cur, remaining, budget, kv)
                    if result == 0:
                        continue  # deferred allocations carry no invariant
                    end = cur + result
                    is_final = result == remaining
                    if remaining >= kv and not is_final:
                        assert end % kv == 0, (
                            f"cur={cur} remaining={remaining} budget={budget}: " f"end={end} is not aligned to {kv}"
                        )
||||
|
||||
# ===================== Cross-mode behavior validation =====================
|
||||
|
||||
|
||||
class TestCrossModeBehavior:
    """Prove that mode switch actually changes observable behavior."""

    def test_deterministic_mode_consistent_seeds(self):
        # Mode on: every default SamplingParams must carry the fixed seed 42.
        _set_env("FD_DETERMINISTIC_MODE", "1")
        sp_mod, _ = _reload_sp()
        seeds = [sp_mod.SamplingParams().seed for _ in range(10)]
        assert len(set(seeds)) == 1 and seeds[0] == 42

    def test_non_deterministic_mode_varied_seeds(self):
        # Mode off: default seeds are randomly drawn, so 10 draws should vary.
        _set_env("FD_DETERMINISTIC_MODE", "0")
        sp_mod, _ = _reload_sp()
        seeds = [sp_mod.SamplingParams().seed for _ in range(10)]
        assert len(set(seeds)) > 1
|
||||
if __name__ == "__main__":
|
||||
pytest.main(["-sv", __file__])
|
||||
@@ -0,0 +1,212 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
All-Reduce Deterministic Test with Real Communication
|
||||
|
||||
Tests:
|
||||
1. Custom All-Reduce is deterministic for supported dtypes (float32, float16, bfloat16)
|
||||
2. Non-16 byte aligned tensors raise RuntimeError in deterministic mode
|
||||
3. Unsupported dtypes (int32) raise AssertionError in deterministic mode
|
||||
|
||||
Run:
|
||||
python -m paddle.distributed.launch --gpus=0,1,2,3 tests/distributed/allreduce_deterministic.py
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import paddle
|
||||
import paddle.distributed as dist
|
||||
import pytest
|
||||
|
||||
pytestmark = pytest.mark.gpu
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.distributed import communication
|
||||
from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
|
||||
|
||||
SUPPORTED_DTYPES = [paddle.float32, paddle.float16, paddle.bfloat16]
|
||||
TENSOR_SIZE = 2048
|
||||
NUM_RUNS = 20
|
||||
|
||||
|
||||
def _create_tensor(size: int, dtype: paddle.dtype, rank: int) -> paddle.Tensor:
    """Create a [size, 1] test tensor, scaled by (rank + 1) so ranks differ.

    int32 uses randint (randn has no integer support); floats use randn.
    """
    if dtype == paddle.int32:
        return paddle.randint(-100, 100, shape=[size, 1], dtype=dtype) * (rank + 1)
    return paddle.randn([size, 1], dtype=dtype) * (rank + 1)
||||
|
||||
def _check_results_identical(results: list) -> bool:
|
||||
"""Check if all results are identical."""
|
||||
if not results:
|
||||
return True
|
||||
return all((results[0] == r).all() for r in results[1:])
|
||||
|
||||
|
||||
def _init_custom_allreduce(world_size: int):
    """Initialize custom all-reduce for testing.

    Creates a tensor-parallel group over all ranks and registers it with the
    custom all-reduce backend using an 8MB buffer (8192 * 1024 bytes).
    """
    mp_group = dist.new_group(ranks=list(range(world_size)))
    communication.use_custom_allreduce(mp_group, 8192 * 1024)
    return mp_group
|
||||
def _enable_deterministic_mode():
    """Enable deterministic mode via environment variable.

    NOTE(review): this sets the env var after `fastdeploy.envs` was imported
    and immediately asserts the flag — presumably `envs.FD_DETERMINISTIC_MODE`
    re-reads the environment lazily; verify against the envs implementation.
    """
    os.environ["FD_DETERMINISTIC_MODE"] = "1"
    assert envs.FD_DETERMINISTIC_MODE, f"FD_DETERMINISTIC_MODE should be True but got {envs.FD_DETERMINISTIC_MODE}"
|
||||
def test_custom_allreduce_deterministic(rank, world_size, dtype):
    """Custom all-reduce should be deterministic.

    Invoked from main() under paddle.distributed.launch (not by pytest
    collection — the extra positional params have no fixtures).
    Returns True when all NUM_RUNS results are bitwise identical.
    """
    _mp_group = _init_custom_allreduce(world_size)  # noqa: F841
    results = []

    for _ in range(NUM_RUNS):
        # Re-seed per run so each iteration reduces the exact same inputs.
        paddle.seed(42 + rank)
        x = _create_tensor(TENSOR_SIZE, dtype, rank)
        result = tensor_model_parallel_all_reduce(x)
        # Cast to float32: bfloat16 has no native numpy representation.
        results.append(result.astype("float32").numpy().copy())
        dist.barrier()

    communication.custom_ar_clear_ipc_handles()
    return _check_results_identical(results)
|
||||
def _init_large_custom_allreduce(world_size: int):
    """Initialize custom all-reduce with 128MB buffer for large tensor tests."""
    _enable_deterministic_mode()
    large_max_size = 128 * 1024 * 1024  # 128MB
    mp_group = dist.new_group(ranks=list(range(world_size)))
    # Properly close old instance to free GPU buffers and IPC handles
    # before registering the larger-buffered replacement.
    if communication._TP_AR is not None:
        communication._TP_AR.close()
        communication._TP_AR = None
    communication.use_custom_allreduce(mp_group, large_max_size)
|
||||
def test_large_tensor_correctness(rank, world_size, dtype):
    """Large tensor (> default 8MB) should produce correct results with increased max_size.

    Invoked from main() after _init_large_custom_allreduce; each rank
    contributes the constant (rank + 1), so every reduced element must equal
    sum(1..world_size) = world_size * (world_size + 1) / 2.
    """
    # 2M elements * 2 bytes (bf16) = 4MB; 8M elements * 2 bytes = 16MB (> 8MB default)
    large_sizes = [2 * 1024 * 1024, 8 * 1024 * 1024]
    for large_size in large_sizes:
        expected_val = float(world_size * (world_size + 1) // 2)
        x = paddle.full([large_size, 1], float(rank + 1), dtype=dtype)
        result = tensor_model_parallel_all_reduce(x)

        # Cast to float32 before numpy() since bfloat16 has no native numpy support
        result_np = result.astype("float32").numpy().flatten()
        max_diff = abs(result_np - expected_val).max()
        # Loose tolerance (0.01) absorbs low-precision dtype rounding.
        if max_diff > 0.01:
            raise AssertionError(
                f"Large tensor AR mismatch for {dtype}, size={large_size}: "
                f"expected={expected_val}, got_sample={result_np[:5]}, max_diff={max_diff}"
            )
        dist.barrier()
|
||||
def test_large_tensor_deterministic(rank, world_size, dtype):
    """Multiple runs of large tensor all-reduce must produce bitwise-identical results.

    Returns True when all NUM_RUNS results match exactly. Assumes the 128MB
    custom all-reduce instance was already installed by
    _init_large_custom_allreduce().
    """
    # 8M elements * 2 bytes (bf16) = 16MB, exceeds default 8MB
    large_size = 8 * 1024 * 1024
    results = []
    for _ in range(NUM_RUNS):
        # Re-seed per run so every iteration reduces identical inputs.
        paddle.seed(42 + rank)
        x = _create_tensor(large_size, dtype, rank)
        result = tensor_model_parallel_all_reduce(x)
        results.append(result.astype("float32").numpy().copy())
        dist.barrier()

    return _check_results_identical(results)
|
||||
def test_non_16_aligned_raises_error(rank, world_size):
    """Non-16 byte aligned tensors should raise RuntimeError in deterministic mode."""
    _enable_deterministic_mode()
    mp_group = _init_custom_allreduce(world_size)
    # 1026 * 4 = 4104 bytes (NOT multiple of 16)
    x = paddle.to_tensor([1.0] * 1026, dtype=paddle.float32).reshape([1026, 1])

    try:
        with pytest.raises(RuntimeError, match="DETERMINISTIC_MODE.*multiple of 16"):
            tensor_model_parallel_all_reduce(x, group_=mp_group)
    finally:
        # Always release IPC handles even when the expected error is raised.
        communication.custom_ar_clear_ipc_handles()
|
||||
def test_unsupported_dtype_raises_error(rank, world_size):
    """Unsupported dtypes should raise AssertionError in deterministic mode."""
    _enable_deterministic_mode()
    mp_group = _init_custom_allreduce(world_size)
    # int32 is outside SUPPORTED_DTYPES for the deterministic all-reduce path.
    x = _create_tensor(TENSOR_SIZE, paddle.int32, rank)

    try:
        with pytest.raises(AssertionError, match="DETERMINISTIC_MODE.*not supported"):
            tensor_model_parallel_all_reduce(x, group_=mp_group)
    finally:
        # Always release IPC handles even when the expected error is raised.
        communication.custom_ar_clear_ipc_handles()
|
||||
def main():
    """Distributed driver: run all all-reduce determinism tests in order.

    Launched on every rank via `paddle.distributed.launch`; barriers keep
    ranks in lockstep between test phases. Raises on any failure.
    """
    if not dist.is_initialized():
        paddle.distributed.init_parallel_env()

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    # All-reduce across a single rank is meaningless for this test.
    assert world_size >= 2, f"Test requires at least 2 GPUs, got {world_size}"

    print(f"All-Reduce Deterministic Test (world_size={world_size}, runs={NUM_RUNS})")

    # Error path tests
    test_non_16_aligned_raises_error(rank, world_size)
    print("PASS: non-16 byte aligned tensor raises RuntimeError")
    dist.barrier()

    test_unsupported_dtype_raises_error(rank, world_size)
    print("PASS: unsupported dtype (int32) raises AssertionError")
    dist.barrier()

    # Determinism tests for supported dtypes (small tensors)
    for dtype in SUPPORTED_DTYPES:
        assert test_custom_allreduce_deterministic(
            rank, world_size, dtype
        ), f"Custom all-reduce is NOT deterministic for {dtype}"
        print(f"PASS: custom all-reduce deterministic for {dtype}")
        dist.barrier()

    # Large tensor tests (> default 8MB, using increased max_size)
    # Create one 128MB instance shared by all dtype tests to avoid IPC buffer leaks
    _init_large_custom_allreduce(world_size)

    for dtype in SUPPORTED_DTYPES:
        test_large_tensor_correctness(rank, world_size, dtype)
        print(f"PASS: large tensor all-reduce correctness for {dtype}")
        dist.barrier()

    for dtype in SUPPORTED_DTYPES:
        assert test_large_tensor_deterministic(
            rank, world_size, dtype
        ), f"Large tensor all-reduce is NOT deterministic for {dtype}"
        print(f"PASS: large tensor all-reduce deterministic for {dtype}")
        dist.barrier()

    # Final cleanup of the shared 128MB instance's IPC handles.
    communication.custom_ar_clear_ipc_handles()

    print("All tests passed.")
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,36 @@
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
|
||||
pytestmark = pytest.mark.gpu
|
||||
|
||||
|
||||
def test_rollout_model_with_distributed_launch():
    """
    Launch allreduce_deterministic.py under paddle.distributed.launch on two
    GPUs and assert the child process exits cleanly (exit code 0).
    """
    current_dir = os.path.dirname(os.path.abspath(__file__))
    rollout_script = os.path.join(current_dir, "allreduce_deterministic.py")

    # Two-GPU launch; the script itself asserts world_size >= 2.
    command = [sys.executable, "-m", "paddle.distributed.launch", "--gpus", "0,1", rollout_script]

    print(f"Executing command: {' '.join(command)}")

    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

    try:
        # 5-minute cap: distributed init + 20 runs per dtype can be slow.
        stdout, stderr = process.communicate(timeout=300)
        return_code = process.returncode
    except subprocess.TimeoutExpired:
        # Kill the hung launcher and drain its pipes; -1 marks the timeout.
        process.kill()
        stdout, stderr = process.communicate()
        return_code = -1

    # Echo child output so CI logs show the per-rank PASS/FAIL lines.
    print("\n" + "=" * 50 + " STDOUT " + "=" * 50)
    print(stdout)
    print("\n" + "=" * 50 + " STDERR " + "=" * 50)
    print(stderr)

    assert return_code == 0, f"Process exited with code {return_code}\nSTDERR: {stderr[-500:] if stderr else 'N/A'}"
||||
@@ -0,0 +1,84 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from fastdeploy.engine.sampling_params import SamplingParams
|
||||
|
||||
MAX_SEED = 922337203685477580
|
||||
|
||||
|
||||
class TestSamplingParamsDeterminism(unittest.TestCase):
    """Test SamplingParams deterministic seed behavior"""

    # Env vars saved/cleared in setUp and restored in tearDown.
    _ENV_KEYS = ("FD_DETERMINISTIC_MODE",)

    def setUp(self):
        """Save and clear deterministic env vars"""
        # pop() both records the prior value (None = unset) and clears it.
        self._saved_env = {k: os.environ.pop(k, None) for k in self._ENV_KEYS}

    def tearDown(self):
        """Restore original env vars"""
        for key, value in self._saved_env.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value

    def test_fixed_seed_in_deterministic_mode(self):
        """seed=None should always resolve to 42 when FD_DETERMINISTIC_MODE=1"""
        os.environ["FD_DETERMINISTIC_MODE"] = "1"

        for _ in range(5):
            params = SamplingParams(seed=None)
            self.assertEqual(params.seed, 42)

    def test_random_seed_in_non_deterministic_mode(self):
        """seed=None should produce varying seeds when FD_DETERMINISTIC_MODE=0"""
        os.environ["FD_DETERMINISTIC_MODE"] = "0"

        # 10 random draws should yield at least 2 distinct seeds.
        seeds = {SamplingParams(seed=None).seed for _ in range(10)}
        self.assertGreaterEqual(len(seeds), 2)

    def test_explicit_seed_respected_in_both_modes(self):
        """Explicit seed values should be kept regardless of deterministic mode"""
        test_seeds = [0, 1, 100, MAX_SEED]
        for mode in ("0", "1"):
            os.environ["FD_DETERMINISTIC_MODE"] = mode
            for seed in test_seeds:
                params = SamplingParams(seed=seed)
                self.assertEqual(params.seed, seed)

    def test_seed_out_of_range_rejected(self):
        """Seeds outside [0, MAX_SEED] should raise ValueError"""
        with self.assertRaises(ValueError):
            SamplingParams(seed=-1)

        with self.assertRaises(ValueError):
            SamplingParams(seed=MAX_SEED + 1)

    def test_env_switch_changes_behavior(self):
        """Switching FD_DETERMINISTIC_MODE at runtime should affect subsequent SamplingParams"""
        # NOTE(review): unlike the standalone tests, this relies on
        # SamplingParams observing env changes without a module reload —
        # presumably the flag is read per-instantiation; verify.
        os.environ["FD_DETERMINISTIC_MODE"] = "1"
        params_det = SamplingParams(seed=None)
        self.assertEqual(params_det.seed, 42)

        os.environ["FD_DETERMINISTIC_MODE"] = "0"
        seeds = {SamplingParams(seed=None).seed for _ in range(10)}
        # At least some seeds should differ from the fixed value
        self.assertGreaterEqual(len(seeds), 2)
|
||||
if __name__ == "__main__":
|
||||
unittest.main(verbosity=2)
|
||||
@@ -0,0 +1,345 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import time
|
||||
import unittest
|
||||
from multiprocessing.shared_memory import SharedMemory
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from fastdeploy.inter_communicator.ipc_signal import IPCSignal, shared_memory_exists
|
||||
|
||||
|
||||
class TestSharedMemoryExists(unittest.TestCase):
    """Test cases for shared_memory_exists function."""

    def test_returns_false_for_nonexistent_memory(self):
        """Test that shared_memory_exists returns False for non-existent shared memory."""
        # A timestamped name is effectively guaranteed not to exist yet.
        missing_name = f"nonexistent_shm_{time.time()}"
        self.assertFalse(shared_memory_exists(missing_name))

    def test_returns_true_for_existing_memory(self):
        """Test that shared_memory_exists returns True for existing shared memory."""
        shm_name = f"test_shm_{time.time()}"
        segment = SharedMemory(name=shm_name, create=True, size=1024)
        try:
            self.assertTrue(shared_memory_exists(shm_name))
        finally:
            # Best-effort cleanup: close our handle, then remove the segment.
            try:
                segment.close()
                segment.unlink()
            except Exception:
                pass
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "dtype,shape,initial_value",
    [
        (np.int32, (10,), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
        (np.float32, (5,), [0.0, 1.5, 2.5, 3.5, 4.5]),
        (np.int64, (3, 3), [[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
        (np.uint8, (4,), [0, 127, 200, 255]),
    ],
)
def test_ipc_signal_create_with_array(dtype, shape, initial_value):
    """Test IPCSignal creation with numpy array."""
    shm_name = f"test_ipc_signal_{time.time()}"
    template = np.array(initial_value, dtype=dtype)

    ipc_signal = IPCSignal(name=shm_name, array=template, dtype=dtype, create=True)
    try:
        # The shared view must carry both the data and the dtype of the template.
        np.testing.assert_array_equal(ipc_signal.value, template)
        np.testing.assert_equal(ipc_signal.value.dtype, dtype)

        # The backing segment must be visible under the chosen name.
        assert shared_memory_exists(shm_name)
    finally:
        # Best-effort cleanup even when the assertions above fail.
        try:
            ipc_signal.clear()
        except Exception:
            pass
|
||||
|
||||
|
||||
class TestIPCSignal(unittest.TestCase):
    """Test cases for IPCSignal class.

    All signals created in a test (including attach-only instances) should be
    registered via _track so tearDown can release their shared-memory handles.
    """

    def setUp(self):
        """Set up test fixtures."""
        # Timestamped base name keeps runs from colliding with stale segments.
        self.test_name_base = f"test_ipc_signal_{time.time()}"
        self._signals_to_clean = []

    def tearDown(self):
        """Clean up all tracked signals."""
        for signal in self._signals_to_clean:
            try:
                signal.clear()
            except Exception:
                # Already cleared / unlinked by the test body; nothing to do.
                pass

    def _track(self, signal):
        """Register a signal for automatic cleanup in tearDown."""
        self._signals_to_clean.append(signal)
        return signal

    def test_create_with_suffix(self):
        """Test IPCSignal creation with suffix."""
        name = self.test_name_base
        suffix = 123

        array = np.array([1, 2, 3], dtype=np.int32)
        signal = self._track(IPCSignal(name=name, array=array, dtype=np.int32, suffix=suffix, create=True))

        # The suffix is appended to the base name with a dot separator.
        expected_name = f"{name}.{suffix}"
        self.assertTrue(shared_memory_exists(expected_name))
        np.testing.assert_array_equal(signal.value, array)

    def test_attach_to_existing(self):
        """Test IPCSignal attaching to existing shared memory."""
        name = f"{self.test_name_base}_attach"
        array = np.array([10, 20, 30], dtype=np.int64)

        # Create shared memory
        signal1 = self._track(IPCSignal(name=name, array=array, dtype=np.int64, create=True))
        signal1.value[0] = 99  # Modify value

        # Attach to existing — track it too so its handle is closed in tearDown
        # (previously the attached instance was leaked).
        signal2 = self._track(IPCSignal(name=name, array=array, dtype=np.int64, create=False))

        # Verify value is shared
        self.assertEqual(signal2.value[0], 99)
        np.testing.assert_array_equal(signal2.value, signal1.value)

    def test_dtype_mismatch_raises_assertion(self):
        """Test that dtype mismatch raises AssertionError."""
        name = f"{self.test_name_base}_mismatch"
        array = np.array([1, 2, 3], dtype=np.int32)

        with self.assertRaises(AssertionError):
            IPCSignal(name=name, array=array, dtype=np.float32, create=True)

    def test_non_numpy_array_raises_assertion(self):
        """Test that non-numpy array raises AssertionError."""
        name = f"{self.test_name_base}_non_array"

        with self.assertRaises(AssertionError):
            IPCSignal(name=name, array=[1, 2, 3], dtype=np.int32, create=True)

    def test_create_with_shm_size(self):
        """Test IPCSignal creation with shm_size (no array)."""
        name = f"{self.test_name_base}_size"

        signal = self._track(IPCSignal(name=name, shm_size=1024, create=True))

        # Verify signal is created but value is None (no array template)
        self.assertTrue(shared_memory_exists(name))
        self.assertIsNone(signal.value)

    def test_attach_with_shm_size(self):
        """Test IPCSignal attach with shm_size (no array)."""
        name = f"{self.test_name_base}_attach_size"

        # Create
        self._track(IPCSignal(name=name, shm_size=512, create=True))

        # Attach — tracked so the extra handle is released in tearDown.
        signal2 = self._track(IPCSignal(name=name, shm_size=512, create=False))

        self.assertTrue(shared_memory_exists(name))
        self.assertIsNone(signal2.value)

    def test_shm_size_required_without_array_and_dtype(self):
        """Test that shm_size is required when array and dtype are None."""
        name = f"{self.test_name_base}_no_size"

        with self.assertRaises(AssertionError):
            IPCSignal(name=name, create=True)

    def test_clear_removes_shared_memory(self):
        """Test that clear() properly removes shared memory."""
        name = f"{self.test_name_base}_clear"
        array = np.array([1, 2, 3], dtype=np.int32)

        signal = IPCSignal(name=name, array=array, dtype=np.int32, create=True)
        self.assertTrue(shared_memory_exists(name))

        signal.clear()
        self.assertFalse(shared_memory_exists(name))

    def test_clear_idempotent(self):
        """Test that clear() can be called multiple times safely."""
        name = f"{self.test_name_base}_idempotent"
        array = np.array([1, 2, 3], dtype=np.int32)

        signal = IPCSignal(name=name, array=array, dtype=np.int32, create=True)

        # Should not raise exception
        signal.clear()
        signal.clear()  # Call again

    def test_value_sharing_between_processes_mock(self):
        """Test that value is shared (mocked for unit test)."""
        name = f"{self.test_name_base}_shared"
        array = np.array([100, 200, 300], dtype=np.int64)

        signal1 = self._track(IPCSignal(name=name, array=array, dtype=np.int64, create=True))
        signal2 = self._track(IPCSignal(name=name, array=array, dtype=np.int64, create=False))

        # Modify through signal1
        signal1.value[0] = 999
        signal1.value[1] = 888
        signal1.value[2] = 777

        # Verify signal2 sees changes
        self.assertEqual(signal2.value[0], 999)
        self.assertEqual(signal2.value[1], 888)
        self.assertEqual(signal2.value[2], 777)

    def test_multiple_array_creation_replaces_existing(self):
        """Test that creating with same name replaces existing shared memory."""
        name = f"{self.test_name_base}_replace"
        array1 = np.array([1, 2, 3], dtype=np.int32)
        array2 = np.array([4, 5, 6], dtype=np.int32)

        signal1 = IPCSignal(name=name, array=array1, dtype=np.int32, create=True)
        signal1.clear()

        signal2 = self._track(IPCSignal(name=name, array=array2, dtype=np.int32, create=True))

        np.testing.assert_array_equal(signal2.value, array2)

    def test_clear_closes_and_unlinks(self):
        """Test that clear() both closes and unlinks the shared memory."""
        name = f"{self.test_name_base}_unlink"
        array = np.array([1, 2, 3], dtype=np.int32)

        signal = IPCSignal(name=name, array=array, dtype=np.int32, create=True)

        # After clear, the shared memory should be removed
        signal.clear()
        self.assertFalse(shared_memory_exists(name))

        # Attempting to attach must fail. assertRaises replaces the previous
        # try/self.fail pattern, which also leaked a handle when the attach
        # unexpectedly succeeded.
        with self.assertRaises(FileNotFoundError):
            SharedMemory(name=name, create=False)

    def test_raw_buffer_read_write_with_shm_size(self):
        """Test raw buffer read/write in shm_size mode."""
        name = f"{self.test_name_base}_raw_buf"
        data = b"hello ipc signal"

        signal1 = self._track(IPCSignal(name=name, shm_size=1024, create=True))
        signal1.shm.buf[: len(data)] = data

        signal2 = self._track(IPCSignal(name=name, shm_size=1024, create=False))
        self.assertEqual(bytes(signal2.shm.buf[: len(data)]), data)

    def test_create_overwrites_existing_without_clear(self):
        """Test that create=True on existing name auto-unlinks and recreates."""
        name = f"{self.test_name_base}_overwrite"
        array1 = np.array([1, 2, 3], dtype=np.int32)
        array2 = np.array([7, 8, 9], dtype=np.int32)

        # Create first signal, do NOT clear it during the test — tearDown will
        # release its handle after the assertion runs.
        self._track(IPCSignal(name=name, array=array1, dtype=np.int32, create=True))

        # Create again with same name — should auto-unlink old and recreate
        signal2 = self._track(IPCSignal(name=name, array=array2, dtype=np.int32, create=True))
        np.testing.assert_array_equal(signal2.value, array2)

    def test_attach_nonexistent_raises_error(self):
        """Test that create=False on non-existent shm raises FileNotFoundError."""
        name = f"nonexistent_signal_{time.time()}"
        array = np.array([1, 2, 3], dtype=np.int32)

        with self.assertRaises(FileNotFoundError):
            IPCSignal(name=name, array=array, dtype=np.int32, create=False)
|
||||
|
||||
|
||||
class TestIPCSignalEdgeCases(unittest.TestCase):
    """Test edge cases for IPCSignal.

    Cleanup of created signals is centralized in _create_signal via
    addCleanup, replacing four copies of the same try/finally boilerplate.
    """

    def _create_signal(self, name, array, dtype):
        """Create an IPCSignal and schedule best-effort cleanup for it."""
        signal = IPCSignal(name=name, array=array, dtype=dtype, create=True)

        def _cleanup():
            try:
                signal.clear()
            except Exception:
                # Cleanup is best-effort; the segment may already be gone.
                pass

        self.addCleanup(_cleanup)
        return signal

    def test_empty_array_raises_error(self):
        """Test IPCSignal with empty array raises ValueError due to nbytes=0."""
        name = f"test_empty_array_{time.time()}"
        array = np.array([], dtype=np.int32)

        with self.assertRaises(ValueError):
            IPCSignal(name=name, array=array, dtype=np.int32, create=True)

    def test_large_array(self):
        """Test IPCSignal with large array."""
        name = f"test_large_array_{time.time()}"
        size = 10000
        array = np.arange(size, dtype=np.int64)

        signal = self._create_signal(name, array, np.int64)
        np.testing.assert_array_equal(signal.value, array)

    def test_multidimensional_array(self):
        """Test IPCSignal with multidimensional array."""
        name = f"test_multi_array_{time.time()}"
        array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int32)

        signal = self._create_signal(name, array, np.int32)
        self.assertEqual(signal.value.shape, (3, 3))
        np.testing.assert_array_equal(signal.value, array)

    def test_different_numeric_types(self):
        """Test IPCSignal with different numeric types."""
        name_base = f"test_types_{time.time()}"

        test_cases = [
            (np.int8, [1, 2, 3]),
            (np.int16, [1000, 2000, 3000]),
            (np.int32, [100000, 200000, 300000]),
            (np.int64, [1000000000, 2000000000, 3000000000]),
            (np.float32, [1.5, 2.5, 3.5]),
            (np.float64, [1.123456789, 2.987654321, 3.5]),
        ]

        for i, (dtype, values) in enumerate(test_cases):
            name = f"{name_base}_{i}"
            array = np.array(values, dtype=dtype)
            signal = self._create_signal(name, array, dtype)
            np.testing.assert_array_equal(signal.value, array)
|
||||
|
||||
|
||||
# Allow running this test module directly with the default unittest runner.
if __name__ == "__main__":
    unittest.main()
|
||||
@@ -0,0 +1,177 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Flash Attention V2 / V3 determinism tests.
|
||||
|
||||
Verify bitwise determinism of flash-backend SDPA when explicitly
|
||||
selecting FA version via FLAGS_flash_attn_version (2 or 3).
|
||||
"""
|
||||
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
|
||||
pytestmark = pytest.mark.gpu
|
||||
|
||||
import paddle
|
||||
import paddle.nn.functional as F
|
||||
|
||||
# --------------- constants ---------------
|
||||
BATCH_SIZE = 2
|
||||
NUM_HEADS = 32
|
||||
HEAD_DIM = 64
|
||||
SEQ_LEN = 2048
|
||||
NUM_RUNS = 5
|
||||
|
||||
|
||||
# --------------- helpers ---------------
|
||||
def _make_qkv(batch_size, num_heads, seq_len, head_dim, dtype="float16", seed=42):
    """Create deterministic q/k/v tensors."""
    # Re-seeding immediately before generation makes the three draws reproducible.
    paddle.seed(seed)
    qkv_shape = [batch_size, num_heads, seq_len, head_dim]
    query = paddle.randn(qkv_shape, dtype=dtype)
    key = paddle.randn(qkv_shape, dtype=dtype)
    value = paddle.randn(qkv_shape, dtype=dtype)
    return query, key, value
|
||||
|
||||
|
||||
def _assert_deterministic(test_case, func, num_runs=NUM_RUNS):
    """Run *func* multiple times and assert all results are bitwise equal."""
    # First invocation is the reference; each later one is compared against it.
    baseline = func().clone()
    for run_idx in range(1, num_runs):
        candidate = func().clone()
        test_case.assertTrue(
            paddle.equal(baseline, candidate).all().item(),
            f"Run 0 vs Run {run_idx} differ",
        )
|
||||
|
||||
|
||||
# --------------- test class ---------------
|
||||
class TestFlashAttentionVersionsDeterminism(unittest.TestCase):
    """Test determinism when switching between FA2 and FA3."""

    # Flash Attention versions exercised by every test below.
    FA_VERSIONS = [2, 3]

    def setUp(self):
        # These tests only make sense on a CUDA build.
        if not paddle.is_compiled_with_cuda():
            self.skipTest("Flash Attention requires CUDA")
        paddle.set_device("gpu")
        # Save/restore flag to avoid cross-test pollution
        self._saved_version = paddle.base.framework.get_flags(["FLAGS_flash_attn_version"])["FLAGS_flash_attn_version"]

    def tearDown(self):
        # Restore the flag captured in setUp so later tests see the original value.
        paddle.set_flags({"FLAGS_flash_attn_version": self._saved_version})

    def _skip_if_fa3_unsupported(self):
        # Derive the SM number (e.g. 90 for Hopper) from the device properties.
        prop = paddle.device.cuda.get_device_properties()
        sm = prop.major * 10 + prop.minor
        if sm < 89 or sm >= 100:
            self.skipTest(f"FA3 requires SM89-SM99, current SM{sm}")

    def _set_fa_version(self, version):
        # FA3 has a hardware gate; FA2 is assumed available on any CUDA build here.
        if version == 3:
            self._skip_if_fa3_unsupported()
        paddle.set_flags({"FLAGS_flash_attn_version": version})

    def _flash_sdpa(self, q, k, v, **kwargs):
        """Thin wrapper: synchronize then call flash-backend SDPA."""
        paddle.device.synchronize()
        return F.scaled_dot_product_attention(q, k, v, backend="flash", **kwargs)

    # ==================== tests ====================

    def test_determinism(self):
        """Multi-run determinism for FA2/FA3, causal and non-causal."""
        for version in self.FA_VERSIONS:
            for is_causal in [False, True]:
                with self.subTest(version=version, is_causal=is_causal):
                    self._set_fa_version(version)
                    q, k, v = _make_qkv(BATCH_SIZE, NUM_HEADS, SEQ_LEN, HEAD_DIM)
                    # NOTE: the lambda closes over loop-local q/k/v but is
                    # invoked inside _assert_deterministic before the loop
                    # advances, so late binding is harmless here.
                    _assert_deterministic(
                        self,
                        lambda: self._flash_sdpa(q, k, v, is_causal=is_causal, enable_gqa=False),
                    )

    def test_batch_invariance(self):
        """First-sample result should be identical across batch sizes."""
        for version in self.FA_VERSIONS:
            with self.subTest(version=version):
                self._set_fa_version(version)
                max_bs = 8
                q, k, v = _make_qkv(max_bs, NUM_HEADS, SEQ_LEN, HEAD_DIM)

                # Reference: batch of one. Larger batches must reproduce it
                # bitwise in their first sample.
                ref = self._flash_sdpa(q[:1], k[:1], v[:1], is_causal=False, enable_gqa=False)
                for bs in [2, 4, 8]:
                    result = self._flash_sdpa(q[:bs], k[:bs], v[:bs], is_causal=False, enable_gqa=False)
                    self.assertTrue(
                        paddle.equal(ref, result[0:1]).all().item(),
                        f"FA{version} batch invariance failed at bs={bs}",
                    )

    def test_seq_length_determinism(self):
        """Determinism across various sequence lengths (including boundaries)."""
        # Powers of two plus tiny lengths to hit kernel tiling boundaries.
        seq_lengths = [1, 2, 4, 8, 16, 64, 128, 256, 512, 1024, 2048, 4096]
        for version in self.FA_VERSIONS:
            for seq_len in seq_lengths:
                with self.subTest(version=version, seq_len=seq_len):
                    self._set_fa_version(version)
                    q, k, v = _make_qkv(BATCH_SIZE, NUM_HEADS, seq_len, HEAD_DIM)
                    _assert_deterministic(
                        self,
                        lambda: self._flash_sdpa(q, k, v, is_causal=False, enable_gqa=False),
                        num_runs=2,
                    )

    def test_dtype_determinism(self):
        """Determinism across float16 and float32."""
        for version in self.FA_VERSIONS:
            for dtype in ["float16", "float32"]:
                with self.subTest(version=version, dtype=dtype):
                    self._set_fa_version(version)
                    q, k, v = _make_qkv(BATCH_SIZE, NUM_HEADS, SEQ_LEN, HEAD_DIM, dtype=dtype)
                    _assert_deterministic(
                        self,
                        lambda: self._flash_sdpa(q, k, v, is_causal=False, enable_gqa=False),
                        num_runs=3,
                    )

    def test_head_config_determinism(self):
        """Determinism across different head configurations."""
        for version in self.FA_VERSIONS:
            # Includes a non-power-of-two head count (7) as an odd-shape case.
            for num_heads, head_dim in [(1, 64), (7, 64), (32, 64)]:
                with self.subTest(version=version, num_heads=num_heads, head_dim=head_dim):
                    self._set_fa_version(version)
                    q, k, v = _make_qkv(BATCH_SIZE, num_heads, SEQ_LEN, head_dim)
                    _assert_deterministic(
                        self,
                        lambda: self._flash_sdpa(q, k, v, is_causal=False, enable_gqa=False),
                        num_runs=2,
                    )

    def test_gqa_determinism(self):
        """Determinism with GQA enabled."""
        for version in self.FA_VERSIONS:
            with self.subTest(version=version):
                self._set_fa_version(version)
                q, k, v = _make_qkv(BATCH_SIZE, NUM_HEADS, SEQ_LEN, HEAD_DIM)
                _assert_deterministic(
                    self,
                    lambda: self._flash_sdpa(q, k, v, is_causal=False, enable_gqa=True),
                    num_runs=3,
                )
|
||||
|
||||
|
||||
# Allow running this test module directly; verbosity=2 prints one line per test.
if __name__ == "__main__":
    unittest.main(verbosity=2)
|
||||
@@ -0,0 +1,472 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Chunked Prefill Determinism Tests
|
||||
|
||||
Test _get_num_new_tokens alignment behavior in ResourceManagerV1:
|
||||
1. Deterministic disabled (no alignment)
|
||||
2. Deterministic enabled (split_kv_size boundary alignment)
|
||||
3. Boundary cases
|
||||
4. Continuous chunk consistency
|
||||
5. Multimodal inputs (image / video / audio)
|
||||
6. Real batch scheduling scenarios
|
||||
7. Corner cases (empty request, invalid state, large split, dynamic switch, etc.)
|
||||
"""
|
||||
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from fastdeploy.engine.request import Request
|
||||
from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Minimal config stubs -- only fields accessed by ResourceManagerV1.__init__
|
||||
# and _get_num_new_tokens are kept.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class ModelConfig:
    """Minimal model-config stub: only the fields read by ResourceManagerV1."""

    def __init__(self):
        # Text-only by default (multimodal tests flip enable_mm on a copy);
        # causal attention is assumed by the scheduler.
        self.enable_mm, self.causal = False, True
|
||||
|
||||
|
||||
class CacheConfig:
    """Minimal KV-cache config stub with generous block budgets."""

    def __init__(self):
        # Cache geometry.
        self.block_size = 16
        self.bytes_per_token_per_layer = 32 * 32 * 128 * 2
        # Features disabled for these tests.
        self.enable_prefix_caching = False
        self.kvcache_storage_backend = None
        self.write_policy = None
        # Block budgets: ample GPU blocks so scheduling never blocks on memory,
        # no CPU offload, no encoder/processor caches.
        self.num_cpu_blocks = 0
        self.total_block_num = 10000
        self.prefill_kvcache_block_num = 10000
        self.max_encoder_cache = 0
        self.max_processor_cache = 0
|
||||
|
||||
|
||||
class ParallelConfig:
    """Minimal parallel-config stub: single GPU, no dedicated worker port."""

    def __init__(self):
        self.local_engine_worker_queue_port, self.tensor_parallel_size = None, 1
|
||||
|
||||
|
||||
class SpeculativeConfig:
    """Minimal speculative-decoding config stub: speculation disabled."""

    def __init__(self):
        self.method, self.num_speculative_tokens, self.model_type = None, 0, None
|
||||
|
||||
|
||||
class StubConfig:
    """Assembles the minimal sub-configs needed by ResourceManagerV1."""

    def __init__(self):
        # One stub per sub-config consumed by the resource manager; the
        # assignments are independent of each other.
        self.speculative_config = SpeculativeConfig()
        self.parallel_config = ParallelConfig()
        self.cache_config = CacheConfig()
        self.model_config = ModelConfig()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _create_request(request_id, prompt_token_ids, num_computed_tokens=0, multimodal_inputs=None):
    """Create a real Request object for testing."""
    # prompt_token_ids_len is derived rather than passed by callers so the two
    # fields can never disagree.
    request_kwargs = dict(
        request_id=request_id,
        prompt_token_ids=prompt_token_ids,
        prompt_token_ids_len=len(prompt_token_ids),
        num_computed_tokens=num_computed_tokens,
        multimodal_inputs=multimodal_inputs,
    )
    return Request(**request_kwargs)
|
||||
|
||||
|
||||
def _build_mm_inputs(prompt_len, text_len, modal_id, extra=None):
|
||||
"""Build a multimodal_inputs dict for a single-modality request."""
|
||||
mm_len = prompt_len - text_len
|
||||
patch_idx_val = modal_id # 1=image, 2=video, 3=audio
|
||||
inputs = {
|
||||
"image_patch_id": prompt_len + 1,
|
||||
"image_end_id": prompt_len + 2,
|
||||
"video_patch_id": prompt_len + 3,
|
||||
"video_end_id": prompt_len + 4,
|
||||
"audio_patch_id": prompt_len + 5,
|
||||
"audio_end_id": prompt_len + 6,
|
||||
"patch_idx": [0] * text_len + [patch_idx_val] * mm_len,
|
||||
"patch_map": [
|
||||
{"modal_id": 0, "end_idx": text_len, "image_num": 0, "video_num": 0},
|
||||
{
|
||||
"modal_id": modal_id,
|
||||
"end_idx": prompt_len,
|
||||
"image_num": 1 if modal_id == 1 else 0,
|
||||
"video_num": 1 if modal_id == 2 else 0,
|
||||
},
|
||||
],
|
||||
"tts": False,
|
||||
}
|
||||
if extra:
|
||||
inputs.update(extra)
|
||||
return inputs
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test class
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestChunkedPrefillDeterminism(unittest.TestCase):
|
||||
"""Test _get_num_new_tokens alignment in deterministic mode."""
|
||||
|
||||
def setUp(self):
|
||||
self._saved_env = {}
|
||||
for key in ("FD_DETERMINISTIC_MODE", "FD_DETERMINISTIC_SPLIT_KV_SIZE"):
|
||||
self._saved_env[key] = os.environ.get(key)
|
||||
self.config = StubConfig()
|
||||
self.rm = self._create_resource_manager(self.config)
|
||||
|
||||
def tearDown(self):
|
||||
for key, value in self._saved_env.items():
|
||||
if value is None:
|
||||
os.environ.pop(key, None)
|
||||
else:
|
||||
os.environ[key] = value
|
||||
|
||||
# -- env helpers --
|
||||
|
||||
def _enable_deterministic(self, split_kv_size=16):
|
||||
os.environ["FD_DETERMINISTIC_MODE"] = "1"
|
||||
os.environ["FD_DETERMINISTIC_SPLIT_KV_SIZE"] = str(split_kv_size)
|
||||
|
||||
def _disable_deterministic(self):
|
||||
os.environ.pop("FD_DETERMINISTIC_MODE", None)
|
||||
os.environ.pop("FD_DETERMINISTIC_SPLIT_KV_SIZE", None)
|
||||
|
||||
def _create_resource_manager(self, config):
|
||||
return ResourceManagerV1(
|
||||
max_num_seqs=32,
|
||||
config=config,
|
||||
tensor_parallel_size=1,
|
||||
splitwise_role="mixed",
|
||||
local_data_parallel_id=0,
|
||||
)
|
||||
|
||||
def _create_mm_resource_manager(self):
|
||||
config = StubConfig()
|
||||
config.model_config.enable_mm = True
|
||||
return self._create_resource_manager(config)
|
||||
|
||||
# ==================== 1. Deterministic disabled ====================
|
||||
|
||||
def test_get_num_new_tokens_deterministic_disabled(self):
|
||||
"""No alignment when deterministic mode is off; budget=0 returns 0."""
|
||||
self._disable_deterministic()
|
||||
|
||||
test_cases = [
|
||||
# (prompt_tokens, num_computed, token_budget, expected)
|
||||
(list(range(100)), 0, 50, 50),
|
||||
(list(range(100)), 50, 30, 30),
|
||||
(list(range(100)), 90, 20, 10),
|
||||
(list(range(32)), 0, 15, 15),
|
||||
# budget=0 -> 0
|
||||
(list(range(100)), 0, 0, 0),
|
||||
]
|
||||
for prompt_ids, num_computed, budget, expected in test_cases:
|
||||
with self.subTest(prompt_len=len(prompt_ids), computed=num_computed, budget=budget):
|
||||
req = _create_request("req", prompt_ids, num_computed)
|
||||
result = self.rm._get_num_new_tokens(req, budget)
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
# ==================== 2. Deterministic enabled alignment ====================
|
||||
|
||||
def test_get_num_new_tokens_deterministic_enabled_alignment(self):
|
||||
"""Results must align to split_kv_size boundary."""
|
||||
split_kv_size = 16
|
||||
self._enable_deterministic(split_kv_size)
|
||||
|
||||
test_cases = [
|
||||
# (prompt_tokens, num_computed, token_budget, expected)
|
||||
(list(range(100)), 0, 20, 16),
|
||||
(list(range(100)), 0, 32, 32),
|
||||
(list(range(100)), 0, 40, 32),
|
||||
(list(range(100)), 0, 50, 48),
|
||||
(list(range(100)), 8, 20, 8),
|
||||
(list(range(100)), 8, 30, 24),
|
||||
(list(range(100)), 16, 20, 16),
|
||||
(list(range(100)), 16, 25, 16),
|
||||
]
|
||||
for prompt_ids, num_computed, budget, expected in test_cases:
|
||||
with self.subTest(computed=num_computed, budget=budget):
|
||||
req = _create_request("req", prompt_ids, num_computed)
|
||||
result = self.rm._get_num_new_tokens(req, budget)
|
||||
self.assertEqual(result, expected)
|
||||
# Verify alignment
|
||||
if result > 0:
|
||||
final_pos = num_computed + result
|
||||
self.assertEqual(final_pos % split_kv_size, 0)
|
||||
|
||||
# ==================== 3. Boundary cases ====================
|
||||
|
||||
def test_get_num_new_tokens_boundary_cases(self):
|
||||
"""Boundary conditions including large budget."""
|
||||
split_kv_size = 16
|
||||
self._enable_deterministic(split_kv_size)
|
||||
|
||||
test_cases = [
|
||||
(list(range(100)), 0, 5, "budget < split_kv_size, start at 0"),
|
||||
(list(range(100)), 0, 1, "budget = 1, start at 0"),
|
||||
(list(range(100)), 10, 5, "budget < split_kv_size, start at 10"),
|
||||
(list(range(100)), 15, 5, "budget < split_kv_size, near boundary"),
|
||||
(list(range(16)), 0, 16, "exactly split_kv_size tokens needed"),
|
||||
(list(range(16)), 0, 32, "budget > needed"),
|
||||
# Very large budget (overflow guard)
|
||||
(list(range(100)), 0, 1000000, "very large budget"),
|
||||
]
|
||||
for prompt_ids, num_computed, budget, desc in test_cases:
|
||||
with self.subTest(desc=desc):
|
||||
req = _create_request("req", prompt_ids, num_computed)
|
||||
result = self.rm._get_num_new_tokens(req, budget)
|
||||
max_possible = min(len(prompt_ids) - num_computed, budget)
|
||||
self.assertGreaterEqual(result, 0)
|
||||
self.assertLessEqual(result, max_possible)
|
||||
|
||||
# ==================== 4. Chunk consistency ====================
|
||||
|
||||
def test_get_num_new_tokens_consistency_across_chunks(self):
|
||||
"""All chunk boundaries must align to split_kv_size."""
|
||||
split_kv_size = 16
|
||||
self._enable_deterministic(split_kv_size)
|
||||
|
||||
prompt_ids = list(range(112))
|
||||
budget = 50
|
||||
num_computed = 0
|
||||
chunk_sizes = []
|
||||
|
||||
while num_computed < len(prompt_ids):
|
||||
req = _create_request("req", prompt_ids, num_computed)
|
||||
result = self.rm._get_num_new_tokens(req, budget)
|
||||
if result == 0:
|
||||
break
|
||||
chunk_sizes.append(result)
|
||||
num_computed += result
|
||||
|
||||
# Every intermediate boundary must be aligned; final position may equal seq length
|
||||
position = 0
|
||||
for chunk_size in chunk_sizes:
|
||||
position += chunk_size
|
||||
is_ok = (position % split_kv_size == 0) or (position == len(prompt_ids))
|
||||
self.assertTrue(is_ok, f"position {position} not aligned to {split_kv_size}")
|
||||
|
||||
self.assertEqual(num_computed, len(prompt_ids))
|
||||
|
||||
# ==================== 5. Multimodal (parameterized) ====================
|
||||
|
||||
_MULTIMODAL_CASES = [
|
||||
{"name": "image", "prompt_len": 150, "text_len": 50, "modal_id": 1, "budget": 60, "extra": {}},
|
||||
{
|
||||
"name": "video",
|
||||
"prompt_len": 200,
|
||||
"text_len": 80,
|
||||
"modal_id": 2,
|
||||
"budget": 50,
|
||||
"extra": {"can_split_idx_list": [96, 112, 128, 144, 160, 176, 192]},
|
||||
},
|
||||
{"name": "audio", "prompt_len": 120, "text_len": 60, "modal_id": 3, "budget": 40, "extra": {}},
|
||||
]
|
||||
|
||||
def test_multimodal_input_single_modality(self):
|
||||
"""Token allocation for image / video / audio multimodal requests."""
|
||||
self._enable_deterministic(16)
|
||||
rm = self._create_mm_resource_manager()
|
||||
|
||||
for case in self._MULTIMODAL_CASES:
|
||||
with self.subTest(modality=case["name"]):
|
||||
prompt_ids = list(range(case["prompt_len"]))
|
||||
mm_inputs = _build_mm_inputs(case["prompt_len"], case["text_len"], case["modal_id"], case["extra"])
|
||||
req = _create_request(f"mm_{case['name']}", prompt_ids, 0, mm_inputs)
|
||||
result = rm._get_num_new_tokens(req, case["budget"])
|
||||
self.assertGreaterEqual(result, 0)
|
||||
self.assertLessEqual(result, case["budget"])
|
||||
|
||||
# ==================== 6. Real batch scheduling ====================
|
||||
|
||||
def test_real_batch_scheduling_concurrent_requests(self):
|
||||
"""Multiple requests competing for budget, all must respect alignment."""
|
||||
split_kv_size = 16
|
||||
self._enable_deterministic(split_kv_size)
|
||||
budget = 50
|
||||
|
||||
batch = [
|
||||
("req1", list(range(27)), 0),
|
||||
("req2", list(range(63)), 0),
|
||||
("req3", list(range(128)), 0),
|
||||
("req4", list(range(60)), 10),
|
||||
("req5", list(range(47)), 7),
|
||||
]
|
||||
for rid, prompt_ids, computed in batch:
|
||||
with self.subTest(request=rid):
|
||||
req = _create_request(rid, prompt_ids, computed)
|
||||
result = self.rm._get_num_new_tokens(req, budget)
|
||||
final_pos = computed + result
|
||||
max_possible = min(len(prompt_ids) - computed, budget)
|
||||
self.assertLessEqual(result, max_possible)
|
||||
if result > 0:
|
||||
is_ok = (final_pos % split_kv_size == 0) or (final_pos == len(prompt_ids))
|
||||
self.assertTrue(is_ok, f"{rid}: final_pos={final_pos} not aligned")
|
||||
|
||||
def test_real_batch_scheduling_continuous_prefill(self):
    """Continuous prefill: all chunks fully consume a 47-token prompt."""
    split_kv_size = 16
    self._enable_deterministic(split_kv_size)

    token_ids = list(range(47))
    budget = 50
    done = 0
    rounds = 0

    # The cap of 10 rounds guards against an infinite scheduling loop.
    while done < len(token_ids) and rounds < 10:
        request = _create_request("cont", token_ids, done)
        granted = self.rm._get_num_new_tokens(request, budget)
        self.assertGreater(granted, 0, f"stuck at {done}")
        final_pos = done + granted
        aligned = final_pos % split_kv_size == 0 or final_pos == len(token_ids)
        self.assertTrue(aligned, f"chunk ending at {final_pos} not aligned")
        done += granted
        rounds += 1

    self.assertEqual(done, len(token_ids))
|
||||
|
||||
def test_real_batch_scheduling_with_multimodal_requests(self):
    """Mixed batch: text-only + image requests."""
    self._enable_deterministic(16)
    manager = self._create_mm_resource_manager()
    budget = 30

    # A plain text request must stay within budget.
    text_req = _create_request("text_only", list(range(100)), 0)
    text_tokens = manager._get_num_new_tokens(text_req, budget)
    self.assertGreaterEqual(text_tokens, 0)
    self.assertLessEqual(text_tokens, budget)

    # An image-bearing request must stay within budget as well.
    image_inputs = _build_mm_inputs(80, 40, modal_id=1)
    image_req = _create_request("with_image", list(range(80)), 0, image_inputs)
    image_tokens = manager._get_num_new_tokens(image_req, budget)
    self.assertGreaterEqual(image_tokens, 0)
    self.assertLessEqual(image_tokens, budget)
|
||||
|
||||
# ==================== 7. Corner cases ====================
|
||||
|
||||
def test_corner_case_invalid_request_states(self):
    """Empty prompt, completed prefill, and num_computed > need_prefill must assert."""
    self._enable_deterministic(16)

    # (subtest label, request id, prompt token ids, tokens already computed)
    invalid_states = [
        ("empty prompt", "e", [], 0),
        ("completed prefill", "c", list(range(100)), 100),
        ("num_computed > need_prefill", "i", list(range(50)), 100),
    ]
    for label, req_id, token_ids, computed in invalid_states:
        with self.subTest(case=label):
            with self.assertRaises(AssertionError):
                self.rm._get_num_new_tokens(_create_request(req_id, token_ids, computed), 50)

    # Zero budget (legitimate, returns 0)
    with self.subTest(case="zero budget"):
        granted = self.rm._get_num_new_tokens(_create_request("z", list(range(100)), 0), 0)
        self.assertEqual(granted, 0)
|
||||
|
||||
def test_corner_case_minimum_split_size(self):
    """split_kv_size=1: every position is aligned, so max allocation is allowed."""
    self._enable_deterministic(1)

    # With split size 1, the grant equals min(remaining, budget) exactly.
    for token_ids, computed, budget, expected_grant in [
        (list(range(100)), 0, 20, 20),
        (list(range(100)), 10, 15, 15),
        (list(range(100)), 50, 10, 10),
    ]:
        with self.subTest(computed=computed, budget=budget):
            request = _create_request("min", token_ids, computed)
            self.assertEqual(self.rm._get_num_new_tokens(request, budget), expected_grant)
|
||||
|
||||
def test_corner_case_large_split_size(self):
    """split_kv_size >> budget or sequence length."""
    # (split_kv_size, prompt_ids, num_computed, budget, description)
    scenarios = [
        (128, list(range(100)), 0, 10, "split >> budget: budget=10"),
        (128, list(range(100)), 0, 1, "split >> budget: budget=1"),
        (128, list(range(100)), 64, 20, "split >> budget: near boundary"),
        (256, list(range(50)), 0, 100, "split >> seq_len"),
    ]
    for split_kv_size, token_ids, computed, budget, label in scenarios:
        with self.subTest(desc=label):
            self._enable_deterministic(split_kv_size)
            request = _create_request("lg", token_ids, computed)
            granted = self.rm._get_num_new_tokens(request, budget)
            cap = min(len(token_ids) - computed, budget)
            self.assertGreaterEqual(granted, 0)
            self.assertLessEqual(granted, cap)
|
||||
|
||||
def test_corner_case_dynamic_config_switch(self):
    """Switching from non-deterministic to deterministic mid-stream."""
    # Phase 1: schedule a first chunk without determinism enabled.
    self._disable_deterministic()
    first_req = _create_request("sw1", list(range(100)), 0)
    first_chunk = self.rm._get_num_new_tokens(first_req, 30)

    # Phase 2: enable determinism and continue from where phase 1 stopped.
    split_kv_size = 16
    self._enable_deterministic(split_kv_size)
    second_req = _create_request("sw2", list(range(100)), first_chunk)
    second_chunk = self.rm._get_num_new_tokens(second_req, 30)

    if second_chunk > 0:
        final_pos = first_chunk + second_chunk
        aligned = final_pos % split_kv_size == 0 or final_pos == 100
        self.assertTrue(aligned, f"final_pos={final_pos} not aligned after switch")
|
||||
|
||||
def test_deterministic_return_zero_budget_below_boundary(self):
    """Returns 0 when budget cannot reach the next alignment boundary."""
    split_kv_size = 16
    self._enable_deterministic(split_kv_size)

    # (prompt_ids, num_computed, budget) — budget always short of the boundary.
    scenarios = [
        (list(range(100)), 10, 5),   # pos=10, next_boundary=16, need 6, budget=5
        (list(range(100)), 1, 3),    # pos=1, next_boundary=16, need 15, budget=3
        (list(range(100)), 17, 14),  # pos=17, next_boundary=32, need 15, budget=14
        (list(range(100)), 0, 0),    # budget=0 (deterministic)
    ]
    for token_ids, computed, budget in scenarios:
        with self.subTest(computed=computed, budget=budget):
            request = _create_request("det0", token_ids, computed)
            self.assertEqual(self.rm._get_num_new_tokens(request, budget), 0)
|
||||
|
||||
|
||||
# Entry point for running this test module directly; verbosity=2 prints
# each test (and subtest failure) individually, which helps in CI logs.
if __name__ == "__main__":
    unittest.main(verbosity=2)
|
||||
@@ -0,0 +1,345 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import logging
import os
import sys
import types
import unittest
from types import SimpleNamespace
from unittest.mock import MagicMock, Mock

import numpy as np
||||
|
||||
# Register fastdeploy as a bare namespace package so that
# ``from fastdeploy.logger.deterministic_logger import ...`` does NOT
# execute fastdeploy/__init__.py (which pulls in paddle, paddleformers, etc.).
_project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
for _pkg, _rel_path in [
    ("fastdeploy", "fastdeploy"),
    ("fastdeploy.logger", "fastdeploy/logger"),
    ("fastdeploy.worker", "fastdeploy/worker"),
]:
    # Only stub packages that were not already imported, so a real
    # fastdeploy installation (if present) takes precedence.
    if _pkg not in sys.modules:
        # A minimal module object with __path__ pointing at the source tree
        # is enough for the import machinery to resolve submodules from disk.
        _mod = types.ModuleType(_pkg)
        _mod.__path__ = [os.path.join(_project_root, _rel_path)]
        _mod.__package__ = _pkg
        sys.modules[_pkg] = _mod
|
||||
|
||||
from fastdeploy.logger.deterministic_logger import DeterministicLogger # noqa: E402
|
||||
|
||||
|
||||
def _make_tensor(array):
|
||||
"""Create a mock tensor that behaves like a paddle Tensor for testing."""
|
||||
arr = np.array(array)
|
||||
tensor = Mock()
|
||||
tensor.cpu.return_value = tensor
|
||||
tensor.numpy.return_value = arr
|
||||
tensor.shape = arr.shape
|
||||
tensor.__len__ = lambda self: arr.shape[0]
|
||||
tensor.__getitem__ = lambda self, idx: _make_tensor(arr[idx])
|
||||
return tensor
|
||||
|
||||
|
||||
class TestComputeTensorMd5(unittest.TestCase):
    """Unit tests for DeterministicLogger._compute_tensor_md5."""

    def test_none_tensor(self):
        # A missing tensor is reported explicitly rather than hashed.
        digest = DeterministicLogger._compute_tensor_md5(None, name="x")
        self.assertEqual(digest, "x_md5=None")

    def test_deterministic_hash(self):
        # Hashing the same tensor twice must produce identical output.
        tensor = _make_tensor([1.0, 2.0, 3.0])
        first = DeterministicLogger._compute_tensor_md5(tensor, name="a")
        second = DeterministicLogger._compute_tensor_md5(tensor, name="a")
        self.assertEqual(first, second)
        self.assertIn("a_md5=", first)

    def test_different_tensors_different_hash(self):
        digest_one = DeterministicLogger._compute_tensor_md5(_make_tensor([1.0, 2.0]), name="x")
        digest_two = DeterministicLogger._compute_tensor_md5(_make_tensor([3.0, 4.0]), name="x")
        self.assertNotEqual(digest_one, digest_two)

    def test_prefix(self):
        digest = DeterministicLogger._compute_tensor_md5(_make_tensor([1.0]), name="h", prefix="batch_")
        self.assertTrue(digest.startswith("batch_h_md5="))

    def test_md5_truncated_to_16_chars(self):
        # The hex digest is shortened to its first 16 characters.
        digest = DeterministicLogger._compute_tensor_md5(_make_tensor([1.0, 2.0, 3.0]), name="x")
        self.assertEqual(len(digest.split("=")[1]), 16)
|
||||
|
||||
|
||||
class TestGetBatchSize(unittest.TestCase):
    """Unit tests for DeterministicLogger._get_batch_size."""

    def test_returns_first_tensor_batch_size(self):
        tensor = _make_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
        self.assertEqual(DeterministicLogger._get_batch_size({"a": tensor}), 3)

    def test_skips_none_tensors(self):
        # None entries are ignored; the first real tensor determines the size.
        tensor = _make_tensor([[1.0], [2.0]])
        self.assertEqual(DeterministicLogger._get_batch_size({"a": None, "b": tensor}), 2)

    def test_returns_none_for_empty_dict(self):
        self.assertIsNone(DeterministicLogger._get_batch_size({}))

    def test_returns_none_for_all_none(self):
        self.assertIsNone(DeterministicLogger._get_batch_size({"a": None}))
|
||||
|
||||
|
||||
class TestBuildReqIdStr(unittest.TestCase):
    """Unit tests for DeterministicLogger._build_req_id_str."""

    def test_none_list(self):
        self.assertEqual(DeterministicLogger._build_req_id_str(None), "")

    def test_single_request(self):
        only_request = [Mock(request_id="req-001")]
        self.assertEqual(DeterministicLogger._build_req_id_str(only_request), "[0]req-001")

    def test_multiple_requests_with_none(self):
        # None slots keep their index but are omitted from the string.
        requests = [Mock(request_id="r1"), None, Mock(request_id="r2")]
        self.assertEqual(DeterministicLogger._build_req_id_str(requests), "[0]r1, [2]r2")
|
||||
|
||||
|
||||
class TestGetStageCounts(unittest.TestCase):
    """Unit tests for DeterministicLogger._get_stage_counts."""

    def test_no_seq_lens_encoder(self):
        # Without seq_lens_encoder, no stage information is available.
        counter = DeterministicLogger(share_inputs={})
        prefill, decode, enc = counter._get_stage_counts(batch_size=4)
        self.assertEqual(prefill, 0)
        self.assertEqual(decode, 0)
        self.assertIsNone(enc)

    def test_with_seq_lens_encoder(self):
        # seq_lens_encoder: [5, 0, 3, 0] -> 2 prefill, 2 decode
        encoder_lens = _make_tensor([5, 0, 3, 0])
        counter = DeterministicLogger(share_inputs={"seq_lens_encoder": encoder_lens})
        prefill, decode, enc = counter._get_stage_counts(batch_size=4)
        self.assertEqual(prefill, 2)
        self.assertEqual(decode, 2)
        np.testing.assert_array_equal(enc, np.array([5, 0, 3, 0]))

    def test_all_prefill(self):
        # Every entry non-zero -> the whole batch is in prefill.
        counter = DeterministicLogger(share_inputs={"seq_lens_encoder": _make_tensor([10, 20])})
        prefill, decode, _ = counter._get_stage_counts(batch_size=2)
        self.assertEqual(prefill, 2)
        self.assertEqual(decode, 0)

    def test_none_share_inputs(self):
        counter = DeterministicLogger(share_inputs=None)
        prefill, decode, enc = counter._get_stage_counts(batch_size=4)
        self.assertEqual(prefill, 0)
        self.assertEqual(decode, 0)
        self.assertIsNone(enc)
|
||||
|
||||
|
||||
class TestLogTensorMd5s(unittest.TestCase):
    """Unit tests for DeterministicLogger.log_tensor_md5s."""

    def test_logs_batch_md5(self):
        md5_logger = DeterministicLogger(share_inputs={})
        tensor = _make_tensor([[1.0, 2.0], [3.0, 4.0]])
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            md5_logger.log_tensor_md5s({"hidden": tensor}, stage="test_stage")
        self.assertTrue(any("[DETERMINISM-MD5]" in line for line in captured.output))
        self.assertTrue(any("stage=test_stage" in line for line in captured.output))

    def test_skips_when_no_valid_tensor(self):
        md5_logger = DeterministicLogger(share_inputs={})
        logging.getLogger("fastdeploy.deterministic").setLevel(logging.INFO)
        # Should not raise, just silently return
        md5_logger.log_tensor_md5s({"a": None})

    def test_logs_with_request_ids(self):
        md5_logger = DeterministicLogger(share_inputs={})
        tensor = _make_tensor([[1.0], [2.0]])
        request = Mock(request_id="req-42")
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            md5_logger.log_tensor_md5s({"x": tensor}, forward_batch_reqs_list=[request], stage="s")
        self.assertTrue(any("req-42" in line for line in captured.output))

    def test_logs_per_request_md5_for_decode(self):
        # 2 requests, both decode (seq_lens_encoder = [0, 0])
        tensor = _make_tensor([[1.0, 2.0], [3.0, 4.0]])
        md5_logger = DeterministicLogger(share_inputs={"seq_lens_encoder": _make_tensor([0, 0])})
        requests = [Mock(request_id="r1"), Mock(request_id="r2")]
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            md5_logger.log_tensor_md5s({"out": tensor}, forward_batch_reqs_list=requests, stage="decode")
        # One per-request MD5 line is expected for each decode request.
        per_request_lines = [line for line in captured.output if "[DETERMINISM-MD5-REQ]" in line]
        self.assertEqual(len(per_request_lines), 2)
|
||||
|
||||
|
||||
class TestLogDeterministicInput(unittest.TestCase):
    """Unit tests for DeterministicLogger.log_deterministic_input."""

    def _make_forward_meta(self, ids_list):
        # forward_meta only needs the ids_remove_padding attribute here.
        return SimpleNamespace(ids_remove_padding=_make_tensor(ids_list))

    def test_logs_input_info(self):
        meta = self._make_forward_meta([101, 102, 201])
        inputs = {
            "req_ids": ["req-a", "req-b"],
            "seq_lens_this_time": [2, 1],
            "seq_lens_encoder": [2, 0],
            "seq_lens_decoder": [0, 5],
        }
        input_logger = DeterministicLogger(share_inputs=inputs)
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            input_logger.log_deterministic_input(meta)
        text = "\n".join(captured.output)
        # Tokens are split per request according to seq_lens_this_time.
        for fragment in (
            "batch_size=2",
            "req_id=req-a",
            "req_id=req-b",
            "tokens=[101, 102]",
            "tokens=[201]",
        ):
            self.assertIn(fragment, text)

    def test_no_input_data(self):
        meta = SimpleNamespace(ids_remove_padding=None)
        inputs = {
            "req_ids": None,
            "seq_lens_this_time": [],
            "seq_lens_encoder": None,
            "seq_lens_decoder": None,
        }
        input_logger = DeterministicLogger(share_inputs=inputs)
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            input_logger.log_deterministic_input(meta)
        self.assertTrue(any("No input data" in line for line in captured.output))

    def test_fallback_req_id(self):
        # Without req_ids, positional idx_N identifiers are synthesized.
        meta = self._make_forward_meta([10, 20])
        inputs = {
            "req_ids": None,
            "seq_lens_this_time": [1, 1],
            "seq_lens_encoder": None,
            "seq_lens_decoder": None,
        }
        input_logger = DeterministicLogger(share_inputs=inputs)
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            input_logger.log_deterministic_input(meta)
        text = "\n".join(captured.output)
        self.assertIn("req_id=idx_0", text)
        self.assertIn("req_id=idx_1", text)
|
||||
|
||||
|
||||
class TestLogBatchStart(unittest.TestCase):
    """Unit tests for DeterministicLogger.log_batch_start."""

    def _make_logger(self):
        return DeterministicLogger(share_inputs={})

    def _make_req(self, request_id):
        return Mock(request_id=request_id)

    def test_logs_batch_start(self):
        batch_logger = self._make_logger()
        batch = [self._make_req("prompt_0")]
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            batch_logger.log_batch_start(batch)
        text = "\n".join(captured.output)
        self.assertIn("[BATCH-START]", text)
        self.assertIn("Run_0", text)
        self.assertIn("Batch_1", text)

    def test_batch_counter_increments(self):
        batch_logger = self._make_logger()
        batch = [self._make_req("prompt_0")]
        # A second call within the same run bumps the batch counter.
        with self.assertLogs("fastdeploy.deterministic", level="INFO"):
            batch_logger.log_batch_start(batch)
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            batch_logger.log_batch_start(batch)
        self.assertIn("Batch_2", "\n".join(captured.output))

    def test_run_id_change_resets_counter(self):
        batch_logger = self._make_logger()
        run_zero = [self._make_req("prompt_0")]
        run_one = [self._make_req("prompt_1")]
        with self.assertLogs("fastdeploy.deterministic", level="INFO"):
            batch_logger.log_batch_start(run_zero)
            batch_logger.log_batch_start(run_zero)  # Batch_2
        # Switch to run_id 1 => counter resets
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            batch_logger.log_batch_start(run_one)
        text = "\n".join(captured.output)
        self.assertIn("Run_1", text)
        self.assertIn("Batch_1", text)

    def test_skips_none_requests(self):
        # None entries are skipped; the run id comes from the first real request.
        batch_logger = self._make_logger()
        batch = [None, self._make_req("req_5")]
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            batch_logger.log_batch_start(batch)
        self.assertIn("Run_5", "\n".join(captured.output))

    def test_empty_batch(self):
        # No requests at all -> run id is reported as None.
        batch_logger = self._make_logger()
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            batch_logger.log_batch_start([])
        text = "\n".join(captured.output)
        self.assertIn("Run_None", text)
        self.assertIn("Batch_1", text)

    def test_none_batch(self):
        batch_logger = self._make_logger()
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            batch_logger.log_batch_start(None)
        self.assertIn("Batch_1", "\n".join(captured.output))
|
||||
|
||||
|
||||
class TestLogPrefillInput(unittest.TestCase):
    """Unit tests for DeterministicLogger.log_prefill_input."""

    def test_logs_prefill_input(self):
        prefill_logger = DeterministicLogger(share_inputs={})
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            prefill_logger.log_prefill_input(
                request_id="req-001",
                idx=0,
                prefill_start_index=0,
                prefill_end_index=5,
                input_ids=[101, 102, 103, 104, 105],
            )
        text = "\n".join(captured.output)
        # Every field of the prefill record must appear in the log output.
        for fragment in (
            "[DETERMINISM] Prefill input",
            "request_id: req-001",
            "idx: 0",
            "prefill_start_index: 0",
            "prefill_end_index: 5",
            "[101, 102, 103, 104, 105]",
        ):
            self.assertIn(fragment, text)

    def test_logs_with_nonzero_start_index(self):
        prefill_logger = DeterministicLogger(share_inputs={})
        with self.assertLogs("fastdeploy.deterministic", level="INFO") as captured:
            prefill_logger.log_prefill_input(
                request_id="req-002",
                idx=3,
                prefill_start_index=10,
                prefill_end_index=20,
                input_ids=list(range(20)),
            )
        text = "\n".join(captured.output)
        for fragment in (
            "request_id: req-002",
            "idx: 3",
            "prefill_start_index: 10",
            "prefill_end_index: 20",
        ):
            self.assertIn(fragment, text)
|
||||
|
||||
|
||||
# Entry point for running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user