Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2026-04-23 00:17:25 +08:00
[FDConfig] Reduce FD_CUSTOM_AR_MAX_SIZE_MB default from 64 to 8 (#6997)
Most single-GPU and small-model deployments do not need 64MB custom all-reduce buffers. Lowering the default to 8MB reduces unnecessary shared memory allocation. Tests that require larger buffers now explicitly set the value.

Co-authored-by: gongweibao <gognweibao@baidu.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2 additions, 2 deletions
@@ -225,10 +225,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "FD_WORKER_ALIVE_TIMEOUT": lambda: int(os.getenv("FD_WORKER_ALIVE_TIMEOUT", "30")),
     # File path for file storage backend
     "FILE_BACKEND_STORAGE_DIR": lambda: str(os.getenv("FILE_BACKEND_STORAGE_DIR", "/tmp/fastdeploy")),
-    # Custom all-reduce max buffer size in MB (default 64MB).
+    # Custom all-reduce max buffer size in MB (default 8MB).
     # Increase this to avoid NCCL fallback for large tensors in deterministic mode.
     # E.g. FD_CUSTOM_AR_MAX_SIZE_MB=128 for 128MB.
-    "FD_CUSTOM_AR_MAX_SIZE_MB": lambda: int(os.getenv("FD_CUSTOM_AR_MAX_SIZE_MB", "64")),
+    "FD_CUSTOM_AR_MAX_SIZE_MB": lambda: int(os.getenv("FD_CUSTOM_AR_MAX_SIZE_MB", "8")),
     # Enable deterministic inference mode for chunked prefill alignment
     "FD_DETERMINISTIC_MODE": lambda: bool(int(os.getenv("FD_DETERMINISTIC_MODE", "0"))),
     # Split KV block size for deterministic alignment (must be power of 2 and > 0, default 16)
Reference in New Issue
Block a user