[Optimize] Optimize TTFT for EP (#6098)

* optimize ttft

* fix

* fix

* fix ci

* fix ci

* fix

* fix bug

* fix

* add comments

* fix ci

* fix
This commit is contained in:
chenjian
2026-02-04 15:03:29 +08:00
committed by GitHub
parent 6e96bd0bd2
commit 90db0bdd0d
10 changed files with 118 additions and 142 deletions
-3
View File
@@ -147,9 +147,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
# Whether to enable decode to cache requests in order to preallocate resources
"FD_ENABLE_CACHE_TASK": lambda: os.getenv("FD_ENABLE_CACHE_TASK", "0"),
# Batched token timeout in EP
"FD_EP_BATCHED_TOKEN_TIMEOUT": lambda: float(os.getenv("FD_EP_BATCHED_TOKEN_TIMEOUT", "0.1")),
# Maximum number of pre-fetched requests in PD
"FD_EP_MAX_PREFETCH_TASK_NUM": lambda: int(os.getenv("FD_EP_MAX_PREFETCH_TASK_NUM", "8")),
-3
View File
@@ -147,9 +147,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
# 是否启用 decode 缓存请求以预分配资源
"FD_ENABLE_CACHE_TASK": lambda: os.getenv("FD_ENABLE_CACHE_TASK", "0"),
# EP 中批处理 token 的超时时间
"FD_EP_BATCHED_TOKEN_TIMEOUT": lambda: float(os.getenv("FD_EP_BATCHED_TOKEN_TIMEOUT", "0.1")),
# PD 中最大预取请求数量
"FD_EP_MAX_PREFETCH_TASK_NUM": lambda: int(os.getenv("FD_EP_MAX_PREFETCH_TASK_NUM", "8")),