mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Feature] implement log channel separation and request log level system (#7190)
* feat: implement log channel separation and request log level system
* fix: log system improvements based on review
* add request_id to error logs, use RequestLogLevel enum, and unify logger implementation from utils to logger module
This commit is contained in:
@@ -25,7 +25,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"FD_LOG_REQUESTS": lambda: int(os.getenv("FD_LOG_REQUESTS", "1")),
|
"FD_LOG_REQUESTS": lambda: int(os.getenv("FD_LOG_REQUESTS", "1")),
|
||||||
|
|
||||||
# Request logging detail level (0-3). Higher level means more verbose output.
|
# Request logging detail level (0-3). Higher level means more verbose output.
|
||||||
"FD_LOG_REQUESTS_LEVEL": lambda: int(os.getenv("FD_LOG_REQUESTS_LEVEL", "0")),
|
"FD_LOG_REQUESTS_LEVEL": lambda: int(os.getenv("FD_LOG_REQUESTS_LEVEL", "2")),
|
||||||
|
|
||||||
# Max field length for request logging truncation.
|
# Max field length for request logging truncation.
|
||||||
"FD_LOG_MAX_LEN": lambda: int(os.getenv("FD_LOG_MAX_LEN", "2048")),
|
"FD_LOG_MAX_LEN": lambda: int(os.getenv("FD_LOG_MAX_LEN", "2048")),
|
||||||
|
|||||||
+40
-7
@@ -5,12 +5,48 @@
|
|||||||
FastDeploy generates the following log files during deployment. Below is an explanation of each log's purpose.
|
FastDeploy generates the following log files during deployment. Below is an explanation of each log's purpose.
|
||||||
By default, logs are stored in the `log` directory under the execution path. To specify a custom directory, set the environment variable `FD_LOG_DIR`.
|
By default, logs are stored in the `log` directory under the execution path. To specify a custom directory, set the environment variable `FD_LOG_DIR`.
|
||||||
|
|
||||||
|
## Log Channel Separation
|
||||||
|
|
||||||
|
FastDeploy separates logs into three channels:
|
||||||
|
|
||||||
|
| Channel | Logger Name | Output Files | Description |
|
||||||
|
|---------|-------------|--------------|-------------|
|
||||||
|
| main | `fastdeploy.main.*` | `fastdeploy.log`, `console.log` | Main logs for system configuration, startup info, etc. |
|
||||||
|
| request | `fastdeploy.request.*` | `request.log` | Request logs for request lifecycle and processing details |
|
||||||
|
| console | `fastdeploy.console.*` | `console.log` | Console logs, written to both the terminal and `console.log` |
|
||||||
|
|
||||||
|
## Request Log Levels
|
||||||
|
|
||||||
|
Request logs (`request.log`) support 4 levels, controlled by the environment variable `FD_LOG_REQUESTS_LEVEL`:
|
||||||
|
|
||||||
|
| Level | Enum Name | Description | Example Content |
|
||||||
|
|-------|-----------|-------------|-----------------|
|
||||||
|
| 0 | LIFECYCLE | Lifecycle start/end | Request creation/initialization, completion stats (InputToken/OutputToken/latency), first and last streaming response, request abort |
|
||||||
|
| 1 | STAGES | Processing stages | Semaphore acquire/release, first token time recording, signal handling (preemption/abortion/recovery), cache task, preprocess time, parameter adjustment warnings |
|
||||||
|
| 2 | CONTENT | Content and scheduling | Request parameters, processed request, scheduling info (enqueue/pull/finish), response content (long content is truncated) |
|
||||||
|
| 3 | FULL | Complete raw data | Complete request and response data, raw received request |
|
||||||
|
|
||||||
|
Default level is 2 (CONTENT), which logs request parameters, scheduling info, and response content. Lower levels (0-1) only log critical events, while level 3 includes complete raw data.
|
||||||
|
|
||||||
|
## Log-Related Environment Variables
|
||||||
|
|
||||||
|
| Variable | Default | Description |
|
||||||
|
|----------|---------|-------------|
|
||||||
|
| `FD_LOG_DIR` | `log` | Log file storage directory |
|
||||||
|
| `FD_LOG_LEVEL` | `INFO` | Log level, supports `INFO` or `DEBUG` |
|
||||||
|
| `FD_LOG_REQUESTS` | `1` | Enable request logging, `0` to disable, `1` to enable |
|
||||||
|
| `FD_LOG_REQUESTS_LEVEL` | `2` | Request log level, range 0-3 |
|
||||||
|
| `FD_LOG_MAX_LEN` | `2048` | Maximum length of log content at request log level 2 (CONTENT); anything longer is truncated |
|
||||||
|
| `FD_LOG_BACKUP_COUNT` | `7` | Number of log files to retain |
|
||||||
|
| `FD_DEBUG` | `0` | Debug mode, `1` enables DEBUG log level |
|
||||||
|
|
||||||
## Inference Service Logs
|
## Inference Service Logs
|
||||||
|
|
||||||
|
* `fastdeploy.log` : Main log file, records system configuration, startup information, runtime status, etc.
|
||||||
|
* `request.log` : Request log file, records user request lifecycle and processing details
|
||||||
|
* `console.log` : Console log, records model startup time and other information. This log is also printed to the console.
|
||||||
|
* `error.log` : Error log file, records all ERROR and above level logs
|
||||||
* `backup_env.*.json` : Records environment variables set during instance startup. The number of files matches the number of GPU cards.
|
* `backup_env.*.json` : Records environment variables set during instance startup. The number of files matches the number of GPU cards.
|
||||||
* `envlog.*` : Logs environment variables set during instance startup. The number of files matches the number of GPU cards.
|
|
||||||
* `console.log` : Records model startup time and other information. This log is also printed to the console.
|
|
||||||
* `data_processor.log` : Logs input/output data encoding and decoding details.
|
|
||||||
* `fastdeploy.log` : Records configuration information during instance startup, as well as request and response details during runtime.
|
|
||||||
* `workerlog.*` : Tracks model loading progress and inference operator errors. Each GPU card has a corresponding file.
|
* `workerlog.*` : Tracks model loading progress and inference operator errors. Each GPU card has a corresponding file.
|
||||||
* `worker_process.log` : Logs engine inference data for each iteration.
|
* `worker_process.log` : Logs engine inference data for each iteration.
|
||||||
* `cache_manager.log` : Records KV Cache logical index allocation for each request and cache hit status.
|
* `cache_manager.log` : Records KV Cache logical index allocation for each request and cache hit status.
|
||||||
@@ -18,9 +54,6 @@ By default, logs are stored in the `log` directory under the execution path. To
|
|||||||
* `gpu_worker.log` : Records KV Cache block count information during profiling.
|
* `gpu_worker.log` : Records KV Cache block count information during profiling.
|
||||||
* `gpu_model_runner.log` : Contains model details and loading time.
|
* `gpu_model_runner.log` : Contains model details and loading time.
|
||||||
|
|
||||||
## Online Inference Client Logs
|
|
||||||
* `api_server.log` : Logs startup parameters and received request information.
|
|
||||||
|
|
||||||
## Scheduler Logs
|
## Scheduler Logs
|
||||||
* `scheduler.log` : Records scheduler information, including node status and request allocation details.
|
* `scheduler.log` : Records scheduler information, including node status and request allocation details.
|
||||||
|
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"FD_LOG_REQUESTS": lambda: int(os.getenv("FD_LOG_REQUESTS", "1")),
|
"FD_LOG_REQUESTS": lambda: int(os.getenv("FD_LOG_REQUESTS", "1")),
|
||||||
|
|
||||||
# 请求日志详细级别 (0-3)。级别越高输出越详细。
|
# 请求日志详细级别 (0-3)。级别越高输出越详细。
|
||||||
"FD_LOG_REQUESTS_LEVEL": lambda: int(os.getenv("FD_LOG_REQUESTS_LEVEL", "0")),
|
"FD_LOG_REQUESTS_LEVEL": lambda: int(os.getenv("FD_LOG_REQUESTS_LEVEL", "2")),
|
||||||
|
|
||||||
# 请求日志字段截断最大长度。
|
# 请求日志字段截断最大长度。
|
||||||
"FD_LOG_MAX_LEN": lambda: int(os.getenv("FD_LOG_MAX_LEN", "2048")),
|
"FD_LOG_MAX_LEN": lambda: int(os.getenv("FD_LOG_MAX_LEN", "2048")),
|
||||||
|
|||||||
+42
-10
@@ -2,15 +2,51 @@
|
|||||||
|
|
||||||
# 日志说明
|
# 日志说明
|
||||||
|
|
||||||
FastDeploy 在部署过程中,会产生如下日志文件,各日志含义说明
|
FastDeploy 在部署过程中,会产生如下日志文件,各日志含义说明。
|
||||||
默认日志目录为执行目录下的 `log` 文件夹,若需要指定可设置环境变量 `FD_LOG_DIR`。
|
默认日志目录为执行目录下的 `log` 文件夹,若需要指定可设置环境变量 `FD_LOG_DIR`。
|
||||||
|
|
||||||
|
## 日志通道划分
|
||||||
|
|
||||||
|
FastDeploy 将日志分为三个通道:
|
||||||
|
|
||||||
|
| 通道 | Logger 名称 | 输出文件 | 说明 |
|
||||||
|
|------|-------------|----------|------|
|
||||||
|
| main | `fastdeploy.main.*` | `fastdeploy.log`, `console.log` | 主日志,记录系统配置、启动信息等 |
|
||||||
|
| request | `fastdeploy.request.*` | `request.log` | 请求日志,记录请求生命周期和处理细节 |
|
||||||
|
| console | `fastdeploy.console.*` | `console.log` | 控制台日志,输出到终端和 console.log |
|
||||||
|
|
||||||
|
## 请求日志级别
|
||||||
|
|
||||||
|
请求日志 (`request.log`) 支持 4 个级别,通过环境变量 `FD_LOG_REQUESTS_LEVEL` 控制:
|
||||||
|
|
||||||
|
| 级别 | 枚举名 | 说明 | 示例内容 |
|
||||||
|
|------|--------|------|----------|
|
||||||
|
| 0 | LIFECYCLE | 生命周期起止 | 请求创建/初始化、完成统计(InputToken/OutputToken/耗时)、流式响应首次和最后发送、请求中止 |
|
||||||
|
| 1 | STAGES | 处理阶段 | 信号量获取/释放、首 token 时间记录、信号处理(preemption/abortion/recovery)、缓存任务、预处理耗时、参数调整警告 |
|
||||||
|
| 2 | CONTENT | 内容和调度 | 请求参数、处理后的请求、调度信息(入队/拉取/完成)、响应内容(超长内容会被截断) |
|
||||||
|
| 3 | FULL | 完整数据 | 完整的请求和响应数据、原始接收请求 |
|
||||||
|
|
||||||
|
默认级别为 2 (CONTENT),记录请求参数、调度信息和响应内容。较低级别 (0-1) 只记录关键事件,级别 3 则包含完整原始数据。
|
||||||
|
|
||||||
|
## 日志相关环境变量
|
||||||
|
|
||||||
|
| 环境变量 | 默认值 | 说明 |
|
||||||
|
|----------|--------|------|
|
||||||
|
| `FD_LOG_DIR` | `log` | 日志文件存储目录 |
|
||||||
|
| `FD_LOG_LEVEL` | `INFO` | 日志级别,支持 `INFO` 或 `DEBUG` |
|
||||||
|
| `FD_LOG_REQUESTS` | `1` | 是否启用请求日志,`0` 禁用,`1` 启用 |
|
||||||
|
| `FD_LOG_REQUESTS_LEVEL` | `2` | 请求日志级别,范围 0-3 |
|
||||||
|
| `FD_LOG_MAX_LEN` | `2048` | 请求日志级别 2 (CONTENT) 下日志内容的最大长度(超出部分会被截断) |
|
||||||
|
| `FD_LOG_BACKUP_COUNT` | `7` | 日志文件保留数量 |
|
||||||
|
| `FD_DEBUG` | `0` | 调试模式,`1` 启用时日志级别设为 `DEBUG` |
|
||||||
|
|
||||||
## 推理服务日志
|
## 推理服务日志
|
||||||
|
|
||||||
|
* `fastdeploy.log` : 主日志文件,记录系统配置、启动信息、运行状态等
|
||||||
|
* `request.log` : 请求日志文件,记录用户请求的生命周期和处理细节
|
||||||
|
* `console.log` : 控制台日志,记录模型启动耗时等信息,该日志信息会被打印到控制台
|
||||||
|
* `error.log` : 错误日志文件,记录所有 ERROR 及以上级别的日志
|
||||||
* `backup_env.*.json` : 记录当前实例启动时设置的环境变量,文件个数与卡数相同
|
* `backup_env.*.json` : 记录当前实例启动时设置的环境变量,文件个数与卡数相同
|
||||||
* `envlog.*` : 记录当前实例启动时设置的环境变量,文件个数与卡数相同
|
|
||||||
* `console.log` : 记录模型启动耗时等信息,该日志信息会被打印到控制台
|
|
||||||
* `data_processor.log` : 记录输入数据及输出输出编码解码的内容
|
|
||||||
* `fastdeploy.log` : 记录当前实例启动的各个 config 的信息,运行中记录用户请求的 request 及 response 信息
|
|
||||||
* `workerlog.*` : 记录模型启动加载进度及推理算子报错信息,每个卡对应一个文件
|
* `workerlog.*` : 记录模型启动加载进度及推理算子报错信息,每个卡对应一个文件
|
||||||
* `worker_process.log` : 记录引擎每一轮推理的数据
|
* `worker_process.log` : 记录引擎每一轮推理的数据
|
||||||
* `cache_manager.log` : 记录每一个请求分配 KV Cache 的逻辑索引,以及当前请求的命中情况
|
* `cache_manager.log` : 记录每一个请求分配 KV Cache 的逻辑索引,以及当前请求的命中情况
|
||||||
@@ -18,9 +54,6 @@ FastDeploy 在部署过程中,会产生如下日志文件,各日志含义说
|
|||||||
* `gpu_worker.log` : 记录 profile 时计算 KV Cache block 数目的信息
|
* `gpu_worker.log` : 记录 profile 时计算 KV Cache block 数目的信息
|
||||||
* `gpu_model_runner.log` : 当前的模型信息及加载时间
|
* `gpu_model_runner.log` : 当前的模型信息及加载时间
|
||||||
|
|
||||||
## 在线推理客户端日志
|
|
||||||
* `api_server.log` : 记录启动参数,及接收到的请求信息
|
|
||||||
|
|
||||||
## 调度器日志
|
## 调度器日志
|
||||||
* `scheduler.log` : 记录调度器的信息包含当前结点的信息,每条请求分配的信息
|
* `scheduler.log` : 记录调度器的信息包含当前结点的信息,每条请求分配的信息
|
||||||
|
|
||||||
@@ -31,12 +64,11 @@ FastDeploy 在部署过程中,会产生如下日志文件,各日志含义说
|
|||||||
|
|
||||||
* `cache_queue_manager.log` : 记录启动参数,及接收到的请求信息
|
* `cache_queue_manager.log` : 记录启动参数,及接收到的请求信息
|
||||||
* `cache_transfer_manager.log` : 记录启动参数,及接收到的请求信息
|
* `cache_transfer_manager.log` : 记录启动参数,及接收到的请求信息
|
||||||
* `cache_queue_manager.log` : 记录启动参数,及接收到的请求信息
|
|
||||||
* `launch_cache_manager.log` : 启动 cache transfer 记录启动参数,报错信息
|
* `launch_cache_manager.log` : 启动 cache transfer 记录启动参数,报错信息
|
||||||
|
|
||||||
## PD 分离相关日志
|
## PD 分离相关日志
|
||||||
|
|
||||||
* `cache_messager.log` : 记录P 实例使用的传输协议及传输信息
|
* `cache_messager.log` : 记录 P 实例使用的传输协议及传输信息
|
||||||
* `splitwise_connector.log` : 记录收到 P/D 发送的数据,及建联信息
|
* `splitwise_connector.log` : 记录收到 P/D 发送的数据,及建联信息
|
||||||
|
|
||||||
## CudaGraph 相关日志
|
## CudaGraph 相关日志
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ from fastdeploy.entrypoints.openai.utils import DealerConnectionManager
|
|||||||
from fastdeploy.input.preprocess import InputPreprocessor
|
from fastdeploy.input.preprocess import InputPreprocessor
|
||||||
from fastdeploy.inter_communicator import IPCSignal
|
from fastdeploy.inter_communicator import IPCSignal
|
||||||
from fastdeploy.inter_communicator.zmq_client import ZmqIpcClient
|
from fastdeploy.inter_communicator.zmq_client import ZmqIpcClient
|
||||||
|
from fastdeploy.logger.request_logger import log_request_error
|
||||||
from fastdeploy.metrics.metrics import main_process_metrics
|
from fastdeploy.metrics.metrics import main_process_metrics
|
||||||
from fastdeploy.utils import EngineError, envs, llm_logger
|
from fastdeploy.utils import EngineError, envs, llm_logger
|
||||||
|
|
||||||
@@ -562,7 +563,7 @@ class AsyncLLM(EngineServiceClient):
|
|||||||
llm_logger.info(f"Request {conn_request_id} generator exit (outer)")
|
llm_logger.info(f"Request {conn_request_id} generator exit (outer)")
|
||||||
return
|
return
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
llm_logger.error(f"Request {conn_request_id} failed: {e}")
|
log_request_error(message="Request {request_id} failed: {error}", request_id=conn_request_id, error=e)
|
||||||
raise EngineError(str(e), error_code=500) from e
|
raise EngineError(str(e), error_code=500) from e
|
||||||
finally:
|
finally:
|
||||||
# Ensure request_map/request_num are cleaned up
|
# Ensure request_map/request_num are cleaned up
|
||||||
@@ -584,7 +585,7 @@ class AsyncLLM(EngineServiceClient):
|
|||||||
await self.connection_manager.cleanup_request(request_id)
|
await self.connection_manager.cleanup_request(request_id)
|
||||||
llm_logger.info(f"Aborted request {request_id}")
|
llm_logger.info(f"Aborted request {request_id}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
llm_logger.error(f"Failed to abort request {request_id}: {e}")
|
log_request_error(message="Failed to abort request {request_id}: {error}", request_id=request_id, error=e)
|
||||||
|
|
||||||
async def shutdown(self):
|
async def shutdown(self):
|
||||||
"""
|
"""
|
||||||
|
|||||||
+45
-11
@@ -44,6 +44,11 @@ from fastdeploy.engine.common_engine import (
|
|||||||
from fastdeploy.engine.expert_service import start_data_parallel_service
|
from fastdeploy.engine.expert_service import start_data_parallel_service
|
||||||
from fastdeploy.engine.request import Request
|
from fastdeploy.engine.request import Request
|
||||||
from fastdeploy.inter_communicator import EngineWorkerQueue, IPCSignal
|
from fastdeploy.inter_communicator import EngineWorkerQueue, IPCSignal
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
from fastdeploy.metrics.metrics import main_process_metrics
|
from fastdeploy.metrics.metrics import main_process_metrics
|
||||||
from fastdeploy.platforms import current_platform
|
from fastdeploy.platforms import current_platform
|
||||||
from fastdeploy.utils import EngineError, console_logger, envs, llm_logger
|
from fastdeploy.utils import EngineError, console_logger, envs, llm_logger
|
||||||
@@ -285,7 +290,7 @@ class LLMEngine:
|
|||||||
# Create Request struct after processing
|
# Create Request struct after processing
|
||||||
request = Request.from_dict(task)
|
request = Request.from_dict(task)
|
||||||
request.metrics.scheduler_recv_req_time = time.time()
|
request.metrics.scheduler_recv_req_time = time.time()
|
||||||
llm_logger.info(f"Receive request {request}")
|
log_request(RequestLogLevel.CONTENT, message="Receive request {request}", request=request)
|
||||||
request.metrics.preprocess_start_time = time.time()
|
request.metrics.preprocess_start_time = time.time()
|
||||||
|
|
||||||
request.prompt_token_ids_len = len(request.prompt_token_ids)
|
request.prompt_token_ids_len = len(request.prompt_token_ids)
|
||||||
@@ -304,12 +309,20 @@ class LLMEngine:
|
|||||||
f"Input text is too long, length of prompt token({input_ids_len}) "
|
f"Input text is too long, length of prompt token({input_ids_len}) "
|
||||||
f"+ min_dec_len ({min_tokens}) >= max_model_len "
|
f"+ min_dec_len ({min_tokens}) >= max_model_len "
|
||||||
)
|
)
|
||||||
llm_logger.error(error_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] error: {error}",
|
||||||
|
request_id=request.get("request_id"),
|
||||||
|
error=error_msg,
|
||||||
|
)
|
||||||
raise EngineError(error_msg, error_code=400)
|
raise EngineError(error_msg, error_code=400)
|
||||||
|
|
||||||
if input_ids_len > self.cfg.model_config.max_model_len:
|
if input_ids_len > self.cfg.model_config.max_model_len:
|
||||||
error_msg = f"Length of input token({input_ids_len}) exceeds the limit max_model_len({self.cfg.model_config.max_model_len})."
|
error_msg = f"Length of input token({input_ids_len}) exceeds the limit max_model_len({self.cfg.model_config.max_model_len})."
|
||||||
llm_logger.error(error_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] error: {error}",
|
||||||
|
request_id=request.get("request_id"),
|
||||||
|
error=error_msg,
|
||||||
|
)
|
||||||
raise EngineError(error_msg, error_code=400)
|
raise EngineError(error_msg, error_code=400)
|
||||||
|
|
||||||
if request.get("stop_seqs_len") is not None:
|
if request.get("stop_seqs_len") is not None:
|
||||||
@@ -320,7 +333,11 @@ class LLMEngine:
|
|||||||
f"Length of stop ({stop_seqs_len}) exceeds the limit max_stop_seqs_num({max_stop_seqs_num})."
|
f"Length of stop ({stop_seqs_len}) exceeds the limit max_stop_seqs_num({max_stop_seqs_num})."
|
||||||
"Please reduce the number of stop or set a lager max_stop_seqs_num by `FD_MAX_STOP_SEQS_NUM`"
|
"Please reduce the number of stop or set a lager max_stop_seqs_num by `FD_MAX_STOP_SEQS_NUM`"
|
||||||
)
|
)
|
||||||
llm_logger.error(error_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] error: {error}",
|
||||||
|
request_id=request.get("request_id"),
|
||||||
|
error=error_msg,
|
||||||
|
)
|
||||||
raise EngineError(error_msg, error_code=400)
|
raise EngineError(error_msg, error_code=400)
|
||||||
stop_seqs_max_len = envs.FD_STOP_SEQS_MAX_LEN
|
stop_seqs_max_len = envs.FD_STOP_SEQS_MAX_LEN
|
||||||
for single_stop_seq_len in stop_seqs_len:
|
for single_stop_seq_len in stop_seqs_len:
|
||||||
@@ -329,7 +346,11 @@ class LLMEngine:
|
|||||||
f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit stop_seqs_max_len({stop_seqs_max_len})."
|
f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit stop_seqs_max_len({stop_seqs_max_len})."
|
||||||
"Please reduce the length of stop sequences or set a larger stop_seqs_max_len by `FD_STOP_SEQS_MAX_LEN`"
|
"Please reduce the length of stop sequences or set a larger stop_seqs_max_len by `FD_STOP_SEQS_MAX_LEN`"
|
||||||
)
|
)
|
||||||
llm_logger.error(error_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] error: {error}",
|
||||||
|
request_id=request.get("request_id"),
|
||||||
|
error=error_msg,
|
||||||
|
)
|
||||||
raise EngineError(error_msg, error_code=400)
|
raise EngineError(error_msg, error_code=400)
|
||||||
|
|
||||||
if self._has_guided_input(request):
|
if self._has_guided_input(request):
|
||||||
@@ -342,14 +363,22 @@ class LLMEngine:
|
|||||||
request, err_msg = self.guided_decoding_checker.schema_format(request)
|
request, err_msg = self.guided_decoding_checker.schema_format(request)
|
||||||
|
|
||||||
if err_msg is not None:
|
if err_msg is not None:
|
||||||
llm_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] error: {error}",
|
||||||
|
request_id=request.get("request_id"),
|
||||||
|
error=err_msg,
|
||||||
|
)
|
||||||
raise EngineError(err_msg, error_code=400)
|
raise EngineError(err_msg, error_code=400)
|
||||||
|
|
||||||
request.metrics.preprocess_end_time = time.time()
|
request.metrics.preprocess_end_time = time.time()
|
||||||
request.metrics.scheduler_recv_req_time = time.time()
|
request.metrics.scheduler_recv_req_time = time.time()
|
||||||
self.engine.scheduler.put_requests([request])
|
self.engine.scheduler.put_requests([request])
|
||||||
llm_logger.info(f"Cache task with request_id ({request.get('request_id')})")
|
log_request(
|
||||||
llm_logger.debug(f"cache task: {request}")
|
RequestLogLevel.STAGES,
|
||||||
|
message="Cache task with request_id ({request_id})",
|
||||||
|
request_id=request.get("request_id"),
|
||||||
|
)
|
||||||
|
log_request(RequestLogLevel.FULL, message="cache task: {request}", request=request)
|
||||||
|
|
||||||
def _worker_processes_ready(self):
|
def _worker_processes_ready(self):
|
||||||
"""
|
"""
|
||||||
@@ -717,11 +746,16 @@ class LLMEngine:
|
|||||||
Yields:
|
Yields:
|
||||||
dict: The generated response.
|
dict: The generated response.
|
||||||
"""
|
"""
|
||||||
llm_logger.info(f"Starting generation for prompt: {prompts}")
|
log_request(RequestLogLevel.CONTENT, message="Starting generation for prompt: {prompts}", prompts=prompts)
|
||||||
try:
|
try:
|
||||||
req_id = self._format_and_add_data(prompts)
|
req_id = self._format_and_add_data(prompts)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
llm_logger.error(f"Error happened while adding request, details={e}, {str(traceback.format_exc())}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] error while adding request: {error}, {traceback}",
|
||||||
|
request_id=prompts.get("request_id"),
|
||||||
|
error=str(e),
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
raise EngineError(str(e), error_code=400)
|
raise EngineError(str(e), error_code=400)
|
||||||
|
|
||||||
# Get the result of the current request
|
# Get the result of the current request
|
||||||
@@ -740,7 +774,7 @@ class LLMEngine:
|
|||||||
output = self.engine.data_processor.process_response_dict(
|
output = self.engine.data_processor.process_response_dict(
|
||||||
result.to_dict(), stream=False, include_stop_str_in_output=False, direct_decode=not stream
|
result.to_dict(), stream=False, include_stop_str_in_output=False, direct_decode=not stream
|
||||||
)
|
)
|
||||||
llm_logger.debug(f"Generate result: {output}")
|
log_request(RequestLogLevel.FULL, message="Generate result: {output}", output=output)
|
||||||
if not stream:
|
if not stream:
|
||||||
yield output
|
yield output
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -39,7 +39,11 @@ from fastdeploy.entrypoints.openai.protocol import (
|
|||||||
StructuralTagResponseFormat,
|
StructuralTagResponseFormat,
|
||||||
ToolCall,
|
ToolCall,
|
||||||
)
|
)
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
from fastdeploy.worker.output import (
|
from fastdeploy.worker.output import (
|
||||||
LogprobsLists,
|
LogprobsLists,
|
||||||
PromptLogprobs,
|
PromptLogprobs,
|
||||||
@@ -313,15 +317,13 @@ class Request:
|
|||||||
), "The parameter `raw_request` is not supported now, please use completion api instead."
|
), "The parameter `raw_request` is not supported now, please use completion api instead."
|
||||||
for key, value in req.metadata.items():
|
for key, value in req.metadata.items():
|
||||||
setattr(request, key, value)
|
setattr(request, key, value)
|
||||||
from fastdeploy.utils import api_server_logger
|
log_request(RequestLogLevel.STAGES, message="The parameter metadata is obsolete.")
|
||||||
|
|
||||||
api_server_logger.warning("The parameter metadata is obsolete.")
|
|
||||||
|
|
||||||
return request
|
return request
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_dict(cls, d: dict):
|
def from_dict(cls, d: dict):
|
||||||
data_processor_logger.debug(f"{d}")
|
log_request(RequestLogLevel.FULL, message="{request}", request=d)
|
||||||
sampling_params: SamplingParams = None
|
sampling_params: SamplingParams = None
|
||||||
pooling_params: PoolingParams = None
|
pooling_params: PoolingParams = None
|
||||||
metrics: RequestMetrics = None
|
metrics: RequestMetrics = None
|
||||||
@@ -352,8 +354,11 @@ class Request:
|
|||||||
ImagePosition(**mm_pos) if not isinstance(mm_pos, ImagePosition) else mm_pos
|
ImagePosition(**mm_pos) if not isinstance(mm_pos, ImagePosition) else mm_pos
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
data_processor_logger.error(
|
log_request_error(
|
||||||
f"Convert mm_positions to ImagePosition error: {e}, {str(traceback.format_exc())}"
|
message="request[{request_id}] Convert mm_positions to ImagePosition error: {error}, {traceback}",
|
||||||
|
request_id=d.get("request_id"),
|
||||||
|
error=str(e),
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
)
|
)
|
||||||
return cls(
|
return cls(
|
||||||
request_id=d["request_id"],
|
request_id=d["request_id"],
|
||||||
|
|||||||
@@ -23,6 +23,11 @@ from fastapi.responses import Response, StreamingResponse
|
|||||||
|
|
||||||
from fastdeploy.engine.args_utils import EngineArgs
|
from fastdeploy.engine.args_utils import EngineArgs
|
||||||
from fastdeploy.engine.engine import LLMEngine
|
from fastdeploy.engine.engine import LLMEngine
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
from fastdeploy.utils import (
|
from fastdeploy.utils import (
|
||||||
FlexibleArgumentParser,
|
FlexibleArgumentParser,
|
||||||
api_server_logger,
|
api_server_logger,
|
||||||
@@ -61,7 +66,7 @@ async def generate(request: dict):
|
|||||||
"""
|
"""
|
||||||
generate stream api
|
generate stream api
|
||||||
"""
|
"""
|
||||||
api_server_logger.info(f"Receive request: {request}")
|
log_request(RequestLogLevel.FULL, message="Receive request: {request}", request=request)
|
||||||
stream = request.get("stream", 0)
|
stream = request.get("stream", 0)
|
||||||
|
|
||||||
if not stream:
|
if not stream:
|
||||||
@@ -72,7 +77,11 @@ async def generate(request: dict):
|
|||||||
output = result
|
output = result
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# 记录完整的异常堆栈信息
|
# 记录完整的异常堆栈信息
|
||||||
api_server_logger.error(f"Error during generation: {e!s}", exc_info=True)
|
log_request_error(
|
||||||
|
message="request[{request_id}] Error during generation: {error}",
|
||||||
|
request_id=request.get("request_id"),
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
# 返回结构化的错误消息并终止流
|
# 返回结构化的错误消息并终止流
|
||||||
output = {"error": str(e), "error_type": e.__class__.__name__}
|
output = {"error": str(e), "error_type": e.__class__.__name__}
|
||||||
return output
|
return output
|
||||||
@@ -84,7 +93,11 @@ async def generate(request: dict):
|
|||||||
yield f"data: {json.dumps(result)}\n\n"
|
yield f"data: {json.dumps(result)}\n\n"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# 记录完整的异常堆栈信息
|
# 记录完整的异常堆栈信息
|
||||||
api_server_logger.error(f"Error during generation: {e!s}", exc_info=True)
|
log_request_error(
|
||||||
|
message="request[{request_id}] Error during generation: {error}",
|
||||||
|
request_id=request.get("request_id"),
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
# 返回结构化的错误消息并终止流
|
# 返回结构化的错误消息并终止流
|
||||||
error_msg = {"error": str(e), "error_type": e.__class__.__name__}
|
error_msg = {"error": str(e), "error_type": e.__class__.__name__}
|
||||||
yield f"data: {json.dumps(error_msg)}\n\n"
|
yield f"data: {json.dumps(error_msg)}\n\n"
|
||||||
|
|||||||
@@ -49,6 +49,11 @@ from fastdeploy.inter_communicator import (
|
|||||||
RearrangeExpertStatus,
|
RearrangeExpertStatus,
|
||||||
ZmqIpcClient,
|
ZmqIpcClient,
|
||||||
)
|
)
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
from fastdeploy.metrics.metrics import main_process_metrics
|
from fastdeploy.metrics.metrics import main_process_metrics
|
||||||
from fastdeploy.platforms import current_platform
|
from fastdeploy.platforms import current_platform
|
||||||
from fastdeploy.trace.constants import LoggingEventName
|
from fastdeploy.trace.constants import LoggingEventName
|
||||||
@@ -362,12 +367,16 @@ class EngineClient:
|
|||||||
|
|
||||||
if "messages" in task:
|
if "messages" in task:
|
||||||
task["messages"] = None
|
task["messages"] = None
|
||||||
api_server_logger.info(f"task['max_tokens']:{task['max_tokens']}")
|
|
||||||
main_process_metrics.request_params_max_tokens.observe(task["max_tokens"])
|
main_process_metrics.request_params_max_tokens.observe(task["max_tokens"])
|
||||||
main_process_metrics.prompt_tokens_total.inc(input_ids_len)
|
main_process_metrics.prompt_tokens_total.inc(input_ids_len)
|
||||||
main_process_metrics.request_prompt_tokens.observe(input_ids_len)
|
main_process_metrics.request_prompt_tokens.observe(input_ids_len)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
api_server_logger.error(f"add_requests error: {e}, {str(traceback.format_exc())}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] add_requests error: {error}, {traceback}",
|
||||||
|
request_id=task.get("request_id"),
|
||||||
|
error=e,
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
raise EngineError(str(e), error_code=400)
|
raise EngineError(str(e), error_code=400)
|
||||||
|
|
||||||
if input_ids_len + min_tokens >= self.max_model_len:
|
if input_ids_len + min_tokens >= self.max_model_len:
|
||||||
@@ -375,14 +384,18 @@ class EngineClient:
|
|||||||
f"Input text is too long, input_ids_len ({input_ids_len}) "
|
f"Input text is too long, input_ids_len ({input_ids_len}) "
|
||||||
f"+ min_tokens({min_tokens}) >= max_model_len({self.max_model_len})"
|
f"+ min_tokens({min_tokens}) >= max_model_len({self.max_model_len})"
|
||||||
)
|
)
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {error_msg}", request_id=task.get("request_id"), error_msg=error_msg
|
||||||
|
)
|
||||||
raise EngineError(error_msg, error_code=400)
|
raise EngineError(error_msg, error_code=400)
|
||||||
|
|
||||||
if input_ids_len > self.max_model_len:
|
if input_ids_len > self.max_model_len:
|
||||||
error_msg = (
|
error_msg = (
|
||||||
f"Length of input token({input_ids_len}) exceeds the limit max_model_len({self.max_model_len})."
|
f"Length of input token({input_ids_len}) exceeds the limit max_model_len({self.max_model_len})."
|
||||||
)
|
)
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {error_msg}", request_id=task.get("request_id"), error_msg=error_msg
|
||||||
|
)
|
||||||
raise EngineError(error_msg, error_code=400)
|
raise EngineError(error_msg, error_code=400)
|
||||||
|
|
||||||
if "stop_seqs_len" in task and task["stop_seqs_len"]:
|
if "stop_seqs_len" in task and task["stop_seqs_len"]:
|
||||||
@@ -393,7 +406,9 @@ class EngineClient:
|
|||||||
f"Length of stop ({stop_seqs_len}) exceeds the limit max_stop_seqs_num({max_stop_seqs_num})."
|
f"Length of stop ({stop_seqs_len}) exceeds the limit max_stop_seqs_num({max_stop_seqs_num})."
|
||||||
"Please reduce the number of stop or set a lager max_stop_seqs_num by `FD_MAX_STOP_SEQS_NUM`"
|
"Please reduce the number of stop or set a lager max_stop_seqs_num by `FD_MAX_STOP_SEQS_NUM`"
|
||||||
)
|
)
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {error_msg}", request_id=task.get("request_id"), error_msg=error_msg
|
||||||
|
)
|
||||||
raise EngineError(error_msg, error_code=400)
|
raise EngineError(error_msg, error_code=400)
|
||||||
stop_seqs_max_len = envs.FD_STOP_SEQS_MAX_LEN
|
stop_seqs_max_len = envs.FD_STOP_SEQS_MAX_LEN
|
||||||
for single_stop_seq_len in stop_seqs_len:
|
for single_stop_seq_len in stop_seqs_len:
|
||||||
@@ -402,18 +417,28 @@ class EngineClient:
|
|||||||
f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit stop_seqs_max_len({stop_seqs_max_len})."
|
f"Length of stop_seqs({single_stop_seq_len}) exceeds the limit stop_seqs_max_len({stop_seqs_max_len})."
|
||||||
"Please reduce the length of stop sequences or set a larger stop_seqs_max_len by `FD_STOP_SEQS_MAX_LEN`"
|
"Please reduce the length of stop sequences or set a larger stop_seqs_max_len by `FD_STOP_SEQS_MAX_LEN`"
|
||||||
)
|
)
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {error_msg}",
|
||||||
|
request_id=task.get("request_id"),
|
||||||
|
error_msg=error_msg,
|
||||||
|
)
|
||||||
raise EngineError(error_msg, error_code=400)
|
raise EngineError(error_msg, error_code=400)
|
||||||
|
|
||||||
task["metrics"]["preprocess_end_time"] = time.time()
|
task["metrics"]["preprocess_end_time"] = time.time()
|
||||||
preprocess_cost_time = task["metrics"]["preprocess_end_time"] - task["metrics"]["preprocess_start_time"]
|
preprocess_cost_time = task["metrics"]["preprocess_end_time"] - task["metrics"]["preprocess_start_time"]
|
||||||
api_server_logger.info(
|
log_request(
|
||||||
f"Cache request with request_id ({task.get('request_id')}), "
|
level=RequestLogLevel.STAGES,
|
||||||
f"preprocess time cost {preprocess_cost_time}"
|
message="Cache request with request_id ({request_id}), preprocess time cost {preprocess_cost_time}",
|
||||||
|
request_id=task.get("request_id"),
|
||||||
|
preprocess_cost_time=preprocess_cost_time,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.valid_parameters(task)
|
self.valid_parameters(task)
|
||||||
api_server_logger.debug(f"Receive task: {task}")
|
log_request(
|
||||||
|
level=RequestLogLevel.FULL,
|
||||||
|
message="Receive task: {task}",
|
||||||
|
task=task,
|
||||||
|
)
|
||||||
n = task.get("n", 1)
|
n = task.get("n", 1)
|
||||||
try:
|
try:
|
||||||
request_id_idx = task.get("request_id")
|
request_id_idx = task.get("request_id")
|
||||||
@@ -433,7 +458,12 @@ class EngineClient:
|
|||||||
tracing.TraceSpanName.PREPROCESSING, task.get("request_id").split("_")[0], thread_finish_flag=True
|
tracing.TraceSpanName.PREPROCESSING, task.get("request_id").split("_")[0], thread_finish_flag=True
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
api_server_logger.error(f"zmq_client send task error: {e}, {str(traceback.format_exc())}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] zmq_client send task error: {error}, {traceback}",
|
||||||
|
request_id=task.get("request_id"),
|
||||||
|
error=e,
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
raise EngineError(str(e), error_code=400)
|
raise EngineError(str(e), error_code=400)
|
||||||
|
|
||||||
def _send_task(self, task):
|
def _send_task(self, task):
|
||||||
@@ -455,8 +485,11 @@ class EngineClient:
|
|||||||
|
|
||||||
if data.get("max_tokens") is not None:
|
if data.get("max_tokens") is not None:
|
||||||
if data["max_tokens"] < 1 or data["max_tokens"] >= self.max_model_len:
|
if data["max_tokens"] < 1 or data["max_tokens"] >= self.max_model_len:
|
||||||
api_server_logger.error(
|
log_request_error(
|
||||||
f"req_id:{data['request_id']}, max_tokens must be defined [1, {self.max_model_len}), but now it's {data['max_tokens']}."
|
message="req_id:{request_id}, max_tokens must be defined [1, {max_model_len}), but now it's {max_tokens}.",
|
||||||
|
request_id=data["request_id"],
|
||||||
|
max_model_len=self.max_model_len,
|
||||||
|
max_tokens=data["max_tokens"],
|
||||||
)
|
)
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"max_tokens can be defined [1, {self.max_model_len}), but now it's {data['max_tokens']}."
|
f"max_tokens can be defined [1, {self.max_model_len}), but now it's {data['max_tokens']}."
|
||||||
@@ -467,14 +500,18 @@ class EngineClient:
|
|||||||
raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 0")
|
raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 0")
|
||||||
if data["reasoning_max_tokens"] > data["max_tokens"]:
|
if data["reasoning_max_tokens"] > data["max_tokens"]:
|
||||||
data["reasoning_max_tokens"] = data["max_tokens"]
|
data["reasoning_max_tokens"] = data["max_tokens"]
|
||||||
api_server_logger.warning(
|
log_request(
|
||||||
f"req_id: {data['request_id']}, reasoning_max_tokens exceeds max_tokens, the value of reasoning_max_tokens will be adjusted to {data['max_tokens']}"
|
level=RequestLogLevel.STAGES,
|
||||||
|
message="req_id: {request_id}, reasoning_max_tokens exceeds max_tokens, the value of reasoning_max_tokens will be adjusted to {max_tokens}",
|
||||||
|
request_id=data["request_id"],
|
||||||
|
max_tokens=data["max_tokens"],
|
||||||
)
|
)
|
||||||
if data.get("reasoning_effort") is not None:
|
if data.get("reasoning_effort") is not None:
|
||||||
data["reasoning_max_tokens"] = None
|
data["reasoning_max_tokens"] = None
|
||||||
api_server_logger.warning(
|
log_request(
|
||||||
f"req_id: {data['request_id']}, reasoning_max_tokens and reasoning_effort are both set, "
|
level=RequestLogLevel.STAGES,
|
||||||
f"enable_thinking will be disabled."
|
message="req_id: {request_id}, reasoning_max_tokens and reasoning_effort are both set, enable_thinking will be disabled.",
|
||||||
|
request_id=data["request_id"],
|
||||||
)
|
)
|
||||||
|
|
||||||
if data.get("response_max_tokens") is not None:
|
if data.get("response_max_tokens") is not None:
|
||||||
@@ -493,7 +530,9 @@ class EngineClient:
|
|||||||
is_chat = True
|
is_chat = True
|
||||||
if not self.enable_logprob:
|
if not self.enable_logprob:
|
||||||
err_msg = "Logprobs is disabled, please enable it in startup config."
|
err_msg = "Logprobs is disabled, please enable it in startup config."
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ParameterError("logprobs", err_msg)
|
raise ParameterError("logprobs", err_msg)
|
||||||
top_logprobs = data.get("top_logprobs")
|
top_logprobs = data.get("top_logprobs")
|
||||||
elif isinstance(logprobs, int):
|
elif isinstance(logprobs, int):
|
||||||
@@ -506,11 +545,15 @@ class EngineClient:
|
|||||||
max_logprobs = self.ori_vocab_size
|
max_logprobs = self.ori_vocab_size
|
||||||
if max_logprobs < -1:
|
if max_logprobs < -1:
|
||||||
err_msg = f"Invalid 'max_logprobs': must be >= -1, got {max_logprobs}."
|
err_msg = f"Invalid 'max_logprobs': must be >= -1, got {max_logprobs}."
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ValueError("max_logprobs", err_msg)
|
raise ValueError("max_logprobs", err_msg)
|
||||||
if max_logprobs > self.ori_vocab_size:
|
if max_logprobs > self.ori_vocab_size:
|
||||||
err_msg = f"Invalid 'max_logprobs': must be <= vocab_size {self.ori_vocab_size}, got {max_logprobs}."
|
err_msg = f"Invalid 'max_logprobs': must be <= vocab_size {self.ori_vocab_size}, got {max_logprobs}."
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ValueError("max_logprobs", err_msg)
|
raise ValueError("max_logprobs", err_msg)
|
||||||
|
|
||||||
prompt_logprobs = data.get("prompt_logprobs", None)
|
prompt_logprobs = data.get("prompt_logprobs", None)
|
||||||
@@ -518,41 +561,55 @@ class EngineClient:
|
|||||||
if prompt_logprobs is not None:
|
if prompt_logprobs is not None:
|
||||||
if not self.enable_logprob:
|
if not self.enable_logprob:
|
||||||
err_msg = "`enable_logprob` is disabled, please enable it in startup config."
|
err_msg = "`enable_logprob` is disabled, please enable it in startup config."
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ParameterError("prompt_logprobs", err_msg)
|
raise ParameterError("prompt_logprobs", err_msg)
|
||||||
|
|
||||||
if not envs.FD_USE_GET_SAVE_OUTPUT_V1:
|
if not envs.FD_USE_GET_SAVE_OUTPUT_V1:
|
||||||
err_msg = "prompt_logprobs is not support when FD_USE_GET_SAVE_OUTPUT_V1 is disabled."
|
err_msg = "prompt_logprobs is not support when FD_USE_GET_SAVE_OUTPUT_V1 is disabled."
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ParameterError("prompt_logprobs", err_msg)
|
raise ParameterError("prompt_logprobs", err_msg)
|
||||||
|
|
||||||
if self.enable_prefix_caching:
|
if self.enable_prefix_caching:
|
||||||
err_msg = "prompt_logprobs is not support when prefix caching is enabled."
|
err_msg = "prompt_logprobs is not support when prefix caching is enabled."
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ParameterError("prompt_logprobs", err_msg)
|
raise ParameterError("prompt_logprobs", err_msg)
|
||||||
|
|
||||||
if prompt_logprobs == -1 and self.ori_vocab_size > max_logprobs:
|
if prompt_logprobs == -1 and self.ori_vocab_size > max_logprobs:
|
||||||
err_msg = f"The requested value of ({self.ori_vocab_size}) for prompt_logprobs (-1) exceeds the maximum allowed value of ({max_logprobs})"
|
err_msg = f"The requested value of ({self.ori_vocab_size}) for prompt_logprobs (-1) exceeds the maximum allowed value of ({max_logprobs})"
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ValueError("prompt_logprobs", err_msg)
|
raise ValueError("prompt_logprobs", err_msg)
|
||||||
|
|
||||||
if prompt_logprobs < -1:
|
if prompt_logprobs < -1:
|
||||||
err_msg = (
|
err_msg = (
|
||||||
f"prompt_logprobs must be a non-negative value or -1; the current value is {prompt_logprobs}."
|
f"prompt_logprobs must be a non-negative value or -1; the current value is {prompt_logprobs}."
|
||||||
)
|
)
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ValueError("prompt_logprobs", err_msg)
|
raise ValueError("prompt_logprobs", err_msg)
|
||||||
|
|
||||||
if prompt_logprobs > max_logprobs:
|
if prompt_logprobs > max_logprobs:
|
||||||
err_msg = f"Number of prompt_logprobs requested ({prompt_logprobs}) exceeds maximum allowed value ({max_logprobs})."
|
err_msg = f"Number of prompt_logprobs requested ({prompt_logprobs}) exceeds maximum allowed value ({max_logprobs})."
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ValueError("prompt_logprobs", err_msg)
|
raise ValueError("prompt_logprobs", err_msg)
|
||||||
|
|
||||||
# enable_logprob
|
# enable_logprob
|
||||||
if top_logprobs is not None:
|
if top_logprobs is not None:
|
||||||
if not self.enable_logprob:
|
if not self.enable_logprob:
|
||||||
err_msg = "Logprobs is disabled, please enable it in startup config."
|
err_msg = "Logprobs is disabled, please enable it in startup config."
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ParameterError("top_logprobs" if is_chat else "logprobs", err_msg)
|
raise ParameterError("top_logprobs" if is_chat else "logprobs", err_msg)
|
||||||
|
|
||||||
if not isinstance(top_logprobs, int):
|
if not isinstance(top_logprobs, int):
|
||||||
@@ -560,28 +617,38 @@ class EngineClient:
|
|||||||
err_msg = (
|
err_msg = (
|
||||||
f"Invalid type for {'top_logprobs' if is_chat else 'logprobs'}: expected int but got {err_type}."
|
f"Invalid type for {'top_logprobs' if is_chat else 'logprobs'}: expected int but got {err_type}."
|
||||||
)
|
)
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ParameterError("top_logprobs" if is_chat else "logprobs", err_msg)
|
raise ParameterError("top_logprobs" if is_chat else "logprobs", err_msg)
|
||||||
|
|
||||||
if top_logprobs > max_logprobs:
|
if top_logprobs > max_logprobs:
|
||||||
err_msg = f"Number of {'top_logprobs' if is_chat else 'logprobs'} requested ({top_logprobs}) exceeds maximum allowed value ({max_logprobs})."
|
err_msg = f"Number of {'top_logprobs' if is_chat else 'logprobs'} requested ({top_logprobs}) exceeds maximum allowed value ({max_logprobs})."
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ValueError("top_logprobs" if is_chat else "logprobs", err_msg)
|
raise ValueError("top_logprobs" if is_chat else "logprobs", err_msg)
|
||||||
|
|
||||||
if not envs.FD_USE_GET_SAVE_OUTPUT_V1:
|
if not envs.FD_USE_GET_SAVE_OUTPUT_V1:
|
||||||
if top_logprobs < 0 or top_logprobs > max_logprobs:
|
if top_logprobs < 0 or top_logprobs > max_logprobs:
|
||||||
err_msg = f"{'top_logprobs' if is_chat else 'logprobs'} must be between 0 and {max_logprobs}; the current value is {top_logprobs}."
|
err_msg = f"{'top_logprobs' if is_chat else 'logprobs'} must be between 0 and {max_logprobs}; the current value is {top_logprobs}."
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ValueError("top_logprobs" if is_chat else "logprobs", err_msg)
|
raise ValueError("top_logprobs" if is_chat else "logprobs", err_msg)
|
||||||
else:
|
else:
|
||||||
if top_logprobs == -1 and self.ori_vocab_size > max_logprobs:
|
if top_logprobs == -1 and self.ori_vocab_size > max_logprobs:
|
||||||
err_msg = f"The requested value of ({self.ori_vocab_size}) for {'top_logprobs' if is_chat else 'logprobs'} (-1) exceeds the maximum allowed value of ({max_logprobs})"
|
err_msg = f"The requested value of ({self.ori_vocab_size}) for {'top_logprobs' if is_chat else 'logprobs'} (-1) exceeds the maximum allowed value of ({max_logprobs})"
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ValueError("top_logprobs" if is_chat else "logprobs", err_msg)
|
raise ValueError("top_logprobs" if is_chat else "logprobs", err_msg)
|
||||||
|
|
||||||
if top_logprobs < -1:
|
if top_logprobs < -1:
|
||||||
err_msg = f"{'top_logprobs' if is_chat else 'logprobs'} must be a non-negative value or -1; the current value is {top_logprobs}."
|
err_msg = f"{'top_logprobs' if is_chat else 'logprobs'} must be a non-negative value or -1; the current value is {top_logprobs}."
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {err_msg}", request_id=data.get("request_id"), err_msg=err_msg
|
||||||
|
)
|
||||||
raise ValueError("top_logprobs" if is_chat else "logprobs", err_msg)
|
raise ValueError("top_logprobs" if is_chat else "logprobs", err_msg)
|
||||||
|
|
||||||
def check_health(self, time_interval_threashold=30):
|
def check_health(self, time_interval_threashold=30):
|
||||||
@@ -617,12 +684,21 @@ class EngineClient:
|
|||||||
return response
|
return response
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
error_response = ControlResponse(request_id, 500, "Timeout waiting for control method response")
|
error_response = ControlResponse(request_id, 500, "Timeout waiting for control method response")
|
||||||
api_server_logger.error(f"Control request timed out: {error_response}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] Control request timed out: {error_response}",
|
||||||
|
request_id=request_id,
|
||||||
|
error_response=error_response,
|
||||||
|
)
|
||||||
return error_response
|
return error_response
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
api_server_logger.error(f"Unknown error in control method: {str(e)}\n{traceback.format_exc()}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] Unknown error in control method: {error}\n{traceback}",
|
||||||
|
request_id=request_id,
|
||||||
|
error=str(e),
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
error_response = ControlResponse(request_id, 500, str(e))
|
error_response = ControlResponse(request_id, 500, str(e))
|
||||||
return error_response
|
return error_response
|
||||||
|
|
||||||
@@ -1024,7 +1100,11 @@ class EngineClient:
|
|||||||
|
|
||||||
async def abort(self, request_id, n=1) -> None:
|
async def abort(self, request_id, n=1) -> None:
|
||||||
if envs.FD_ENABLE_REQUEST_DISCONNECT_STOP_INFERENCE:
|
if envs.FD_ENABLE_REQUEST_DISCONNECT_STOP_INFERENCE:
|
||||||
api_server_logger.info(f"abort request_id:{request_id}")
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="abort request_id: {request_id}",
|
||||||
|
request_id=request_id,
|
||||||
|
)
|
||||||
if n <= 0:
|
if n <= 0:
|
||||||
api_server_logger.warning("Abort function called with non-positive n: %d. No requests aborted.", n)
|
api_server_logger.warning("Abort function called with non-positive n: %d. No requests aborted.", n)
|
||||||
return
|
return
|
||||||
@@ -1044,7 +1124,11 @@ class EngineClient:
|
|||||||
}
|
}
|
||||||
self._send_task(data)
|
self._send_task(data)
|
||||||
|
|
||||||
api_server_logger.info("Aborted request(s) %s.", ",".join(request_ids))
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Aborted request(s) {request_ids}.",
|
||||||
|
request_ids=",".join(request_ids),
|
||||||
|
)
|
||||||
|
|
||||||
def process_messages(self, messages):
|
def process_messages(self, messages):
|
||||||
for message in messages:
|
for message in messages:
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ from fastdeploy.entrypoints.chat_utils import load_chat_template
|
|||||||
from fastdeploy.entrypoints.openai.protocol import ChatCompletionToolsParam
|
from fastdeploy.entrypoints.openai.protocol import ChatCompletionToolsParam
|
||||||
from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
|
from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
|
||||||
from fastdeploy.input.utils import validate_model_path
|
from fastdeploy.input.utils import validate_model_path
|
||||||
|
from fastdeploy.logger.request_logger import log_request_error
|
||||||
from fastdeploy.utils import (
|
from fastdeploy.utils import (
|
||||||
deprecated_kwargs_warning,
|
deprecated_kwargs_warning,
|
||||||
llm_logger,
|
llm_logger,
|
||||||
@@ -139,7 +140,11 @@ class LLM:
|
|||||||
continue
|
continue
|
||||||
self.req_output[request_id].add(result)
|
self.req_output[request_id].add(result)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
llm_logger.error(f"Unexpected error happened: {e}, {traceback.format_exc()!s}")
|
log_request_error(
|
||||||
|
message="Unexpected error happened: {error}, {traceback}",
|
||||||
|
error=e,
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
|
|
||||||
def generate(
|
def generate(
|
||||||
self,
|
self,
|
||||||
@@ -437,7 +442,11 @@ class LLM:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
llm_logger.error(f"Error building sample logprobs from LogprobsLists: {e}, {str(traceback.format_exc())}")
|
log_request_error(
|
||||||
|
message="Error building sample logprobs from LogprobsLists: {error}, {traceback}",
|
||||||
|
error=e,
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
|
|
||||||
def _build_prompt_logprobs(
|
def _build_prompt_logprobs(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -73,6 +73,11 @@ from fastdeploy.entrypoints.openai.v1.serving_completion import (
|
|||||||
OpenAIServingCompletion as OpenAIServingCompletionV1,
|
OpenAIServingCompletion as OpenAIServingCompletionV1,
|
||||||
)
|
)
|
||||||
from fastdeploy.envs import environment_variables
|
from fastdeploy.envs import environment_variables
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
from fastdeploy.metrics.metrics import get_filtered_metrics
|
from fastdeploy.metrics.metrics import get_filtered_metrics
|
||||||
from fastdeploy.utils import (
|
from fastdeploy.utils import (
|
||||||
ExceptionHandler,
|
ExceptionHandler,
|
||||||
@@ -325,7 +330,11 @@ async def connection_manager():
|
|||||||
await asyncio.wait_for(connection_semaphore.acquire(), timeout=0.001)
|
await asyncio.wait_for(connection_semaphore.acquire(), timeout=0.001)
|
||||||
yield
|
yield
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
api_server_logger.info(f"Reach max request concurrency, semaphore status: {connection_semaphore.status()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Reach max request concurrency, semaphore status: {status}",
|
||||||
|
status=connection_semaphore.status(),
|
||||||
|
)
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=429, detail=f"Too many requests,current max concurrency is {args.max_concurrency}"
|
status_code=429, detail=f"Too many requests,current max concurrency is {args.max_concurrency}"
|
||||||
)
|
)
|
||||||
@@ -545,7 +554,7 @@ async def create_chat_completion(request: ChatCompletionRequest, req: Request):
|
|||||||
"""
|
"""
|
||||||
Create a chat completion for the provided prompt and parameters.
|
Create a chat completion for the provided prompt and parameters.
|
||||||
"""
|
"""
|
||||||
api_server_logger.debug(f"Chat Received request: {request.model_dump_json()}")
|
log_request(RequestLogLevel.FULL, message="Chat Received request: {request}", request=request.model_dump_json())
|
||||||
if envs.TRACES_ENABLE:
|
if envs.TRACES_ENABLE:
|
||||||
if req.headers:
|
if req.headers:
|
||||||
headers = dict(req.headers)
|
headers = dict(req.headers)
|
||||||
@@ -572,7 +581,11 @@ async def create_chat_completion(request: ChatCompletionRequest, req: Request):
|
|||||||
return StreamingResponse(content=wrapped_generator(), media_type="text/event-stream")
|
return StreamingResponse(content=wrapped_generator(), media_type="text/event-stream")
|
||||||
|
|
||||||
except HTTPException as e:
|
except HTTPException as e:
|
||||||
api_server_logger.error(f"Error in chat completion: {str(e)}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] Error in chat completion: {error}",
|
||||||
|
request_id=getattr(request, "request_id", None),
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
return JSONResponse(status_code=e.status_code, content={"detail": e.detail})
|
return JSONResponse(status_code=e.status_code, content={"detail": e.detail})
|
||||||
|
|
||||||
|
|
||||||
@@ -582,7 +595,9 @@ async def create_completion(request: CompletionRequest, req: Request):
|
|||||||
"""
|
"""
|
||||||
Create a completion for the provided prompt and parameters.
|
Create a completion for the provided prompt and parameters.
|
||||||
"""
|
"""
|
||||||
api_server_logger.info(f"Completion Received request: {request.model_dump_json()}")
|
log_request(
|
||||||
|
RequestLogLevel.FULL, message="Completion Received request: {request}", request=request.model_dump_json()
|
||||||
|
)
|
||||||
if envs.TRACES_ENABLE:
|
if envs.TRACES_ENABLE:
|
||||||
if req.headers:
|
if req.headers:
|
||||||
headers = dict(req.headers)
|
headers = dict(req.headers)
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ from pydantic import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from fastdeploy.engine.pooling_params import PoolingParams
|
from fastdeploy.engine.pooling_params import PoolingParams
|
||||||
|
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
|
||||||
from fastdeploy.worker.output import PromptLogprobs, SpeculateMetrics
|
from fastdeploy.worker.output import PromptLogprobs, SpeculateMetrics
|
||||||
|
|
||||||
|
|
||||||
@@ -758,9 +759,7 @@ class ChatCompletionRequest(BaseModel):
|
|||||||
), "The parameter `raw_request` is not supported now, please use completion api instead."
|
), "The parameter `raw_request` is not supported now, please use completion api instead."
|
||||||
for key, value in self.metadata.items():
|
for key, value in self.metadata.items():
|
||||||
req_dict[key] = value
|
req_dict[key] = value
|
||||||
from fastdeploy.utils import api_server_logger
|
log_request(RequestLogLevel.STAGES, message="The parameter metadata is obsolete.")
|
||||||
|
|
||||||
api_server_logger.warning("The parameter metadata is obsolete.")
|
|
||||||
for key, value in self.dict().items():
|
for key, value in self.dict().items():
|
||||||
if value is not None:
|
if value is not None:
|
||||||
req_dict[key] = value
|
req_dict[key] = value
|
||||||
|
|||||||
@@ -44,6 +44,11 @@ from fastdeploy.entrypoints.openai.protocol import (
|
|||||||
UsageInfo,
|
UsageInfo,
|
||||||
)
|
)
|
||||||
from fastdeploy.entrypoints.openai.response_processors import ChatResponseProcessor
|
from fastdeploy.entrypoints.openai.response_processors import ChatResponseProcessor
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
from fastdeploy.metrics.metrics import main_process_metrics
|
from fastdeploy.metrics.metrics import main_process_metrics
|
||||||
from fastdeploy.trace.constants import LoggingEventName
|
from fastdeploy.trace.constants import LoggingEventName
|
||||||
from fastdeploy.trace.trace_logger import print as trace_print
|
from fastdeploy.trace.trace_logger import print as trace_print
|
||||||
@@ -112,14 +117,16 @@ class OpenAIServingChat:
|
|||||||
err_msg = (
|
err_msg = (
|
||||||
f"Only master node can accept completion request, please send request to master node: {self.master_ip}"
|
f"Only master node can accept completion request, please send request to master node: {self.master_ip}"
|
||||||
)
|
)
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(message="request[{request_id}] {error}", request_id=request.request_id, error=err_msg)
|
||||||
return ErrorResponse(error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR))
|
return ErrorResponse(error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR))
|
||||||
|
|
||||||
if self.models:
|
if self.models:
|
||||||
is_supported, request.model = self.models.is_supported_model(request.model)
|
is_supported, request.model = self.models.is_supported_model(request.model)
|
||||||
if not is_supported:
|
if not is_supported:
|
||||||
err_msg = f"Unsupported model: [{request.model}], support [{', '.join([x.name for x in self.models.model_paths])}] or default"
|
err_msg = f"Unsupported model: [{request.model}], support [{', '.join([x.name for x in self.models.model_paths])}] or default"
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {error}", request_id=request.request_id, error=err_msg
|
||||||
|
)
|
||||||
return ErrorResponse(
|
return ErrorResponse(
|
||||||
error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT)
|
error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT)
|
||||||
)
|
)
|
||||||
@@ -129,7 +136,11 @@ class OpenAIServingChat:
|
|||||||
await self.engine_client.semaphore.acquire()
|
await self.engine_client.semaphore.acquire()
|
||||||
else:
|
else:
|
||||||
await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
|
await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
|
||||||
api_server_logger.info(f"current {self.engine_client.semaphore.status()}")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="semaphore status: {status}",
|
||||||
|
status=self.engine_client.semaphore.status(),
|
||||||
|
)
|
||||||
|
|
||||||
if request.request_id is not None:
|
if request.request_id is not None:
|
||||||
request_id = request.request_id
|
request_id = request.request_id
|
||||||
@@ -141,7 +152,11 @@ class OpenAIServingChat:
|
|||||||
request_id = f"chatcmpl-{uuid.uuid4()}"
|
request_id = f"chatcmpl-{uuid.uuid4()}"
|
||||||
tracing.trace_req_start(rid=request_id, trace_content=request.trace_context, role="FastDeploy")
|
tracing.trace_req_start(rid=request_id, trace_content=request.trace_context, role="FastDeploy")
|
||||||
del request.trace_context
|
del request.trace_context
|
||||||
api_server_logger.info(f"create chat completion request: {request_id}")
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="create chat completion request: {request_id}",
|
||||||
|
request_id=request_id,
|
||||||
|
)
|
||||||
prompt_tokens = None
|
prompt_tokens = None
|
||||||
max_tokens = None
|
max_tokens = None
|
||||||
try:
|
try:
|
||||||
@@ -156,14 +171,19 @@ class OpenAIServingChat:
|
|||||||
if isinstance(prompt_token_ids, np.ndarray):
|
if isinstance(prompt_token_ids, np.ndarray):
|
||||||
prompt_token_ids = prompt_token_ids.tolist()
|
prompt_token_ids = prompt_token_ids.tolist()
|
||||||
except ParameterError as e:
|
except ParameterError as e:
|
||||||
api_server_logger.error(f"request[{request_id}] generator error: {str(e)}, {e.message}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] generator error: {error}, {error_message}",
|
||||||
|
request_id=request_id,
|
||||||
|
error=str(e),
|
||||||
|
error_message=e.message,
|
||||||
|
)
|
||||||
self.engine_client.semaphore.release()
|
self.engine_client.semaphore.release()
|
||||||
return ErrorResponse(
|
return ErrorResponse(
|
||||||
error=ErrorInfo(message=str(e.message), type=ErrorType.INVALID_REQUEST_ERROR, param=e.param)
|
error=ErrorInfo(message=str(e.message), type=ErrorType.INVALID_REQUEST_ERROR, param=e.param)
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"request[{request_id}] generator error: {str(e)}, {str(traceback.format_exc())}"
|
error_msg = f"request[{request_id}] generator error: {str(e)}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
self.engine_client.semaphore.release()
|
self.engine_client.semaphore.release()
|
||||||
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INVALID_REQUEST_ERROR))
|
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INVALID_REQUEST_ERROR))
|
||||||
|
|
||||||
@@ -178,12 +198,12 @@ class OpenAIServingChat:
|
|||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"request[{request_id}]full generator error: {str(e)}, {str(traceback.format_exc())}"
|
error_msg = f"request[{request_id}]full generator error: {str(e)}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
|
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
|
||||||
except asyncio.CancelledError as e:
|
except asyncio.CancelledError as e:
|
||||||
await self.engine_client.abort(f"{request_id}_0", 1 if request.n is None else request.n)
|
await self.engine_client.abort(f"{request_id}_0", 1 if request.n is None else request.n)
|
||||||
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
|
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return ErrorResponse(
|
return ErrorResponse(
|
||||||
error=ErrorInfo(message=error_msg, type=ErrorType.INVALID_REQUEST_ERROR, code=ErrorCode.CLIENT_ABORTED)
|
error=ErrorInfo(message=error_msg, type=ErrorType.INVALID_REQUEST_ERROR, code=ErrorCode.CLIENT_ABORTED)
|
||||||
)
|
)
|
||||||
@@ -192,13 +212,13 @@ class OpenAIServingChat:
|
|||||||
f"request[{request_id}] waiting error: {str(e)}, {str(traceback.format_exc())}, "
|
f"request[{request_id}] waiting error: {str(e)}, {str(traceback.format_exc())}, "
|
||||||
f"max waiting time: {self.max_waiting_time}"
|
f"max waiting time: {self.max_waiting_time}"
|
||||||
)
|
)
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return ErrorResponse(
|
return ErrorResponse(
|
||||||
error=ErrorInfo(message=error_msg, type=ErrorType.TIMEOUT_ERROR, code=ErrorCode.TIMEOUT)
|
error=ErrorInfo(message=error_msg, type=ErrorType.TIMEOUT_ERROR, code=ErrorCode.TIMEOUT)
|
||||||
)
|
)
|
||||||
|
|
||||||
def _create_streaming_error_response(self, message: str) -> str:
|
def _create_streaming_error_response(self, message: str) -> str:
|
||||||
api_server_logger.error(message)
|
log_request_error(message=message)
|
||||||
error_response = ErrorResponse(error=ErrorInfo(message=message, type=ErrorType.INTERNAL_ERROR))
|
error_response = ErrorResponse(error=ErrorInfo(message=message, type=ErrorType.INTERNAL_ERROR))
|
||||||
return error_response.model_dump_json()
|
return error_response.model_dump_json()
|
||||||
|
|
||||||
@@ -249,7 +269,9 @@ class OpenAIServingChat:
|
|||||||
choices=[],
|
choices=[],
|
||||||
model=model_name,
|
model=model_name,
|
||||||
)
|
)
|
||||||
api_server_logger.info(f"create chat completion request: {request_id}")
|
log_request(
|
||||||
|
RequestLogLevel.LIFECYCLE, message="create chat completion request: {request_id}", request_id=request_id
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
dealer, response_queue = await self.engine_client.connection_manager.get_connection(
|
dealer, response_queue = await self.engine_client.connection_manager.get_connection(
|
||||||
@@ -372,7 +394,12 @@ class OpenAIServingChat:
|
|||||||
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=0),
|
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=0),
|
||||||
)
|
)
|
||||||
yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n"
|
yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n"
|
||||||
api_server_logger.info(f"Chat Streaming response send_idx 0: {chunk.model_dump_json()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Chat Streaming response send_idx 0: request_id={request_id}, completion_tokens={completion_tokens}",
|
||||||
|
request_id=request_id,
|
||||||
|
completion_tokens=0,
|
||||||
|
)
|
||||||
first_iteration = False
|
first_iteration = False
|
||||||
|
|
||||||
output = res["outputs"]
|
output = res["outputs"]
|
||||||
@@ -497,7 +524,14 @@ class OpenAIServingChat:
|
|||||||
chunk.choices = choices
|
chunk.choices = choices
|
||||||
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
|
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
|
||||||
if res["finished"]:
|
if res["finished"]:
|
||||||
api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Chat Streaming response last send: request_id={request_id}, finish_reason={finish_reason}, completion_tokens={completion_tokens}, logprobs={logprobs}",
|
||||||
|
request_id=request_id,
|
||||||
|
finish_reason=choice.finish_reason,
|
||||||
|
completion_tokens=previous_num_tokens[idx],
|
||||||
|
logprobs=logprobs_res,
|
||||||
|
)
|
||||||
choices = []
|
choices = []
|
||||||
|
|
||||||
if include_usage:
|
if include_usage:
|
||||||
@@ -525,7 +559,7 @@ class OpenAIServingChat:
|
|||||||
except asyncio.CancelledError as e:
|
except asyncio.CancelledError as e:
|
||||||
await self.engine_client.abort(f"{request_id}_0", 1 if request.n is None else request.n)
|
await self.engine_client.abort(f"{request_id}_0", 1 if request.n is None else request.n)
|
||||||
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
|
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_data = self._create_streaming_error_response(
|
error_data = self._create_streaming_error_response(
|
||||||
f"request[{request_id}] generate stream error: {str(e)}, {str(traceback.format_exc())}"
|
f"request[{request_id}] generate stream error: {str(e)}, {str(traceback.format_exc())}"
|
||||||
@@ -536,7 +570,12 @@ class OpenAIServingChat:
|
|||||||
tracing.trace_req_finish(request_id)
|
tracing.trace_req_finish(request_id)
|
||||||
await self.engine_client.connection_manager.cleanup_request(request_id)
|
await self.engine_client.connection_manager.cleanup_request(request_id)
|
||||||
self.engine_client.semaphore.release()
|
self.engine_client.semaphore.release()
|
||||||
api_server_logger.info(f"release {request_id} {self.engine_client.semaphore.status()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.STAGES,
|
||||||
|
message="release {request_id} {status}",
|
||||||
|
request_id=request_id,
|
||||||
|
status=self.engine_client.semaphore.status(),
|
||||||
|
)
|
||||||
yield "data: [DONE]\n\n"
|
yield "data: [DONE]\n\n"
|
||||||
|
|
||||||
async def chat_completion_full_generator(
|
async def chat_completion_full_generator(
|
||||||
@@ -704,7 +743,9 @@ class OpenAIServingChat:
|
|||||||
tracing.trace_req_finish(request_id)
|
tracing.trace_req_finish(request_id)
|
||||||
await self.engine_client.connection_manager.cleanup_request(request_id)
|
await self.engine_client.connection_manager.cleanup_request(request_id)
|
||||||
self.engine_client.semaphore.release()
|
self.engine_client.semaphore.release()
|
||||||
api_server_logger.info(f"release {self.engine_client.semaphore.status()}")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES, message="release {status}", status=self.engine_client.semaphore.status()
|
||||||
|
)
|
||||||
|
|
||||||
num_prompt_tokens = len(prompt_token_ids)
|
num_prompt_tokens = len(prompt_token_ids)
|
||||||
num_generated_tokens = sum(previous_num_tokens)
|
num_generated_tokens = sum(previous_num_tokens)
|
||||||
@@ -731,7 +772,7 @@ class OpenAIServingChat:
|
|||||||
choices=choices,
|
choices=choices,
|
||||||
usage=usage,
|
usage=usage,
|
||||||
)
|
)
|
||||||
api_server_logger.info(f"Chat response: {res.model_dump_json()}")
|
log_request(RequestLogLevel.CONTENT, message="Chat response: {response}", response=res.model_dump_json())
|
||||||
return res
|
return res
|
||||||
|
|
||||||
async def _create_chat_completion_choice(
|
async def _create_chat_completion_choice(
|
||||||
@@ -904,7 +945,7 @@ class OpenAIServingChat:
|
|||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Error in _build_logprobs_response: {e}, {str(traceback.format_exc())}"
|
error_msg = f"Error in _build_logprobs_response: {e}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _build_prompt_logprobs(
|
def _build_prompt_logprobs(
|
||||||
|
|||||||
@@ -41,6 +41,11 @@ from fastdeploy.entrypoints.openai.protocol import (
|
|||||||
PromptTokenUsageInfo,
|
PromptTokenUsageInfo,
|
||||||
UsageInfo,
|
UsageInfo,
|
||||||
)
|
)
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
from fastdeploy.trace.constants import LoggingEventName
|
from fastdeploy.trace.constants import LoggingEventName
|
||||||
from fastdeploy.trace.trace_logger import print as trace_print
|
from fastdeploy.trace.trace_logger import print as trace_print
|
||||||
from fastdeploy.utils import (
|
from fastdeploy.utils import (
|
||||||
@@ -91,13 +96,15 @@ class OpenAIServingCompletion:
|
|||||||
err_msg = (
|
err_msg = (
|
||||||
f"Only master node can accept completion request, please send request to master node: {self.master_ip}"
|
f"Only master node can accept completion request, please send request to master node: {self.master_ip}"
|
||||||
)
|
)
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(message="request[{request_id}] {error}", request_id=request.request_id, error=err_msg)
|
||||||
return ErrorResponse(error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR))
|
return ErrorResponse(error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR))
|
||||||
if self.models:
|
if self.models:
|
||||||
is_supported, request.model = self.models.is_supported_model(request.model)
|
is_supported, request.model = self.models.is_supported_model(request.model)
|
||||||
if not is_supported:
|
if not is_supported:
|
||||||
err_msg = f"Unsupported model: [{request.model}], support [{', '.join([x.name for x in self.models.model_paths])}] or default"
|
err_msg = f"Unsupported model: [{request.model}], support [{', '.join([x.name for x in self.models.model_paths])}] or default"
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(
|
||||||
|
message="request[{request_id}] {error}", request_id=request.request_id, error=err_msg
|
||||||
|
)
|
||||||
return ErrorResponse(
|
return ErrorResponse(
|
||||||
error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT)
|
error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT)
|
||||||
)
|
)
|
||||||
@@ -110,7 +117,11 @@ class OpenAIServingCompletion:
|
|||||||
request_id = f"cmpl-{request.user}-{uuid.uuid4()}"
|
request_id = f"cmpl-{request.user}-{uuid.uuid4()}"
|
||||||
else:
|
else:
|
||||||
request_id = f"cmpl-{uuid.uuid4()}"
|
request_id = f"cmpl-{uuid.uuid4()}"
|
||||||
api_server_logger.info(f"Initialize request {request_id}: {request}")
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Initialize request {request_id}",
|
||||||
|
request_id=request_id,
|
||||||
|
)
|
||||||
tracing.trace_req_start(rid=request_id, trace_content=request.trace_context, role="FastDeploy")
|
tracing.trace_req_start(rid=request_id, trace_content=request.trace_context, role="FastDeploy")
|
||||||
del request.trace_context
|
del request.trace_context
|
||||||
request_prompt_ids = None
|
request_prompt_ids = None
|
||||||
@@ -147,15 +158,20 @@ class OpenAIServingCompletion:
|
|||||||
else:
|
else:
|
||||||
raise ValueError("Prompt type must be one of: str, list[str], list[int], list[list[int]]")
|
raise ValueError("Prompt type must be one of: str, list[str], list[int], list[list[int]]")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"OpenAIServingCompletion create_completion: {e}, {str(traceback.format_exc())}"
|
error_msg = f"request[{request_id}] create_completion: {e}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
|
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
|
||||||
|
|
||||||
if request_prompt_ids is not None:
|
if request_prompt_ids is not None:
|
||||||
request_prompts = request_prompt_ids
|
request_prompts = request_prompt_ids
|
||||||
|
|
||||||
num_choices = len(request_prompts) * (1 if request.n is None else request.n)
|
num_choices = len(request_prompts) * (1 if request.n is None else request.n)
|
||||||
api_server_logger.info(f"Start preprocessing request: req_id={request_id}), num_choices={num_choices}")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="Start preprocessing request: req_id={request_id}), num_choices={num_choices}",
|
||||||
|
request_id=request_id,
|
||||||
|
num_choices=num_choices,
|
||||||
|
)
|
||||||
prompt_batched_token_ids = []
|
prompt_batched_token_ids = []
|
||||||
prompt_tokens_list = []
|
prompt_tokens_list = []
|
||||||
max_tokens_list = []
|
max_tokens_list = []
|
||||||
@@ -169,7 +185,7 @@ class OpenAIServingCompletion:
|
|||||||
f"OpenAIServingCompletion waiting error: {e}, {str(traceback.format_exc())}, "
|
f"OpenAIServingCompletion waiting error: {e}, {str(traceback.format_exc())}, "
|
||||||
f"max waiting time: {self.max_waiting_time}"
|
f"max waiting time: {self.max_waiting_time}"
|
||||||
)
|
)
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message="request[{request_id}] {error}", request_id=request_id, error=error_msg)
|
||||||
return ErrorResponse(
|
return ErrorResponse(
|
||||||
error=ErrorInfo(message=error_msg, code=ErrorCode.TIMEOUT, type=ErrorType.TIMEOUT_ERROR)
|
error=ErrorInfo(message=error_msg, code=ErrorCode.TIMEOUT, type=ErrorType.TIMEOUT_ERROR)
|
||||||
)
|
)
|
||||||
@@ -188,14 +204,19 @@ class OpenAIServingCompletion:
|
|||||||
max_tokens_list.append(current_req_dict.get("max_tokens"))
|
max_tokens_list.append(current_req_dict.get("max_tokens"))
|
||||||
del current_req_dict
|
del current_req_dict
|
||||||
except ParameterError as e:
|
except ParameterError as e:
|
||||||
api_server_logger.error(f"OpenAIServingCompletion format error: {e}, {e.message}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] format error: {error}, {error_message}",
|
||||||
|
request_id=request_id,
|
||||||
|
error=e,
|
||||||
|
error_message=e.message,
|
||||||
|
)
|
||||||
self.engine_client.semaphore.release()
|
self.engine_client.semaphore.release()
|
||||||
return ErrorResponse(
|
return ErrorResponse(
|
||||||
error=ErrorInfo(code="400", message=str(e.message), type="invalid_request", param=e.param)
|
error=ErrorInfo(code="400", message=str(e.message), type="invalid_request", param=e.param)
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"OpenAIServingCompletion format error: {e}, {str(traceback.format_exc())}"
|
error_msg = f"request[{request_id}] format error: {e}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
self.engine_client.semaphore.release()
|
self.engine_client.semaphore.release()
|
||||||
return ErrorResponse(
|
return ErrorResponse(
|
||||||
error=ErrorInfo(message=str(e), code=ErrorCode.INVALID_VALUE, type=ErrorType.INVALID_REQUEST_ERROR)
|
error=ErrorInfo(message=str(e), code=ErrorCode.INVALID_VALUE, type=ErrorType.INVALID_REQUEST_ERROR)
|
||||||
@@ -226,20 +247,20 @@ class OpenAIServingCompletion:
|
|||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = (
|
error_msg = (
|
||||||
f"OpenAIServingCompletion completion_full_generator error: {e}, {str(traceback.format_exc())}"
|
f"request[{request_id}] completion_full_generator error: {e}, {str(traceback.format_exc())}"
|
||||||
)
|
)
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
|
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
|
||||||
except asyncio.CancelledError as e:
|
except asyncio.CancelledError as e:
|
||||||
await self.engine_client.abort(f"{request_id}_0", num_choices)
|
await self.engine_client.abort(f"{request_id}_0", num_choices)
|
||||||
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
|
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return ErrorResponse(
|
return ErrorResponse(
|
||||||
error=ErrorInfo(message=error_msg, type=ErrorType.INVALID_REQUEST_ERROR, code=ErrorCode.CLIENT_ABORTED)
|
error=ErrorInfo(message=error_msg, type=ErrorType.INVALID_REQUEST_ERROR, code=ErrorCode.CLIENT_ABORTED)
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"OpenAIServingCompletion create_completion error: {e}, {str(traceback.format_exc())}"
|
error_msg = f"request[{request_id}] create_completion error: {e}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
|
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
|
||||||
|
|
||||||
async def completion_full_generator(
|
async def completion_full_generator(
|
||||||
@@ -368,10 +389,16 @@ class OpenAIServingCompletion:
|
|||||||
prompt_tokens_list=prompt_tokens_list,
|
prompt_tokens_list=prompt_tokens_list,
|
||||||
max_tokens_list=max_tokens_list,
|
max_tokens_list=max_tokens_list,
|
||||||
)
|
)
|
||||||
api_server_logger.info(f"Completion response: {res.model_dump_json()}")
|
log_request(
|
||||||
|
RequestLogLevel.CONTENT, message="Completion response: {response}", response=res.model_dump_json()
|
||||||
|
)
|
||||||
return res
|
return res
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
|
log_request_error(
|
||||||
|
message="request[{request_id}] error in completion_full_generator: {error}",
|
||||||
|
request_id=request_id,
|
||||||
|
error=e,
|
||||||
|
)
|
||||||
finally:
|
finally:
|
||||||
trace_print(LoggingEventName.POSTPROCESSING_END, request_id, getattr(request, "user", ""))
|
trace_print(LoggingEventName.POSTPROCESSING_END, request_id, getattr(request, "user", ""))
|
||||||
tracing.trace_req_finish(request_id)
|
tracing.trace_req_finish(request_id)
|
||||||
@@ -514,8 +541,11 @@ class OpenAIServingCompletion:
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
|
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
|
||||||
api_server_logger.info(
|
log_request(
|
||||||
f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}"
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Completion Streaming response send_idx 0: request_id={request_id}, completion_tokens={completion_tokens}",
|
||||||
|
request_id=request_id,
|
||||||
|
completion_tokens=0,
|
||||||
)
|
)
|
||||||
first_iteration[idx] = False
|
first_iteration[idx] = False
|
||||||
|
|
||||||
@@ -592,8 +622,11 @@ class OpenAIServingCompletion:
|
|||||||
if send_idx == 0 and not request.return_token_ids:
|
if send_idx == 0 and not request.return_token_ids:
|
||||||
chunk_temp = chunk
|
chunk_temp = chunk
|
||||||
chunk_temp.choices = choices
|
chunk_temp.choices = choices
|
||||||
api_server_logger.info(
|
log_request(
|
||||||
f"Completion Streaming response send_idx 0: {chunk_temp.model_dump_json()}"
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Completion Streaming response send_idx 0: request_id={request_id}, completion_tokens={completion_tokens}",
|
||||||
|
request_id=request_id,
|
||||||
|
completion_tokens=output_tokens[idx],
|
||||||
)
|
)
|
||||||
del chunk_temp
|
del chunk_temp
|
||||||
|
|
||||||
@@ -646,14 +679,26 @@ class OpenAIServingCompletion:
|
|||||||
metrics=res["metrics"] if request.collect_metrics else None,
|
metrics=res["metrics"] if request.collect_metrics else None,
|
||||||
)
|
)
|
||||||
yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
|
yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
|
||||||
api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Completion Streaming response last send: request_id={request_id}, finish_reason={finish_reason}, completion_tokens={completion_tokens}, logprobs={logprobs}",
|
||||||
|
request_id=request_id,
|
||||||
|
finish_reason=chunk.choices[-1].finish_reason if chunk.choices else None,
|
||||||
|
completion_tokens=output_tokens[idx],
|
||||||
|
logprobs=logprobs_res,
|
||||||
|
)
|
||||||
|
|
||||||
except asyncio.CancelledError as e:
|
except asyncio.CancelledError as e:
|
||||||
await self.engine_client.abort(f"{request_id}_0", num_choices)
|
await self.engine_client.abort(f"{request_id}_0", num_choices)
|
||||||
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
|
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
api_server_logger.error(f"Error in completion_stream_generator: {e}, {str(traceback.format_exc())}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] error in completion_stream_generator: {error}, {traceback}",
|
||||||
|
request_id=request_id,
|
||||||
|
error=e,
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
yield f"data: {ErrorResponse(error=ErrorInfo(message=str(e), code='400', type=ErrorType.INTERNAL_ERROR)).model_dump_json(exclude_unset=True)}\n\n"
|
yield f"data: {ErrorResponse(error=ErrorInfo(message=str(e), code='400', type=ErrorType.INTERNAL_ERROR)).model_dump_json(exclude_unset=True)}\n\n"
|
||||||
finally:
|
finally:
|
||||||
trace_print(LoggingEventName.POSTPROCESSING_END, request_id, getattr(request, "user", ""))
|
trace_print(LoggingEventName.POSTPROCESSING_END, request_id, getattr(request, "user", ""))
|
||||||
@@ -887,7 +932,11 @@ class OpenAIServingCompletion:
|
|||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
api_server_logger.error(f"Error in _build_logprobs_response: {str(e)}, {str(traceback.format_exc())}")
|
log_request_error(
|
||||||
|
message="Error in _build_logprobs_response: {error}, {traceback}",
|
||||||
|
error=str(e),
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def _build_prompt_logprobs(
|
def _build_prompt_logprobs(
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ from fastdeploy.entrypoints.openai.protocol import (
|
|||||||
UsageInfo,
|
UsageInfo,
|
||||||
)
|
)
|
||||||
from fastdeploy.entrypoints.openai.serving_engine import ServeContext, ZmqOpenAIServing
|
from fastdeploy.entrypoints.openai.serving_engine import ServeContext, ZmqOpenAIServing
|
||||||
from fastdeploy.utils import api_server_logger
|
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
|
||||||
|
|
||||||
|
|
||||||
def _get_embedding(
|
def _get_embedding(
|
||||||
@@ -140,7 +140,12 @@ class OpenAIServingEmbedding(ZmqOpenAIServing):
|
|||||||
@override
|
@override
|
||||||
def _build_response(self, ctx: ServeContext, request_output: dict):
|
def _build_response(self, ctx: ServeContext, request_output: dict):
|
||||||
"""Generate final embedding response"""
|
"""Generate final embedding response"""
|
||||||
api_server_logger.info(f"[{ctx.request_id}] Embedding RequestOutput received:{request_output}")
|
log_request(
|
||||||
|
level=RequestLogLevel.CONTENT,
|
||||||
|
message="[{request_id}] Embedding RequestOutput received:{request_output}",
|
||||||
|
request_id=ctx.request_id,
|
||||||
|
request_output=request_output,
|
||||||
|
)
|
||||||
|
|
||||||
base = PoolingRequestOutput.from_dict(request_output)
|
base = PoolingRequestOutput.from_dict(request_output)
|
||||||
embedding_res = EmbeddingRequestOutput.from_base(base)
|
embedding_res = EmbeddingRequestOutput.from_base(base)
|
||||||
|
|||||||
@@ -33,6 +33,11 @@ from fastdeploy.entrypoints.openai.protocol import (
|
|||||||
InvalidParameterException,
|
InvalidParameterException,
|
||||||
)
|
)
|
||||||
from fastdeploy.envs import FD_SUPPORT_MAX_CONNECTIONS
|
from fastdeploy.envs import FD_SUPPORT_MAX_CONNECTIONS
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
from fastdeploy.utils import ErrorCode, ErrorType, StatefulSemaphore, api_server_logger
|
from fastdeploy.utils import ErrorCode, ErrorType, StatefulSemaphore, api_server_logger
|
||||||
|
|
||||||
RequestT = TypeVar("RequestT")
|
RequestT = TypeVar("RequestT")
|
||||||
@@ -96,13 +101,18 @@ class OpenAIServing(ABC, Generic[RequestT]):
|
|||||||
is_supported, adjusted_name = self.models.is_supported_model(model_name)
|
is_supported, adjusted_name = self.models.is_supported_model(model_name)
|
||||||
if not is_supported:
|
if not is_supported:
|
||||||
err_msg = f"Unsupported model: [{model_name}]"
|
err_msg = f"Unsupported model: [{model_name}]"
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(message=err_msg)
|
||||||
return is_supported, adjusted_name
|
return is_supported, adjusted_name
|
||||||
|
|
||||||
async def _acquire_semaphore(self, request_id: str) -> bool:
|
async def _acquire_semaphore(self, request_id: str) -> bool:
|
||||||
"""Acquire engine client semaphore with timeout"""
|
"""Acquire engine client semaphore with timeout"""
|
||||||
try:
|
try:
|
||||||
api_server_logger.info(f"Acquire request:{request_id} status:{self._get_semaphore().status()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.STAGES,
|
||||||
|
message="Acquire request:{request_id} status:{status}",
|
||||||
|
request_id=request_id,
|
||||||
|
status=self._get_semaphore().status(),
|
||||||
|
)
|
||||||
if self.max_waiting_time < 0:
|
if self.max_waiting_time < 0:
|
||||||
await self._get_semaphore().acquire()
|
await self._get_semaphore().acquire()
|
||||||
else:
|
else:
|
||||||
@@ -111,13 +121,18 @@ class OpenAIServing(ABC, Generic[RequestT]):
|
|||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
self._release_semaphore(request_id)
|
self._release_semaphore(request_id)
|
||||||
error_msg = f"Request waiting timeout, request:{request_id} max waiting time:{self.max_waiting_time}"
|
error_msg = f"Request waiting timeout, request:{request_id} max waiting time:{self.max_waiting_time}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _release_semaphore(self, request_id: str) -> None:
|
def _release_semaphore(self, request_id: str) -> None:
|
||||||
"""Release engine client semaphore"""
|
"""Release engine client semaphore"""
|
||||||
self._get_semaphore().release()
|
self._get_semaphore().release()
|
||||||
api_server_logger.info(f"Release request:{request_id} status:{self._get_semaphore().status()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.STAGES,
|
||||||
|
message="Release request:{request_id} status:{status}",
|
||||||
|
request_id=request_id,
|
||||||
|
status=self._get_semaphore().status(),
|
||||||
|
)
|
||||||
|
|
||||||
def _create_error_response(
|
def _create_error_response(
|
||||||
self,
|
self,
|
||||||
@@ -128,7 +143,7 @@ class OpenAIServing(ABC, Generic[RequestT]):
|
|||||||
) -> ErrorResponse:
|
) -> ErrorResponse:
|
||||||
"""Create standardized error response"""
|
"""Create standardized error response"""
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
api_server_logger.error(message)
|
log_request_error(message=message)
|
||||||
return ErrorResponse(error=ErrorInfo(message=message, type=error_type, code=code, param=param))
|
return ErrorResponse(error=ErrorInfo(message=message, type=error_type, code=code, param=param))
|
||||||
|
|
||||||
def _generate_request_id(self, request: RequestT) -> str:
|
def _generate_request_id(self, request: RequestT) -> str:
|
||||||
@@ -193,7 +208,12 @@ class OpenAIServing(ABC, Generic[RequestT]):
|
|||||||
|
|
||||||
request_id = self._generate_request_id(request)
|
request_id = self._generate_request_id(request)
|
||||||
ctx.request_id = request_id
|
ctx.request_id = request_id
|
||||||
api_server_logger.info(f"Initialize request {request_id}: {request}")
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Initialize request {request_id}: {request}",
|
||||||
|
request_id=request_id,
|
||||||
|
request=request,
|
||||||
|
)
|
||||||
|
|
||||||
# Step 2: Semaphore acquisition
|
# Step 2: Semaphore acquisition
|
||||||
if not await self._acquire_semaphore(request_id):
|
if not await self._acquire_semaphore(request_id):
|
||||||
@@ -252,7 +272,12 @@ class ZmqOpenAIServing(OpenAIServing):
|
|||||||
request_dicts = self._request_to_batch_dicts(ctx)
|
request_dicts = self._request_to_batch_dicts(ctx)
|
||||||
ctx.preprocess_requests = request_dicts
|
ctx.preprocess_requests = request_dicts
|
||||||
for request_dict in request_dicts:
|
for request_dict in request_dicts:
|
||||||
api_server_logger.info(f"batch add request_id: {request_dict['request_id']}, request: {request_dict}")
|
log_request(
|
||||||
|
level=RequestLogLevel.CONTENT,
|
||||||
|
message="batch add request_id: {request_id}, request: {request}",
|
||||||
|
request_id=request_dict["request_id"],
|
||||||
|
request=request_dict,
|
||||||
|
)
|
||||||
await self.engine_client.format_and_add_data(request_dict)
|
await self.engine_client.format_and_add_data(request_dict)
|
||||||
|
|
||||||
def _process_chat_template_kwargs(self, request_dict):
|
def _process_chat_template_kwargs(self, request_dict):
|
||||||
@@ -283,7 +308,11 @@ class ZmqOpenAIServing(OpenAIServing):
|
|||||||
while num_choices > 0:
|
while num_choices > 0:
|
||||||
request_output_dicts = await asyncio.wait_for(request_output_queue.get(), timeout=60)
|
request_output_dicts = await asyncio.wait_for(request_output_queue.get(), timeout=60)
|
||||||
for request_output_dict in request_output_dicts:
|
for request_output_dict in request_output_dicts:
|
||||||
api_server_logger.debug(f"Received RequestOutput: {request_output_dict}")
|
log_request(
|
||||||
|
level=RequestLogLevel.FULL,
|
||||||
|
message="Received RequestOutput: {request_output}",
|
||||||
|
request_output=request_output_dict,
|
||||||
|
)
|
||||||
if request_output_dict["finished"] is True:
|
if request_output_dict["finished"] is True:
|
||||||
num_choices -= 1
|
num_choices -= 1
|
||||||
yield request_output_dict
|
yield request_output_dict
|
||||||
@@ -301,7 +330,12 @@ class ZmqOpenAIServing(OpenAIServing):
|
|||||||
async def _acquire_semaphore(self, request_id: str) -> bool:
|
async def _acquire_semaphore(self, request_id: str) -> bool:
|
||||||
"""Acquire engine client semaphore with timeout"""
|
"""Acquire engine client semaphore with timeout"""
|
||||||
try:
|
try:
|
||||||
api_server_logger.info(f"Acquire request:{request_id} status:{self._get_semaphore().status()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.STAGES,
|
||||||
|
message="Acquire request:{request_id} status:{status}",
|
||||||
|
request_id=request_id,
|
||||||
|
status=self._get_semaphore().status(),
|
||||||
|
)
|
||||||
if self.max_waiting_time < 0:
|
if self.max_waiting_time < 0:
|
||||||
await self._get_semaphore().acquire()
|
await self._get_semaphore().acquire()
|
||||||
else:
|
else:
|
||||||
@@ -310,14 +344,19 @@ class ZmqOpenAIServing(OpenAIServing):
|
|||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
self._release_semaphore(request_id)
|
self._release_semaphore(request_id)
|
||||||
error_msg = f"Request waiting timeout, request:{request_id} max waiting time:{self.max_waiting_time}"
|
error_msg = f"Request waiting timeout, request:{request_id} max waiting time:{self.max_waiting_time}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def _release_semaphore(self, request_id: str) -> None:
|
def _release_semaphore(self, request_id: str) -> None:
|
||||||
"""Release engine client semaphore"""
|
"""Release engine client semaphore"""
|
||||||
self._get_semaphore().release()
|
self._get_semaphore().release()
|
||||||
api_server_logger.info(f"Release request:{request_id} status:{self._get_semaphore().status()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.STAGES,
|
||||||
|
message="Release request:{request_id} status:{status}",
|
||||||
|
request_id=request_id,
|
||||||
|
status=self._get_semaphore().status(),
|
||||||
|
)
|
||||||
|
|
||||||
@override
|
@override
|
||||||
def _check_master(self) -> bool:
|
def _check_master(self) -> bool:
|
||||||
|
|||||||
@@ -24,7 +24,8 @@ from fastdeploy.entrypoints.openai.protocol import (
|
|||||||
ModelList,
|
ModelList,
|
||||||
ModelPermission,
|
ModelPermission,
|
||||||
)
|
)
|
||||||
from fastdeploy.utils import ErrorType, api_server_logger, get_host_ip
|
from fastdeploy.logger.request_logger import log_request_error
|
||||||
|
from fastdeploy.utils import ErrorType, get_host_ip
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -86,7 +87,7 @@ class OpenAIServingModels:
|
|||||||
err_msg = (
|
err_msg = (
|
||||||
f"Only master node can accept models request, please send request to master node: {self.master_ip}"
|
f"Only master node can accept models request, please send request to master node: {self.master_ip}"
|
||||||
)
|
)
|
||||||
api_server_logger.error(err_msg)
|
log_request_error(message=err_msg)
|
||||||
return ErrorResponse(error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR))
|
return ErrorResponse(error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR))
|
||||||
model_infos = [
|
model_infos = [
|
||||||
ModelInfo(
|
ModelInfo(
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ from fastdeploy.entrypoints.openai.protocol import (
|
|||||||
UsageInfo,
|
UsageInfo,
|
||||||
)
|
)
|
||||||
from fastdeploy.entrypoints.openai.serving_engine import ServeContext, ZmqOpenAIServing
|
from fastdeploy.entrypoints.openai.serving_engine import ServeContext, ZmqOpenAIServing
|
||||||
from fastdeploy.utils import api_server_logger
|
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
|
||||||
|
|
||||||
|
|
||||||
class OpenAIServingReward(ZmqOpenAIServing):
|
class OpenAIServingReward(ZmqOpenAIServing):
|
||||||
@@ -77,7 +77,7 @@ class OpenAIServingReward(ZmqOpenAIServing):
|
|||||||
response: ChatRewardResponse = None
|
response: ChatRewardResponse = None
|
||||||
generators: AsyncGenerator[ChatRewardResponse, None] = self.handle(ctx)
|
generators: AsyncGenerator[ChatRewardResponse, None] = self.handle(ctx)
|
||||||
async for r in generators:
|
async for r in generators:
|
||||||
api_server_logger.info(f"engine pooling result:{r}")
|
log_request(RequestLogLevel.CONTENT, message="engine pooling result: {result}", result=r)
|
||||||
r.data[0].index = idx
|
r.data[0].index = idx
|
||||||
idx += 1
|
idx += 1
|
||||||
if response is None or isinstance(r, ErrorResponse):
|
if response is None or isinstance(r, ErrorResponse):
|
||||||
@@ -93,7 +93,12 @@ class OpenAIServingReward(ZmqOpenAIServing):
|
|||||||
@override
|
@override
|
||||||
def _build_response(self, ctx: ServeContext, request_output: dict):
|
def _build_response(self, ctx: ServeContext, request_output: dict):
|
||||||
"""Generate final reward response"""
|
"""Generate final reward response"""
|
||||||
api_server_logger.info(f"[{ctx.request_id}] Reward RequestOutput received:{request_output}")
|
log_request(
|
||||||
|
level=RequestLogLevel.CONTENT,
|
||||||
|
message="Reward RequestOutput received: request_id={request_id}, output={request_output}",
|
||||||
|
request_id=ctx.request_id,
|
||||||
|
request_output=request_output,
|
||||||
|
)
|
||||||
|
|
||||||
base = PoolingRequestOutput.from_dict(request_output)
|
base = PoolingRequestOutput.from_dict(request_output)
|
||||||
reward_res = RewardRequestOutput.from_base(base)
|
reward_res = RewardRequestOutput.from_base(base)
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import (
|
|||||||
ToolParser,
|
ToolParser,
|
||||||
ToolParserManager,
|
ToolParserManager,
|
||||||
)
|
)
|
||||||
|
from fastdeploy.logger.request_logger import log_request_error
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
|
||||||
@@ -184,7 +185,9 @@ class Ernie45VLThinkingToolParser(ToolParser):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if not function_call_arr:
|
if not function_call_arr:
|
||||||
data_processor_logger.error("No valid tool calls found")
|
log_request_error(
|
||||||
|
message="request[{request_id}] No valid tool calls found", request_id=request.request_id
|
||||||
|
)
|
||||||
return ExtractedToolCallInformation(tools_called=False, content=model_output)
|
return ExtractedToolCallInformation(tools_called=False, content=model_output)
|
||||||
|
|
||||||
tool_calls = []
|
tool_calls = []
|
||||||
@@ -226,7 +229,11 @@ class Ernie45VLThinkingToolParser(ToolParser):
|
|||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] Error in extracting tool call from response: {error}",
|
||||||
|
request_id=request.request_id,
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output)
|
return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output)
|
||||||
|
|
||||||
def extract_tool_calls_streaming(
|
def extract_tool_calls_streaming(
|
||||||
@@ -343,7 +350,11 @@ class Ernie45VLThinkingToolParser(ToolParser):
|
|||||||
)
|
)
|
||||||
return delta
|
return delta
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] Error in streaming tool call extraction: {error}",
|
||||||
|
request_id=request.get("request_id"),
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
if "</tool_call>" in self.buffer:
|
if "</tool_call>" in self.buffer:
|
||||||
end_pos = self.buffer.find("</tool_call>")
|
end_pos = self.buffer.find("</tool_call>")
|
||||||
@@ -354,5 +365,9 @@ class Ernie45VLThinkingToolParser(ToolParser):
|
|||||||
return delta
|
return delta
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] Error in streaming tool call extraction: {error}",
|
||||||
|
request_id=request.get("request_id"),
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
|
|||||||
@@ -41,6 +41,7 @@ from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import (
|
|||||||
ToolParser,
|
ToolParser,
|
||||||
ToolParserManager,
|
ToolParserManager,
|
||||||
)
|
)
|
||||||
|
from fastdeploy.logger.request_logger import log_request_error
|
||||||
from fastdeploy.utils import data_processor_logger as logger
|
from fastdeploy.utils import data_processor_logger as logger
|
||||||
|
|
||||||
|
|
||||||
@@ -254,8 +255,11 @@ class ErnieX1ToolParser(ToolParser):
|
|||||||
logger.debug("Skipping text %s - no arguments", delta_text)
|
logger.debug("Skipping text %s - no arguments", delta_text)
|
||||||
delta = None
|
delta = None
|
||||||
|
|
||||||
elif cur_arguments is None and prev_arguments is not None:
|
elif not cur_arguments and prev_arguments:
|
||||||
logger.error("should be impossible to have arguments reset " "mid-call. skipping streaming anything.")
|
log_request_error(
|
||||||
|
message="request[{request_id}] should be impossible to have arguments reset mid-call. skipping streaming anything.",
|
||||||
|
request_id=request.request_id,
|
||||||
|
)
|
||||||
delta = None
|
delta = None
|
||||||
|
|
||||||
elif cur_arguments is not None and prev_arguments is None:
|
elif cur_arguments is not None and prev_arguments is None:
|
||||||
|
|||||||
@@ -45,8 +45,12 @@ from fastdeploy.entrypoints.openai.v1.serving_base import (
|
|||||||
ServingResponseContext,
|
ServingResponseContext,
|
||||||
)
|
)
|
||||||
from fastdeploy.input.tokenizer_client import AsyncTokenizerClient, ImageDecodeRequest
|
from fastdeploy.input.tokenizer_client import AsyncTokenizerClient, ImageDecodeRequest
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
from fastdeploy.metrics.metrics import main_process_metrics
|
from fastdeploy.metrics.metrics import main_process_metrics
|
||||||
from fastdeploy.utils import api_server_logger
|
|
||||||
from fastdeploy.worker.output import LogprobsLists
|
from fastdeploy.worker.output import LogprobsLists
|
||||||
|
|
||||||
|
|
||||||
@@ -178,7 +182,7 @@ class OpenAIServingChat(OpenAiServingBase):
|
|||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"Error in _build_logprobs_response: {e}, {str(traceback.format_exc())}"
|
error_msg = f"Error in _build_logprobs_response: {e}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@override
|
@override
|
||||||
@@ -302,7 +306,14 @@ class OpenAIServingChat(OpenAiServingBase):
|
|||||||
max_tokens = request.max_completion_tokens or request.max_tokens
|
max_tokens = request.max_completion_tokens or request.max_tokens
|
||||||
choice_completion_tokens = response_ctx.choice_completion_tokens_dict[output.index]
|
choice_completion_tokens = response_ctx.choice_completion_tokens_dict[output.index]
|
||||||
choice.finish_reason = self._calc_finish_reason(request_output, max_tokens, choice_completion_tokens)
|
choice.finish_reason = self._calc_finish_reason(request_output, max_tokens, choice_completion_tokens)
|
||||||
api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Chat Streaming response last send: request_id={request_id}, finish_reason={finish_reason}, completion_tokens={completion_tokens}, logprobs={logprobs}",
|
||||||
|
request_id=request_id,
|
||||||
|
finish_reason=choice.finish_reason,
|
||||||
|
completion_tokens=choice_completion_tokens,
|
||||||
|
logprobs=choice.logprobs,
|
||||||
|
)
|
||||||
|
|
||||||
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
|
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
|
||||||
if request_output.finished and response_ctx.remain_choices == 0:
|
if request_output.finished and response_ctx.remain_choices == 0:
|
||||||
@@ -339,7 +350,11 @@ class OpenAIServingChat(OpenAiServingBase):
|
|||||||
res = ChatCompletionResponse(
|
res = ChatCompletionResponse(
|
||||||
id=ctx.request_id, model=request.model, choices=choices, created=ctx.created_time, usage=response_ctx.usage
|
id=ctx.request_id, model=request.model, choices=choices, created=ctx.created_time, usage=response_ctx.usage
|
||||||
)
|
)
|
||||||
api_server_logger.info(f"Chat response: {res.model_dump_json()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.CONTENT,
|
||||||
|
message="Chat response: {response}",
|
||||||
|
response=res.model_dump_json(),
|
||||||
|
)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
async def _create_chat_completion_choice(
|
async def _create_chat_completion_choice(
|
||||||
|
|||||||
@@ -38,7 +38,12 @@ from fastdeploy.entrypoints.openai.v1.serving_base import (
|
|||||||
ServeContext,
|
ServeContext,
|
||||||
ServingResponseContext,
|
ServingResponseContext,
|
||||||
)
|
)
|
||||||
from fastdeploy.utils import ErrorType, api_server_logger
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
|
from fastdeploy.utils import ErrorType
|
||||||
from fastdeploy.worker.output import LogprobsLists
|
from fastdeploy.worker.output import LogprobsLists
|
||||||
|
|
||||||
|
|
||||||
@@ -94,7 +99,7 @@ class OpenAIServingCompletion(OpenAiServingBase):
|
|||||||
raise ValueError("Prompt type must be one of: str, list[str], list[int], list[list[int]]")
|
raise ValueError("Prompt type must be one of: str, list[str], list[int], list[list[int]]")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_msg = f"OpenAIServingCompletion create_completion: {e}, {str(traceback.format_exc())}"
|
error_msg = f"OpenAIServingCompletion create_completion: {e}, {str(traceback.format_exc())}"
|
||||||
api_server_logger.error(error_msg)
|
log_request_error(message=error_msg)
|
||||||
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
|
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
|
||||||
|
|
||||||
if request_prompt_ids is not None:
|
if request_prompt_ids is not None:
|
||||||
@@ -199,7 +204,11 @@ class OpenAIServingCompletion(OpenAiServingBase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
api_server_logger.error(f"Error in _build_logprobs_response: {str(e)}, {str(traceback.format_exc())}")
|
log_request_error(
|
||||||
|
message="Error in _build_logprobs_response: {error}, {traceback}",
|
||||||
|
error=str(e),
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
async def _build_stream_response(
|
async def _build_stream_response(
|
||||||
@@ -271,9 +280,21 @@ class OpenAIServingCompletion(OpenAiServingBase):
|
|||||||
choice.finish_reason = self._calc_finish_reason(
|
choice.finish_reason = self._calc_finish_reason(
|
||||||
request_output, request.max_tokens, choice_completion_tokens
|
request_output, request.max_tokens, choice_completion_tokens
|
||||||
)
|
)
|
||||||
api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Completion Streaming response last send: request_id={request_id}, finish_reason={finish_reason}, completion_tokens={completion_tokens}, logprobs={logprobs}",
|
||||||
|
request_id=request_id,
|
||||||
|
finish_reason=choice.finish_reason,
|
||||||
|
completion_tokens=choice_completion_tokens,
|
||||||
|
logprobs=choice.logprobs,
|
||||||
|
)
|
||||||
if send_idx == 0 and not request.return_token_ids:
|
if send_idx == 0 and not request.return_token_ids:
|
||||||
api_server_logger.info(f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.LIFECYCLE,
|
||||||
|
message="Completion Streaming response send_idx 0: request_id={request_id}, completion_tokens={completion_tokens}",
|
||||||
|
request_id=request_id,
|
||||||
|
completion_tokens=response_ctx.choice_completion_tokens_dict[output.index],
|
||||||
|
)
|
||||||
yield f"data: {chunk.model_dump_json()}\n\n"
|
yield f"data: {chunk.model_dump_json()}\n\n"
|
||||||
if request_output.finished and response_ctx.remain_choices == 0:
|
if request_output.finished and response_ctx.remain_choices == 0:
|
||||||
if include_usage:
|
if include_usage:
|
||||||
@@ -287,7 +308,12 @@ class OpenAIServingCompletion(OpenAiServingBase):
|
|||||||
yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
|
yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
|
||||||
yield "data: [DONE]\n\n"
|
yield "data: [DONE]\n\n"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
api_server_logger.error(f"Error in completion_stream_generator: {e}, {str(traceback.format_exc())}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] Error in completion_stream_generator: {error}, {traceback}",
|
||||||
|
request_id=request_id,
|
||||||
|
error=e,
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
yield f"data: {ErrorResponse(error=ErrorInfo(message=str(e), code='400', type=ErrorType.INTERNAL_ERROR)).model_dump_json(exclude_unset=True)}\n\n"
|
yield f"data: {ErrorResponse(error=ErrorInfo(message=str(e), code='400', type=ErrorType.INTERNAL_ERROR)).model_dump_json(exclude_unset=True)}\n\n"
|
||||||
|
|
||||||
async def _build_full_response(
|
async def _build_full_response(
|
||||||
@@ -321,10 +347,18 @@ class OpenAIServingCompletion(OpenAiServingBase):
|
|||||||
choices=choices,
|
choices=choices,
|
||||||
usage=response_ctx.usage,
|
usage=response_ctx.usage,
|
||||||
)
|
)
|
||||||
api_server_logger.info(f"Completion response: {res.model_dump_json()}")
|
log_request(
|
||||||
|
level=RequestLogLevel.FULL,
|
||||||
|
message="Completion response: {response}",
|
||||||
|
response=res.model_dump_json(),
|
||||||
|
)
|
||||||
return res
|
return res
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
|
log_request_error(
|
||||||
|
message="request[{request_id}] Error in completion_full_generator: {error}",
|
||||||
|
request_id=ctx.request_id,
|
||||||
|
error=e,
|
||||||
|
)
|
||||||
return self._create_error_response(str(e))
|
return self._create_error_response(str(e))
|
||||||
|
|
||||||
def build_completion_choice(
|
def build_completion_choice(
|
||||||
|
|||||||
+1
-1
@@ -43,7 +43,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# Request logging master switch. Set to 0 to disable request logging.
|
# Request logging master switch. Set to 0 to disable request logging.
|
||||||
"FD_LOG_REQUESTS": lambda: int(os.getenv("FD_LOG_REQUESTS", "1")),
|
"FD_LOG_REQUESTS": lambda: int(os.getenv("FD_LOG_REQUESTS", "1")),
|
||||||
# Request logging detail level (0-3). Higher level means more verbose output.
|
# Request logging detail level (0-3). Higher level means more verbose output.
|
||||||
"FD_LOG_REQUESTS_LEVEL": lambda: int(os.getenv("FD_LOG_REQUESTS_LEVEL", "0")),
|
"FD_LOG_REQUESTS_LEVEL": lambda: int(os.getenv("FD_LOG_REQUESTS_LEVEL", "2")),
|
||||||
# Max field length for request logging truncation.
|
# Max field length for request logging truncation.
|
||||||
"FD_LOG_MAX_LEN": lambda: int(os.getenv("FD_LOG_MAX_LEN", "2048")),
|
"FD_LOG_MAX_LEN": lambda: int(os.getenv("FD_LOG_MAX_LEN", "2048")),
|
||||||
# Unified trace mode: off, local, otel, all.
|
# Unified trace mode: off, local, otel, all.
|
||||||
|
|||||||
@@ -55,6 +55,7 @@ from paddleformers.transformers import Llama3Tokenizer, LlamaTokenizer
|
|||||||
|
|
||||||
from fastdeploy import envs
|
from fastdeploy import envs
|
||||||
from fastdeploy.input.utils import process_stop_token_ids
|
from fastdeploy.input.utils import process_stop_token_ids
|
||||||
|
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
_SAMPLING_EPS = 1e-5
|
_SAMPLING_EPS = 1e-5
|
||||||
@@ -164,7 +165,12 @@ class BaseTextProcessor(ABC):
|
|||||||
req_id = request.get("request_id", None) if isinstance(request, dict) else None
|
req_id = request.get("request_id", None) if isinstance(request, dict) else None
|
||||||
tokens = self.tokenizer.tokenize(spliced_message)
|
tokens = self.tokenizer.tokenize(spliced_message)
|
||||||
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
|
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
|
||||||
data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
|
log_request(
|
||||||
|
level=1,
|
||||||
|
message="req_id:{req_id}, token_ids: {token_ids}",
|
||||||
|
req_id=req_id,
|
||||||
|
token_ids=token_ids,
|
||||||
|
)
|
||||||
return token_ids
|
return token_ids
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
@@ -359,7 +365,7 @@ class BaseTextProcessor(ABC):
|
|||||||
|
|
||||||
def process_request_dict(self, request, max_model_len=None, **kwargs):
|
def process_request_dict(self, request, max_model_len=None, **kwargs):
|
||||||
"""Unified request pre-processing shared by all processors."""
|
"""Unified request pre-processing shared by all processors."""
|
||||||
data_processor_logger.info(f"Start processing request dict: {request}")
|
log_request(RequestLogLevel.CONTENT, message="Start processing request dict: {request}", request=request)
|
||||||
request = self._apply_default_parameters(request)
|
request = self._apply_default_parameters(request)
|
||||||
if not request.get("eos_token_ids"):
|
if not request.get("eos_token_ids"):
|
||||||
request["eos_token_ids"] = self.eos_token_ids
|
request["eos_token_ids"] = self.eos_token_ids
|
||||||
@@ -440,7 +446,7 @@ class BaseTextProcessor(ABC):
|
|||||||
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
||||||
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
||||||
|
|
||||||
data_processor_logger.info(f"Processed request dict: {request}")
|
log_request(RequestLogLevel.CONTENT, message="Processed request dict: {request}", request=request)
|
||||||
return request
|
return request
|
||||||
|
|
||||||
def _apply_reasoning_parser(self, request):
|
def _apply_reasoning_parser(self, request):
|
||||||
@@ -481,7 +487,12 @@ class BaseTextProcessor(ABC):
|
|||||||
if seq != self.tokenizer.eos_token_id:
|
if seq != self.tokenizer.eos_token_id:
|
||||||
stop_seqs.append(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(seq)))
|
stop_seqs.append(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(seq)))
|
||||||
stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, pad_id=-1, return_seq_len=True, return_array=False)
|
stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, pad_id=-1, return_seq_len=True, return_array=False)
|
||||||
data_processor_logger.debug(f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}")
|
log_request(
|
||||||
|
level=3,
|
||||||
|
message="processed stop_seqs: {stop_seqs}, {stop_seqs_len}",
|
||||||
|
stop_seqs=stop_seqs,
|
||||||
|
stop_seqs_len=stop_seqs_len,
|
||||||
|
)
|
||||||
return stop_seqs, stop_seqs_len
|
return stop_seqs, stop_seqs_len
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
@@ -607,14 +618,20 @@ class BaseTextProcessor(ABC):
|
|||||||
prompt_token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(prompt))
|
prompt_token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(prompt))
|
||||||
if len(prompt_token_ids) != 1:
|
if len(prompt_token_ids) != 1:
|
||||||
if not add_prefix_space:
|
if not add_prefix_space:
|
||||||
data_processor_logger.warning(
|
log_request(
|
||||||
f"bad_words: '{prompt}' tokenises to {len(prompt_token_ids)} tokens, skipping"
|
level=1,
|
||||||
|
message="bad_words: '{prompt}' tokenises to {num_tokens} tokens, skipping",
|
||||||
|
prompt=prompt,
|
||||||
|
num_tokens=len(prompt_token_ids),
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
if prompt_token_ids[0] > self.tokenizer.vocab_size:
|
if prompt_token_ids[0] > self.tokenizer.vocab_size:
|
||||||
if not add_prefix_space:
|
if not add_prefix_space:
|
||||||
data_processor_logger.warning(
|
log_request(
|
||||||
f"bad_words: '{prompt}' token id {prompt_token_ids[0]} > vocab_size, skipping"
|
level=1,
|
||||||
|
message="bad_words: '{prompt}' token id {token_id} > vocab_size, skipping",
|
||||||
|
prompt=prompt,
|
||||||
|
token_id=prompt_token_ids[0],
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
if prompt_token_ids not in token_ids:
|
if prompt_token_ids not in token_ids:
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ from paddleformers.generation import GenerationConfig
|
|||||||
|
|
||||||
from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
|
from fastdeploy.input.ernie4_5_processor import Ernie4_5Processor
|
||||||
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
|
from fastdeploy.input.utils import IDS_TYPE_FLAG, process_stop_token_ids
|
||||||
|
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
from .process import DataProcessor
|
from .process import DataProcessor
|
||||||
@@ -294,7 +295,7 @@ class Ernie4_5_VLProcessor(Ernie4_5Processor):
|
|||||||
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
if request.get("response_max_tokens") is not None and request.get("enable_thinking") is False:
|
||||||
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
request["max_tokens"] = min(request["response_max_tokens"], request["max_tokens"])
|
||||||
|
|
||||||
data_processor_logger.info(f"Processed request {request}")
|
log_request(RequestLogLevel.CONTENT, message="Processed request: {request}", request=request)
|
||||||
return request
|
return request
|
||||||
|
|
||||||
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
|
def append_completion_tokens(self, multimodal_inputs, completion_token_ids):
|
||||||
|
|||||||
@@ -17,6 +17,7 @@
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from fastdeploy.input.text_processor import DataProcessor as TextProcessor
|
from fastdeploy.input.text_processor import DataProcessor as TextProcessor
|
||||||
|
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
from .process import DataProcessor
|
from .process import DataProcessor
|
||||||
@@ -261,7 +262,7 @@ class Qwen3VLProcessor(TextProcessor):
|
|||||||
request["max_tokens"] = max(1, max_tokens)
|
request["max_tokens"] = max(1, max_tokens)
|
||||||
else:
|
else:
|
||||||
request["max_tokens"] = min(max_tokens, request["max_tokens"])
|
request["max_tokens"] = min(max_tokens, request["max_tokens"])
|
||||||
data_processor_logger.info(f"Processed request {request}")
|
log_request(RequestLogLevel.CONTENT, message="Processed request: {request}", request=request)
|
||||||
|
|
||||||
return request
|
return request
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ import numpy as np
|
|||||||
|
|
||||||
from fastdeploy.input.text_processor import DataProcessor as TextProcessor
|
from fastdeploy.input.text_processor import DataProcessor as TextProcessor
|
||||||
from fastdeploy.input.utils import process_stop_token_ids
|
from fastdeploy.input.utils import process_stop_token_ids
|
||||||
|
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
from .process import DataProcessor
|
from .process import DataProcessor
|
||||||
@@ -262,7 +263,7 @@ class QwenVLProcessor(TextProcessor):
|
|||||||
else:
|
else:
|
||||||
self.model_status_dict[request["request_id"]] = model_status
|
self.model_status_dict[request["request_id"]] = model_status
|
||||||
request["enable_thinking"] = model_status == "think_start"
|
request["enable_thinking"] = model_status == "think_start"
|
||||||
data_processor_logger.info(f"Processed request {request}")
|
log_request(RequestLogLevel.CONTENT, message="Processed request: {request}", request=request)
|
||||||
|
|
||||||
return request
|
return request
|
||||||
|
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ from typing import Any, Optional, Union
|
|||||||
import httpx
|
import httpx
|
||||||
from pydantic import BaseModel, HttpUrl
|
from pydantic import BaseModel, HttpUrl
|
||||||
|
|
||||||
|
from fastdeploy.logger.request_logger import log_request, log_request_error
|
||||||
from fastdeploy.utils import data_processor_logger
|
from fastdeploy.utils import data_processor_logger
|
||||||
|
|
||||||
|
|
||||||
@@ -161,7 +162,12 @@ class AsyncTokenizerClient:
|
|||||||
|
|
||||||
except httpx.RequestError as e:
|
except httpx.RequestError as e:
|
||||||
# Network error, keep polling
|
# Network error, keep polling
|
||||||
data_processor_logger.debug(f"Request error while polling tokenize task {task_tag}: {e}")
|
log_request(
|
||||||
|
level=3,
|
||||||
|
message="Request error while polling tokenize task {task_tag}: {error}",
|
||||||
|
task_tag=task_tag,
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
|
|
||||||
# 超时检测
|
# 超时检测
|
||||||
if asyncio.get_event_loop().time() - start_time > self.max_wait:
|
if asyncio.get_event_loop().time() - start_time > self.max_wait:
|
||||||
@@ -193,10 +199,15 @@ class AsyncTokenizerClient:
|
|||||||
raise RuntimeError(f"Tokenize task creation failed, {resp.json().get('message')}")
|
raise RuntimeError(f"Tokenize task creation failed, {resp.json().get('message')}")
|
||||||
return resp.json().get("result")
|
return resp.json().get("result")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
data_processor_logger.error(f"Attempt to decode_request {attempt + 1} failed: {e}")
|
log_request_error(
|
||||||
|
message="Attempt to decode_request {attempt} failed: {error}",
|
||||||
|
attempt=attempt + 1,
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
if attempt == self.max_retries - 1:
|
if attempt == self.max_retries - 1:
|
||||||
data_processor_logger.error(
|
log_request_error(
|
||||||
f"Max retries of decode_request reached. Giving up. request is {request}"
|
message="Max retries of decode_request reached. Giving up. request is {request}",
|
||||||
|
request=str(request),
|
||||||
)
|
)
|
||||||
await asyncio.sleep(1)
|
await asyncio.sleep(1)
|
||||||
except httpx.RequestError as e:
|
except httpx.RequestError as e:
|
||||||
|
|||||||
@@ -0,0 +1,124 @@
|
|||||||
|
"""
|
||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
"""
|
||||||
|
FastDeploy Logging Module
|
||||||
|
|
||||||
|
Unified logging management module providing:
|
||||||
|
- Formatters: ColoredFormatter, CustomFormatter
|
||||||
|
- Handlers: DailyRotatingFileHandler, IntervalRotatingFileHandler, LazyFileHandler
|
||||||
|
- Logger: FastDeployLogger
|
||||||
|
- Request Logger: log_request, log_request_error, RequestLogLevel
|
||||||
|
- Setup: setup_logging
|
||||||
|
- Pre-defined logger instances (lazy-loaded)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from fastdeploy.logger import get_logger, llm_logger, console_logger
|
||||||
|
from fastdeploy.logger import log_request, log_request_error, RequestLogLevel
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Formatters
|
||||||
|
from fastdeploy.logger.formatters import ColoredFormatter, CustomFormatter
|
||||||
|
|
||||||
|
# Handlers
|
||||||
|
from fastdeploy.logger.handlers import (
|
||||||
|
DailyRotatingFileHandler,
|
||||||
|
IntervalRotatingFileHandler,
|
||||||
|
LazyFileHandler,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Logger
|
||||||
|
from fastdeploy.logger.logger import FastDeployLogger
|
||||||
|
|
||||||
|
# Request logger
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Setup
|
||||||
|
from fastdeploy.logger.setup_logging import setup_logging
|
||||||
|
|
||||||
|
|
||||||
|
def get_logger(name, file_name=None, without_formater=False, print_to_console=False, channel=None):
    """
    Fetch a logger from the shared FastDeployLogger.

    Args:
        name: Logger name.
        file_name: Log file name, forwarded unchanged.
        without_formater: Forwarded unchanged.
        print_to_console: Forwarded unchanged.
        channel: Log channel, forwarded as a keyword argument.

    Returns:
        Whatever ``FastDeployLogger().get_logger`` returns for these arguments.
    """
    manager = FastDeployLogger()
    return manager.get_logger(
        name,
        file_name,
        without_formater,
        print_to_console,
        channel=channel,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Settings for the pre-defined logger instances.
# Each value is a 5-tuple in get_logger() argument order:
#   (name, file_name, without_formater, print_to_console, channel)
_LOGGER_CONFIGS = {
    # Loggers bound to the "main" channel
    "llm_logger": ("fastdeploy", None, False, False, "main"),
    "data_processor_logger": ("data_processor", None, False, False, "main"),
    "scheduler_logger": ("scheduler", None, False, False, "main"),
    "api_server_logger": ("api_server", None, False, False, "main"),
    # Logger bound to the "console" channel
    "console_logger": (None, None, False, False, "console"),
    # Loggers with an explicit file name and no channel
    "spec_logger": ("speculate", "speculate.log", False, False, None),
    "zmq_client_logger": ("zmq_client", "comm.log", False, False, None),
    "router_logger": ("router", "comm.log", False, False, None),
    "fmq_logger": ("fmq", "comm.log", False, False, None),
    "obj_logger": ("obj", "obj.log", False, False, None),
    "register_manager_logger": ("register_manager", "register_manager.log", False, False, None),
    # Logger bound to the "request" channel
    "_request_logger": ("request", None, False, False, "request"),
}

# Logger instances that have already been built, keyed by attribute name
_logger_cache = {}
|
||||||
|
|
||||||
|
|
||||||
|
def __getattr__(name):
    """
    Build pre-defined logger instances on first attribute access.

    Names listed in ``_LOGGER_CONFIGS`` are created through ``get_logger``;
    ``trace_logger`` is created through ``FastDeployLogger().get_trace_logger``.
    Each instance is stored in ``_logger_cache`` and reused afterwards.

    Raises:
        AttributeError: If ``name`` is not a pre-defined logger.
    """
    is_configured = name in _LOGGER_CONFIGS
    if not is_configured and name != "trace_logger":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    if name not in _logger_cache:
        if is_configured:
            logger_name, file_name, without_formater, print_to_console, channel = _LOGGER_CONFIGS[name]
            _logger_cache[name] = get_logger(logger_name, file_name, without_formater, print_to_console, channel)
        else:
            _logger_cache[name] = FastDeployLogger().get_trace_logger("trace", "trace.log")

    return _logger_cache[name]
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Formatters
|
||||||
|
"ColoredFormatter",
|
||||||
|
"CustomFormatter",
|
||||||
|
# Handlers
|
||||||
|
"DailyRotatingFileHandler",
|
||||||
|
"IntervalRotatingFileHandler",
|
||||||
|
"LazyFileHandler",
|
||||||
|
# Logger
|
||||||
|
"FastDeployLogger",
|
||||||
|
"get_logger",
|
||||||
|
# Request logger
|
||||||
|
"RequestLogLevel",
|
||||||
|
"log_request",
|
||||||
|
"log_request_error",
|
||||||
|
# Setup
|
||||||
|
"setup_logging",
|
||||||
|
# Pre-defined logger instances (lazy-loaded)
|
||||||
|
"llm_logger",
|
||||||
|
"data_processor_logger",
|
||||||
|
"scheduler_logger",
|
||||||
|
"api_server_logger",
|
||||||
|
"console_logger",
|
||||||
|
"spec_logger",
|
||||||
|
"zmq_client_logger",
|
||||||
|
"trace_logger",
|
||||||
|
"router_logger",
|
||||||
|
"fmq_logger",
|
||||||
|
"obj_logger",
|
||||||
|
"register_manager_logger",
|
||||||
|
]
|
||||||
|
|||||||
@@ -0,0 +1,47 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Log configuration parsing module
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_log_level(raw_level=None, debug_enabled=None) -> str:
    """
    Resolve the effective log level name.

    Priority: FD_LOG_LEVEL > FD_DEBUG

    Args:
        raw_level: Explicit level; when None the FD_LOG_LEVEL environment
            variable is read instead. Empty values and the string "None"
            (any case) mean "not set".
        debug_enabled: Explicit debug flag; when None the FD_DEBUG environment
            variable is read instead (default "0"). Booleans are accepted, as
            are values whose string form is "1".

    Returns:
        "DEBUG" or "INFO".

    Raises:
        ValueError: If a level is given that is neither INFO nor DEBUG.
    """
    raw = os.getenv("FD_LOG_LEVEL") if raw_level is None else raw_level
    # Normalise through str() so non-string values and stray whitespace
    # (common in exported environment variables) do not break the lookup.
    level = str(raw).strip().upper() if raw else ""
    if level and level != "NONE":
        if level not in {"INFO", "DEBUG"}:
            raise ValueError(f"Unsupported FD_LOG_LEVEL: {raw}")
        return level

    if debug_enabled is None:
        debug = os.getenv("FD_DEBUG", "0")
    elif isinstance(debug_enabled, bool):
        # str(True) is "True", which would never match "1"; map booleans explicitly.
        debug = "1" if debug_enabled else "0"
    else:
        debug = str(debug_enabled)
    return "DEBUG" if debug.strip() == "1" else "INFO"
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_request_logging_defaults() -> dict[str, int]:
    """
    Read the request logging settings from the environment.

    Returns:
        A dict with integer values for "enabled" (FD_LOG_REQUESTS, default 1),
        "level" (FD_LOG_REQUESTS_LEVEL, default 2) and "max_len"
        (FD_LOG_MAX_LEN, default 2048).
    """
    # (result key, environment variable, default used when the variable is unset)
    settings = (
        ("enabled", "FD_LOG_REQUESTS", "1"),
        ("level", "FD_LOG_REQUESTS_LEVEL", "2"),
        ("max_len", "FD_LOG_MAX_LEN", "2048"),
    )
    return {key: int(os.getenv(variable, default)) for key, variable, default in settings}
|
||||||
+107
-17
@@ -20,21 +20,36 @@ This module provides the get_logger method to uniformly manage logging behavior
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import threading
|
import threading
|
||||||
from contextlib import contextmanager
|
from contextlib import contextmanager
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from fastdeploy import envs
|
from fastdeploy import envs
|
||||||
from fastdeploy.logger.formatters import CustomFormatter
|
from fastdeploy.logger.config import resolve_log_level
|
||||||
|
from fastdeploy.logger.formatters import ColoredFormatter, CustomFormatter
|
||||||
from fastdeploy.logger.handlers import DailyRotatingFileHandler, LazyFileHandler
|
from fastdeploy.logger.handlers import DailyRotatingFileHandler, LazyFileHandler
|
||||||
from fastdeploy.logger.setup_logging import setup_logging
|
from fastdeploy.logger.setup_logging import setup_logging
|
||||||
|
|
||||||
|
# Standard log format
|
||||||
|
_LOG_FORMAT = "%(levelname)-8s %(asctime)s %(process)-5s %(filename)s[line:%(lineno)d] %(message)s"
|
||||||
|
|
||||||
|
|
||||||
class FastDeployLogger:
|
class FastDeployLogger:
|
||||||
_instance = None
|
_instance = None
|
||||||
_initialized = False
|
_initialized = False
|
||||||
_lock = threading.RLock()
|
_lock = threading.RLock()
|
||||||
|
|
||||||
|
# Channel to file mapping
|
||||||
|
_channel_files = {
|
||||||
|
"main": "fastdeploy.log",
|
||||||
|
"request": "request.log",
|
||||||
|
"console": "console.log",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Cache for channel loggers that have been configured
|
||||||
|
_configured_channels = set()
|
||||||
|
|
||||||
def __new__(cls):
|
def __new__(cls):
|
||||||
"""Singleton pattern implementation"""
|
"""Singleton pattern implementation"""
|
||||||
if cls._instance is None:
|
if cls._instance is None:
|
||||||
@@ -50,7 +65,7 @@ class FastDeployLogger:
|
|||||||
setup_logging()
|
setup_logging()
|
||||||
self._initialized = True
|
self._initialized = True
|
||||||
|
|
||||||
def get_logger(self, name, file_name=None, without_formater=False, print_to_console=False):
|
def get_logger(self, name, file_name=None, without_formater=False, print_to_console=False, channel=None):
|
||||||
"""
|
"""
|
||||||
Get logger (compatible with the original interface)
|
Get logger (compatible with the original interface)
|
||||||
|
|
||||||
@@ -59,7 +74,14 @@ class FastDeployLogger:
|
|||||||
file_name: Log file name (for compatibility)
|
file_name: Log file name (for compatibility)
|
||||||
without_formater: Whether to not use a formatter
|
without_formater: Whether to not use a formatter
|
||||||
print_to_console: Whether to print to console
|
print_to_console: Whether to print to console
|
||||||
|
channel: Log channel (main, request, console)
|
||||||
"""
|
"""
|
||||||
|
# If channel is specified, use channel-based logging
|
||||||
|
if channel is not None:
|
||||||
|
if not self._initialized:
|
||||||
|
self._initialize()
|
||||||
|
return self._get_channel_logger(name, channel)
|
||||||
|
|
||||||
# If only one parameter is provided, use the new unified naming convention
|
# If only one parameter is provided, use the new unified naming convention
|
||||||
if file_name is None and not without_formater and not print_to_console:
|
if file_name is None and not without_formater and not print_to_console:
|
||||||
# Lazy initialization
|
# Lazy initialization
|
||||||
@@ -74,27 +96,95 @@ class FastDeployLogger:
|
|||||||
"""
|
"""
|
||||||
New unified way to get logger
|
New unified way to get logger
|
||||||
"""
|
"""
|
||||||
if name is None:
|
return self._get_channel_logger(name, "main")
|
||||||
return logging.getLogger("fastdeploy")
|
|
||||||
|
|
||||||
# Handle __main__ special case
|
def _get_channel_logger(self, name, channel):
    """
    Get logger through channel with manual handler setup.

    Handlers are attached to the channel root logger (fastdeploy.{channel});
    loggers created below that name rely on propagation to reach them.

    The channel root is configured at most once per channel. Configuration
    runs under the class lock and the channel is recorded as configured only
    after every handler has been attached, so a failed attempt (for example
    the log directory cannot be created) is retried on the next call instead
    of leaving the channel without handlers.

    Args:
        name: logger name. None or "fastdeploy" returns the channel root;
            "__main__" maps to the stem of the main script when it has a
            ``__file__``, otherwise to the channel root; names starting with
            "fastdeploy." are used as-is; anything else is placed under the
            channel root.
        channel: log channel (main, request, console)

    Returns:
        The resolved ``logging.Logger``.
    """
    # Get or create the channel root logger (all handlers go here)
    channel_root_name = f"fastdeploy.{channel}"
    channel_logger = logging.getLogger(channel_root_name)

    # Unlocked check first to keep the common path cheap, then re-check under the lock.
    if channel not in self._configured_channels:
        with self._lock:
            if channel not in self._configured_channels:
                log_dir = envs.FD_LOG_DIR
                os.makedirs(log_dir, exist_ok=True)

                # Resolve log level (priority: FD_LOG_LEVEL > FD_DEBUG)
                log_level = resolve_log_level()
                level_no = logging.DEBUG if log_level == "DEBUG" else logging.INFO
                channel_logger.setLevel(level_no)

                file_formatter = logging.Formatter(_LOG_FORMAT)
                console_formatter = ColoredFormatter(_LOG_FORMAT)

                # Drop any handlers left from elsewhere or from a failed earlier attempt
                for handler in channel_logger.handlers[:]:
                    channel_logger.removeHandler(handler)

                # File handler for this channel
                file_name = self._channel_files.get(channel, f"{channel}.log")
                log_file = os.path.join(log_dir, file_name)
                backup_count = int(envs.FD_LOG_BACKUP_COUNT)

                file_handler = LazyFileHandler(log_file, backupCount=backup_count)
                file_handler.setFormatter(file_formatter)
                channel_logger.addHandler(file_handler)

                # Error file handler (all channels write errors to error.log)
                error_log_file = os.path.join(log_dir, "error.log")
                error_file_handler = LazyFileHandler(
                    filename=error_log_file, backupCount=backup_count, level=logging.ERROR
                )
                error_file_handler.setFormatter(file_formatter)
                channel_logger.addHandler(error_file_handler)

                # Stderr handler for ERROR level (all channels output errors to stderr)
                stderr_handler = logging.StreamHandler(sys.stderr)
                stderr_handler.setLevel(logging.ERROR)
                stderr_handler.setFormatter(console_formatter)
                channel_logger.addHandler(stderr_handler)

                # Console stdout handler for console channel only
                if channel == "console":
                    stdout_handler = logging.StreamHandler(sys.stdout)
                    stdout_handler.setLevel(level_no)
                    stdout_handler.setFormatter(console_formatter)
                    # Exclude ERROR and above (they go to stderr)
                    stdout_handler.addFilter(lambda record: record.levelno < logging.ERROR)
                    channel_logger.addHandler(stdout_handler)

                channel_logger.propagate = False

                # Record success last so a failure above is retried on the next call.
                self._configured_channels.add(channel)

    # Determine the actual logger name and return the appropriate logger
    if name is None or name == "fastdeploy":
        return channel_logger

    if name == "__main__":
        import __main__

        if not hasattr(__main__, "__file__"):
            return channel_logger
        logger_name = f"{channel_root_name}.{Path(__main__.__file__).stem}"
    elif name.startswith("fastdeploy."):
        logger_name = name
    else:
        logger_name = f"{channel_root_name}.{name}"

    # Child loggers propagate to channel_logger (which has handlers)
    return logging.getLogger(logger_name)
|
||||||
|
|
||||||
def get_trace_logger(self, name, file_name, without_formater=False, print_to_console=False):
|
def get_trace_logger(self, name, file_name, without_formater=False, print_to_console=False):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -0,0 +1,94 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
Request logging module
|
||||||
|
|
||||||
|
Provides leveled request logging with L0-L3 levels:
|
||||||
|
- L0: Critical lifecycle events (creation, completion, abort)
|
||||||
|
- L1: Processing stage details
|
||||||
|
- L2: Request/response content (truncated)
|
||||||
|
- L3: Full data
|
||||||
|
"""
|
||||||
|
|
||||||
|
from enum import IntEnum
|
||||||
|
|
||||||
|
from fastdeploy import envs
|
||||||
|
|
||||||
|
|
||||||
|
class RequestLogLevel(IntEnum):
    """
    Verbosity tiers for request logging.

    Members are integers, so they compare directly against the configured
    FD_LOG_REQUESTS_LEVEL value; a larger value means more detailed output.
    """

    # Lifecycle start/end: creation, completion, abort
    LIFECYCLE = 0
    # Processing stages: semaphore, first token, signal handling
    STAGES = 1
    # Content and scheduling: request params, scheduling, response
    CONTENT = 2
    # Complete raw data
    FULL = 3
|
||||||
|
|
||||||
|
|
||||||
|
def _should_log(level: int) -> bool:
    """
    Decide whether a request log at ``level`` should be emitted.

    Returns False when FD_LOG_REQUESTS is 0; otherwise True only if ``level``
    does not exceed FD_LOG_REQUESTS_LEVEL.
    """
    requests_enabled = int(envs.FD_LOG_REQUESTS) != 0
    return requests_enabled and int(level) <= int(envs.FD_LOG_REQUESTS_LEVEL)
|
||||||
|
|
||||||
|
|
||||||
|
def _truncate(value):
    """
    Return ``str(value)`` cut to at most FD_LOG_MAX_LEN characters.

    Text that already fits is returned unchanged; no marker is appended when
    the text is shortened.
    """
    rendered = str(value)
    limit = int(envs.FD_LOG_MAX_LEN)
    return rendered if len(rendered) <= limit else rendered[:limit]
|
||||||
|
|
||||||
|
|
||||||
|
def log_request(level: int, message: str, **fields):
    """
    Log request message

    The message is emitted at INFO on the request logger when ``level`` passes
    ``_should_log``. Field values are truncated for the CONTENT level only.
    A template that cannot be formatted with the given fields never raises:
    the raw template is logged together with the fields instead, so a logging
    mistake cannot break request handling.

    Args:
        level: Log level (0-3)
        message: Log message template, supports {field} formatting
        **fields: Message fields
    """
    if not _should_log(level):
        return

    # Imported here rather than at module level: the logger instance is
    # resolved lazily by the fastdeploy.logger package.
    from fastdeploy.logger import _request_logger

    if not fields:
        # stacklevel=2 attributes the record to the caller of log_request
        _request_logger.info(message, stacklevel=2)
        return

    payload = fields
    # CONTENT level content needs to be truncated
    if int(level) == int(RequestLogLevel.CONTENT):
        payload = {key: _truncate(value) for key, value in fields.items()}

    try:
        text = message.format(**payload)
    except (KeyError, IndexError, ValueError):
        # Missing field, positional placeholder, or stray brace in the template
        text = f"{message} | fields={payload!r}"

    _request_logger.info(text, stacklevel=2)
|
||||||
|
|
||||||
|
|
||||||
|
def log_request_error(message: str, **fields):
    """
    Log request error message

    The message is emitted at ERROR on the request logger. A template that
    cannot be formatted with the given fields never raises: the raw template
    is logged together with the fields instead, so the original error being
    reported is not masked by a formatting failure.

    Args:
        message: Log message template, supports {field} formatting
        **fields: Message fields
    """
    # Imported here rather than at module level: the logger instance is
    # resolved lazily by the fastdeploy.logger package.
    from fastdeploy.logger import _request_logger

    text = message
    if fields:
        try:
            text = message.format(**fields)
        except (KeyError, IndexError, ValueError):
            # Missing field, positional placeholder, or stray brace in the template
            text = f"{message} | fields={fields!r}"

    # stacklevel=2 attributes the record to the caller of log_request_error
    _request_logger.error(text, stacklevel=2)
|
||||||
@@ -14,7 +14,19 @@
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
"""
|
"""
|
||||||
配置日志系统
|
Setup logging system
|
||||||
|
|
||||||
|
This module handles logging initialization:
|
||||||
|
- Basic setup: log directory creation
|
||||||
|
- Optional: load external JSON config file via dictConfig
|
||||||
|
|
||||||
|
Channel-based logger configuration is handled by FastDeployLogger._get_channel_logger()
|
||||||
|
using manual addHandler for better performance.
|
||||||
|
|
||||||
|
Log channels:
|
||||||
|
- main: Main logs -> fastdeploy.log
|
||||||
|
- request: Request logs -> request.log
|
||||||
|
- console: Console logs -> console.log + terminal
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
@@ -27,54 +39,29 @@ from fastdeploy import envs
|
|||||||
|
|
||||||
|
|
||||||
class MaxLevelFilter(logging.Filter):
    """Pass only log records whose level is strictly below a given level.

    Used to route INFO/DEBUG to stdout, ERROR/CRITICAL to stderr.
    """

    def __init__(self, level):
        """Store the exclusive upper bound, given as a level name or number."""
        super().__init__()
        if isinstance(level, str):
            # Translate a level name to its number; unknown names are kept unchanged
            level = logging._nameToLevel.get(level, level)
        self.level = level

    def filter(self, record):
        """Return True when the record is below the stored level."""
        return record.levelno < self.level
|
||||||
|
|
||||||
|
|
||||||
def setup_logging(log_dir=None, config_file=None):
|
def _build_default_config(log_dir, log_level, backup_count):
|
||||||
"""
|
"""Build default logging configuration for dictConfig"""
|
||||||
设置FastDeploy的日志配置
|
|
||||||
|
|
||||||
Args:
|
|
||||||
log_dir: 日志文件存储目录,如果不提供则使用环境变量
|
|
||||||
config_file: JSON配置文件路径,如果不提供则使用默认配置
|
|
||||||
"""
|
|
||||||
|
|
||||||
# 避免重复配置
|
|
||||||
if getattr(setup_logging, "_configured", False):
|
|
||||||
return logging.getLogger("fastdeploy")
|
|
||||||
|
|
||||||
# 使用环境变量中的日志目录,如果没有则使用传入的参数或默认值
|
|
||||||
if log_dir is None:
|
|
||||||
log_dir = getattr(envs, "FD_LOG_DIR", "log")
|
|
||||||
|
|
||||||
# 确保日志目录存在
|
|
||||||
Path(log_dir).mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
# 从环境变量获取日志级别和备份数量
|
|
||||||
is_debug = int(getattr(envs, "FD_DEBUG", 0))
|
|
||||||
FASTDEPLOY_LOGGING_LEVEL = "DEBUG" if is_debug else "INFO"
|
|
||||||
backup_count = int(getattr(envs, "FD_LOG_BACKUP_COUNT", 7))
|
|
||||||
|
|
||||||
# 定义日志输出格式
|
|
||||||
_FORMAT = "%(levelname)-8s %(asctime)s %(process)-5s %(filename)s[line:%(lineno)d] %(message)s"
|
_FORMAT = "%(levelname)-8s %(asctime)s %(process)-5s %(filename)s[line:%(lineno)d] %(message)s"
|
||||||
|
|
||||||
# 默认配置
|
return {
|
||||||
default_config = {
|
|
||||||
"version": 1,
|
"version": 1,
|
||||||
"disable_existing_loggers": False,
|
"disable_existing_loggers": False,
|
||||||
"filters": {
|
"filters": {
|
||||||
"below_error": {
|
"below_error": {
|
||||||
"()": "fastdeploy.logger.setup_logging.MaxLevelFilter",
|
"()": MaxLevelFilter,
|
||||||
"level": "ERROR",
|
"level": "ERROR",
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@@ -91,99 +78,135 @@ def setup_logging(log_dir=None, config_file=None):
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
"handlers": {
|
"handlers": {
|
||||||
# 控制台标准输出,用于 INFO/DEBUG(低于 ERROR 级别)
|
# Console stdout for INFO/DEBUG (below ERROR level)
|
||||||
"console_stdout": {
|
"console_stdout": {
|
||||||
"class": "logging.StreamHandler",
|
"class": "logging.StreamHandler",
|
||||||
"level": FASTDEPLOY_LOGGING_LEVEL,
|
"level": log_level,
|
||||||
"filters": ["below_error"],
|
"filters": ["below_error"],
|
||||||
"formatter": "colored",
|
"formatter": "colored",
|
||||||
"stream": "ext://sys.stdout",
|
"stream": "ext://sys.stdout",
|
||||||
},
|
},
|
||||||
# 控制台错误输出,用于 ERROR/CRITICAL
|
# Console stderr for ERROR/CRITICAL
|
||||||
"console_stderr": {
|
"console_stderr": {
|
||||||
"class": "logging.StreamHandler",
|
"class": "logging.StreamHandler",
|
||||||
"level": "ERROR",
|
"level": "ERROR",
|
||||||
"formatter": "colored",
|
"formatter": "colored",
|
||||||
"stream": "ext://sys.stderr",
|
"stream": "ext://sys.stderr",
|
||||||
},
|
},
|
||||||
# 默认错误日志,保留最新1个小时的日志,位置在log/error.log
|
# Main log file
|
||||||
|
"main_file": {
|
||||||
|
"class": "fastdeploy.logger.handlers.LazyFileHandler",
|
||||||
|
"level": log_level,
|
||||||
|
"formatter": "standard",
|
||||||
|
"filename": os.path.join(log_dir, "fastdeploy.log"),
|
||||||
|
"backupCount": backup_count,
|
||||||
|
},
|
||||||
|
# Console log file
|
||||||
|
"console_file": {
|
||||||
|
"class": "fastdeploy.logger.handlers.LazyFileHandler",
|
||||||
|
"level": log_level,
|
||||||
|
"formatter": "standard",
|
||||||
|
"filename": os.path.join(log_dir, "console.log"),
|
||||||
|
"backupCount": backup_count,
|
||||||
|
},
|
||||||
|
# Request log file
|
||||||
|
"request_file": {
|
||||||
|
"class": "fastdeploy.logger.handlers.LazyFileHandler",
|
||||||
|
"level": log_level,
|
||||||
|
"formatter": "standard",
|
||||||
|
"filename": os.path.join(log_dir, "request.log"),
|
||||||
|
"backupCount": backup_count,
|
||||||
|
},
|
||||||
|
# Error log file
|
||||||
"error_file": {
|
"error_file": {
|
||||||
"class": "logging.handlers.TimedRotatingFileHandler",
|
"class": "fastdeploy.logger.handlers.LazyFileHandler",
|
||||||
"level": "ERROR",
|
"level": "ERROR",
|
||||||
"formatter": "standard",
|
"formatter": "standard",
|
||||||
"filename": os.path.join(log_dir, "error.log"),
|
"filename": os.path.join(log_dir, "error.log"),
|
||||||
"when": "H",
|
"backupCount": backup_count,
|
||||||
"interval": 1,
|
|
||||||
"backupCount": 1,
|
|
||||||
},
|
|
||||||
# 全量日志,保留最新1小时的日志,位置在log/default.log
|
|
||||||
"default_file": {
|
|
||||||
"class": "logging.handlers.TimedRotatingFileHandler",
|
|
||||||
"level": FASTDEPLOY_LOGGING_LEVEL,
|
|
||||||
"formatter": "standard",
|
|
||||||
"filename": os.path.join(log_dir, "default.log"),
|
|
||||||
"when": "H",
|
|
||||||
"interval": 1,
|
|
||||||
"backupCount": 1,
|
|
||||||
},
|
|
||||||
# 错误日志归档,保留7天内的日志,每隔1小时一个文件,形式如:FastDeploy/log/2025-08-14/error_2025-08-14-18.log
|
|
||||||
"error_archive": {
|
|
||||||
"class": "fastdeploy.logger.handlers.IntervalRotatingFileHandler",
|
|
||||||
"level": "ERROR",
|
|
||||||
"formatter": "standard",
|
|
||||||
"filename": os.path.join(log_dir, "error.log"),
|
|
||||||
"backupDays": 7,
|
|
||||||
"interval": 1,
|
|
||||||
"encoding": "utf-8",
|
|
||||||
},
|
|
||||||
# 全量日志归档,保留7天内的日志,每隔1小时一个文件,形式如:FastDeploy/log/2025-08-14/default_2025-08-14-18.log
|
|
||||||
"default_archive": {
|
|
||||||
"class": "fastdeploy.logger.handlers.IntervalRotatingFileHandler",
|
|
||||||
"level": FASTDEPLOY_LOGGING_LEVEL,
|
|
||||||
"formatter": "standard",
|
|
||||||
"filename": os.path.join(log_dir, "default.log"),
|
|
||||||
"backupDays": 7,
|
|
||||||
"interval": 1,
|
|
||||||
"encoding": "utf-8",
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
"loggers": {
|
"loggers": {
|
||||||
# 默认日志记录器,全局共享
|
# Default logger
|
||||||
"fastdeploy": {
|
"fastdeploy": {
|
||||||
"level": "DEBUG",
|
"level": "DEBUG",
|
||||||
"handlers": [
|
"handlers": ["main_file", "console_file", "error_file", "console_stderr"],
|
||||||
"console_stdout",
|
|
||||||
"console_stderr",
|
|
||||||
"error_file",
|
|
||||||
"default_file",
|
|
||||||
"error_archive",
|
|
||||||
"default_archive",
|
|
||||||
],
|
|
||||||
"propagate": False,
|
"propagate": False,
|
||||||
}
|
},
|
||||||
|
# Main channel
|
||||||
|
"fastdeploy.main": {
|
||||||
|
"level": "DEBUG",
|
||||||
|
"handlers": ["main_file", "console_file", "error_file", "console_stderr"],
|
||||||
|
"propagate": False,
|
||||||
|
},
|
||||||
|
# Request channel - only output to request.log and error.log
|
||||||
|
"fastdeploy.request": {
|
||||||
|
"level": "DEBUG",
|
||||||
|
"handlers": ["request_file", "error_file", "console_stderr"],
|
||||||
|
"propagate": False,
|
||||||
|
},
|
||||||
|
# Console channel - output to console.log and terminal
|
||||||
|
"fastdeploy.console": {
|
||||||
|
"level": "DEBUG",
|
||||||
|
"handlers": ["console_file", "console_stdout", "error_file", "console_stderr"],
|
||||||
|
"propagate": False,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
# 如果提供了配置文件,则加载配置文件
|
|
||||||
if config_file and os.path.exists(config_file):
|
def setup_logging(log_dir=None, config_file=None):
    """
    Initialise FastDeploy logging once per process.

    The log directory is always created. ``dictConfig`` is applied only when
    ``config_file`` is given: the JSON file is loaded if it exists, otherwise
    the built-in default configuration is used. Later calls return
    immediately.

    Note: Channel-based loggers (get_logger with channel param) use manual addHandler,
    independent of dictConfig.

    Args:
        log_dir: Log file storage directory, uses environment variable if not provided
        config_file: Optional JSON config file path for dictConfig
    """
    # Already configured: nothing to do
    if getattr(setup_logging, "_configured", False):
        return

    target_dir = getattr(envs, "FD_LOG_DIR", "log") if log_dir is None else log_dir
    Path(target_dir).mkdir(parents=True, exist_ok=True)

    # Remember the directory on the function object for later use
    setup_logging._log_dir = target_dir

    if config_file is not None:
        debug_on = int(getattr(envs, "FD_DEBUG", 0))
        log_level = "DEBUG" if debug_on else "INFO"
        backup_count = int(getattr(envs, "FD_LOG_BACKUP_COUNT", 7))

        if not os.path.exists(config_file):
            # Config file not found, use default config
            config = _build_default_config(target_dir, log_level, backup_count)
        else:
            with open(config_file, "r", encoding="utf-8") as config_fp:
                config = json.load(config_fp)

            # Merge environment variable settings into the user's handlers
            if "handlers" in config:
                for handler_config in config["handlers"].values():
                    needs_backup_count = "backupCount" not in handler_config and "DailyRotating" in handler_config.get(
                        "class", ""
                    )
                    if needs_backup_count:
                        handler_config["backupCount"] = backup_count
                    # Debug mode raises INFO handlers to DEBUG
                    if handler_config.get("level") == "INFO" and log_level == "DEBUG":
                        handler_config["level"] = "DEBUG"

        logging.config.dictConfig(config)

    # Mark as configured
    setup_logging._configured = True
|
||||||
|
|
||||||
# 返回fastdeploy的logger
|
|
||||||
return logging.getLogger("fastdeploy")
|
|
||||||
|
|||||||
@@ -40,6 +40,11 @@ from fastdeploy.engine.request import (
|
|||||||
SpeculateMetrics,
|
SpeculateMetrics,
|
||||||
)
|
)
|
||||||
from fastdeploy.inter_communicator import ZmqIpcServer
|
from fastdeploy.inter_communicator import ZmqIpcServer
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
from fastdeploy.metrics.metrics import main_process_metrics
|
from fastdeploy.metrics.metrics import main_process_metrics
|
||||||
from fastdeploy.platforms import current_platform
|
from fastdeploy.platforms import current_platform
|
||||||
from fastdeploy.spec_decode import SpecMethod
|
from fastdeploy.spec_decode import SpecMethod
|
||||||
@@ -224,7 +229,11 @@ class TokenProcessor:
|
|||||||
for token_id in token_id_list:
|
for token_id in token_id_list:
|
||||||
recovery_stop = token_id == RECOVERY_STOP_SIGNAL
|
recovery_stop = token_id == RECOVERY_STOP_SIGNAL
|
||||||
if recovery_stop:
|
if recovery_stop:
|
||||||
llm_logger.info(f"recovery stop signal found at task {task_id}")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="recovery stop signal found at task {request_id}",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
self.tokens_counter[task_id] += 1
|
self.tokens_counter[task_id] += 1
|
||||||
if token_id != RECOVERY_STOP_SIGNAL:
|
if token_id != RECOVERY_STOP_SIGNAL:
|
||||||
result.outputs.token_ids.append(token_id)
|
result.outputs.token_ids.append(token_id)
|
||||||
@@ -252,12 +261,25 @@ class TokenProcessor:
|
|||||||
|
|
||||||
# Print combined log with all required information
|
# Print combined log with all required information
|
||||||
ttft = task.metrics.first_token_time if task.metrics.first_token_time else 0
|
ttft = task.metrics.first_token_time if task.metrics.first_token_time else 0
|
||||||
llm_logger.info(
|
log_request(
|
||||||
f"Request={task_id}, InputToken={task.prompt_token_ids_len}, "
|
RequestLogLevel.LIFECYCLE,
|
||||||
f"CachedDetail={cached_detail}, OutputToken={self.tokens_counter[task_id]}, "
|
message=(
|
||||||
f"TokenRatio={token_ratio:.2f}, TTFT={ttft:.2f}, "
|
"Request={request_id}, InputToken={input_tokens}, "
|
||||||
f"E2E={e2e_time:.2f}, IsPrefill={is_prefill}, RecoveryStop={recovery_stop}, "
|
"CachedDetail={cached_detail}, OutputToken={output_tokens}, "
|
||||||
f"PreemptedCount={getattr(task.metrics, 'preempted_count', 0)}"
|
"TokenRatio={token_ratio}, TTFT={ttft}, "
|
||||||
|
"E2E={e2e_time}, IsPrefill={is_prefill}, RecoveryStop={recovery_stop}, "
|
||||||
|
"PreemptedCount={preempted_count}"
|
||||||
|
),
|
||||||
|
request_id=task_id,
|
||||||
|
input_tokens=task.prompt_token_ids_len,
|
||||||
|
cached_detail=cached_detail,
|
||||||
|
output_tokens=self.tokens_counter[task_id],
|
||||||
|
token_ratio=f"{token_ratio:.2f}",
|
||||||
|
ttft=f"{ttft:.2f}",
|
||||||
|
e2e_time=f"{e2e_time:.2f}",
|
||||||
|
is_prefill=is_prefill,
|
||||||
|
recovery_stop=recovery_stop,
|
||||||
|
preempted_count=getattr(task.metrics, "preempted_count", 0),
|
||||||
)
|
)
|
||||||
|
|
||||||
main_process_metrics.request_token_ratio.observe(token_ratio)
|
main_process_metrics.request_token_ratio.observe(token_ratio)
|
||||||
@@ -289,13 +311,21 @@ class TokenProcessor:
|
|||||||
task_id in self.resource_manager.to_be_aborted_req_id_set
|
task_id in self.resource_manager.to_be_aborted_req_id_set
|
||||||
and token_ids[-1] == PREEMPTED_TOKEN_ID
|
and token_ids[-1] == PREEMPTED_TOKEN_ID
|
||||||
):
|
):
|
||||||
llm_logger.info(f"start to recycle abort request_id {task_id}")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="start to recycle abort request_id {request_id}",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
self.resource_manager.recycle_abort_task(task_id)
|
self.resource_manager.recycle_abort_task(task_id)
|
||||||
if (
|
if (
|
||||||
task_id in self.resource_manager.to_be_rescheduled_request_id_set
|
task_id in self.resource_manager.to_be_rescheduled_request_id_set
|
||||||
and token_ids[-1] == PREEMPTED_TOKEN_ID
|
and token_ids[-1] == PREEMPTED_TOKEN_ID
|
||||||
):
|
):
|
||||||
llm_logger.info(f"sync preemption for request_id {task_id} done.")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="sync preemption for request_id {request_id} done.",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
self.resource_manager.reschedule_preempt_task(task_id)
|
self.resource_manager.reschedule_preempt_task(task_id)
|
||||||
continue
|
continue
|
||||||
if self.cfg.scheduler_config.splitwise_role == "decode":
|
if self.cfg.scheduler_config.splitwise_role == "decode":
|
||||||
@@ -351,12 +381,20 @@ class TokenProcessor:
|
|||||||
result.outputs.logprob = float(logprobs_list.logprobs[0][0])
|
result.outputs.logprob = float(logprobs_list.logprobs[0][0])
|
||||||
result.outputs.top_logprobs = logprobs_list
|
result.outputs.top_logprobs = logprobs_list
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
llm_logger.warning(f"Failed to parse logprobs from StreamTransferData: {e}")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="Failed to parse logprobs from StreamTransferData: {error}",
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
if getattr(stream_data, "prompt_logprobs", None) is not None:
|
if getattr(stream_data, "prompt_logprobs", None) is not None:
|
||||||
try:
|
try:
|
||||||
result.prompt_logprobs = stream_data.prompt_logprobs
|
result.prompt_logprobs = stream_data.prompt_logprobs
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
llm_logger.warning(f"Failed to parse prompt_logprobs from StreamTransferData: {e}")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="Failed to parse prompt_logprobs from StreamTransferData: {error}",
|
||||||
|
error=str(e),
|
||||||
|
)
|
||||||
if self.tokens_counter[task_id] == 0:
|
if self.tokens_counter[task_id] == 0:
|
||||||
if task.messages is not None:
|
if task.messages is not None:
|
||||||
result.prompt = task.messages
|
result.prompt = task.messages
|
||||||
@@ -394,7 +432,9 @@ class TokenProcessor:
|
|||||||
batch_result = self._process_batch_output_use_zmq(receive_datas)
|
batch_result = self._process_batch_output_use_zmq(receive_datas)
|
||||||
self.postprocess(batch_result)
|
self.postprocess(batch_result)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
llm_logger.error(f"Receive message:{receive_datas}, error:{e}")
|
log_request_error(
|
||||||
|
message="Receive message:{receive_datas}, error:{error}", receive_datas=receive_datas, error=e
|
||||||
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
def process_sampling_results(self):
|
def process_sampling_results(self):
|
||||||
@@ -511,7 +551,11 @@ class TokenProcessor:
|
|||||||
else:
|
else:
|
||||||
self.cached_generated_tokens.put_results(batch_result)
|
self.cached_generated_tokens.put_results(batch_result)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
llm_logger.error(f"Error in TokenProcessor's postprocess: {e}, {str(traceback.format_exc())}")
|
log_request_error(
|
||||||
|
message="Error in TokenProcessor's postprocess: {error}, {traceback}",
|
||||||
|
error=e,
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
|
|
||||||
def _recycle_resources(self, task_id, index, task, result=None, is_prefill=False):
|
def _recycle_resources(self, task_id, index, task, result=None, is_prefill=False):
|
||||||
"""
|
"""
|
||||||
@@ -524,14 +568,21 @@ class TokenProcessor:
|
|||||||
finished_task_ids = self.engine_worker_queue.get_finished_req()
|
finished_task_ids = self.engine_worker_queue.get_finished_req()
|
||||||
if len(finished_task_ids) > 0:
|
if len(finished_task_ids) > 0:
|
||||||
for finished_task_id in finished_task_ids:
|
for finished_task_id in finished_task_ids:
|
||||||
llm_logger.info(f"finished_task_id: {finished_task_id}")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="finished_task_id: {finished_task_id}",
|
||||||
|
finished_task_id=finished_task_id,
|
||||||
|
)
|
||||||
self.prefill_result_status[finished_task_id[0]] = finished_task_id[1]
|
self.prefill_result_status[finished_task_id[0]] = finished_task_id[1]
|
||||||
if task_id in self.prefill_result_status:
|
if task_id in self.prefill_result_status:
|
||||||
if self.prefill_result_status[task_id] != "finished":
|
if self.prefill_result_status[task_id] != "finished":
|
||||||
result.error_code = 400
|
result.error_code = 400
|
||||||
result.error_message = f"{task_id} failed to {self.prefill_result_status[task_id]}"
|
result.error_message = f"{task_id} failed to {self.prefill_result_status[task_id]}"
|
||||||
llm_logger.info(
|
log_request(
|
||||||
f"wait for sending cache, request_id: {task_id}, cost seconds: {time.time()-start_time:.5f}"
|
RequestLogLevel.STAGES,
|
||||||
|
message="wait for sending cache, request_id: {request_id}, cost seconds: {cost_seconds}",
|
||||||
|
request_id=task_id,
|
||||||
|
cost_seconds=f"{time.time()-start_time:.5f}",
|
||||||
)
|
)
|
||||||
result.metrics.send_request_output_to_decode_time = time.time()
|
result.metrics.send_request_output_to_decode_time = time.time()
|
||||||
self.split_connector.send_first_token(task.disaggregate_info, [result])
|
self.split_connector.send_first_token(task.disaggregate_info, [result])
|
||||||
@@ -755,7 +806,11 @@ class TokenProcessor:
|
|||||||
if self.cfg.speculative_config.method:
|
if self.cfg.speculative_config.method:
|
||||||
self._record_speculative_decoding_accept_num_per_request(task_id, accept_num[i])
|
self._record_speculative_decoding_accept_num_per_request(task_id, accept_num[i])
|
||||||
if accept_num[i] == PREEMPTED_TOKEN_ID: # in MTP, means preemption has happened in worker
|
if accept_num[i] == PREEMPTED_TOKEN_ID: # in MTP, means preemption has happened in worker
|
||||||
llm_logger.info(f"sync preemption for request_id {task_id} done.")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="sync preemption for request_id {request_id} done.",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||||
if task_id in self.resource_manager.to_be_aborted_req_id_set:
|
if task_id in self.resource_manager.to_be_aborted_req_id_set:
|
||||||
self.resource_manager.recycle_abort_task(task_id)
|
self.resource_manager.recycle_abort_task(task_id)
|
||||||
@@ -765,7 +820,11 @@ class TokenProcessor:
|
|||||||
if accept_num[i] == -3:
|
if accept_num[i] == -3:
|
||||||
recovery_stop = True
|
recovery_stop = True
|
||||||
if recovery_stop:
|
if recovery_stop:
|
||||||
llm_logger.info(f"recovery stop signal found at task {task_id}")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="recovery stop signal found at task {request_id}",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
token_ids = [RECOVERY_STOP_SIGNAL]
|
token_ids = [RECOVERY_STOP_SIGNAL]
|
||||||
elif self.use_logprobs:
|
elif self.use_logprobs:
|
||||||
token_ids = tokens[i][:, 0].tolist()[: accept_num[i]]
|
token_ids = tokens[i][:, 0].tolist()[: accept_num[i]]
|
||||||
@@ -785,7 +844,11 @@ class TokenProcessor:
|
|||||||
token_ids = [token_id]
|
token_ids = [token_id]
|
||||||
recovery_stop = token_id == RECOVERY_STOP_SIGNAL
|
recovery_stop = token_id == RECOVERY_STOP_SIGNAL
|
||||||
if recovery_stop:
|
if recovery_stop:
|
||||||
llm_logger.info(f"recovery stop signal found at task {task_id}")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="recovery stop signal found at task {request_id}",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
if not recovery_stop and token_id < 0:
|
if not recovery_stop and token_id < 0:
|
||||||
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
|
||||||
if (
|
if (
|
||||||
@@ -793,12 +856,20 @@ class TokenProcessor:
|
|||||||
and token_id == PREEMPTED_TOKEN_ID
|
and token_id == PREEMPTED_TOKEN_ID
|
||||||
):
|
):
|
||||||
self.resource_manager.recycle_abort_task(task_id)
|
self.resource_manager.recycle_abort_task(task_id)
|
||||||
llm_logger.info(f"sync abortion for request_id {task_id} done.")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="sync abortion for request_id {request_id} done.",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
if (
|
if (
|
||||||
task_id in self.resource_manager.to_be_rescheduled_request_id_set
|
task_id in self.resource_manager.to_be_rescheduled_request_id_set
|
||||||
and token_id == PREEMPTED_TOKEN_ID
|
and token_id == PREEMPTED_TOKEN_ID
|
||||||
):
|
):
|
||||||
llm_logger.info(f"sync preemption for request_id {task_id} done.")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="sync preemption for request_id {request_id} done.",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
self.resource_manager.reschedule_preempt_task(task_id)
|
self.resource_manager.reschedule_preempt_task(task_id)
|
||||||
continue
|
continue
|
||||||
if self.cfg.scheduler_config.splitwise_role == "decode":
|
if self.cfg.scheduler_config.splitwise_role == "decode":
|
||||||
@@ -826,7 +897,11 @@ class TokenProcessor:
|
|||||||
task.metrics.record_recv_first_token()
|
task.metrics.record_recv_first_token()
|
||||||
task.metrics.cal_cost_time()
|
task.metrics.cal_cost_time()
|
||||||
metrics = copy.copy(task.metrics)
|
metrics = copy.copy(task.metrics)
|
||||||
llm_logger.info(f"task:{task.request_id} start recode first token")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="task:{request_id} start recode first token",
|
||||||
|
request_id=task.request_id,
|
||||||
|
)
|
||||||
self._record_first_token_metrics(task, current_time)
|
self._record_first_token_metrics(task, current_time)
|
||||||
|
|
||||||
tracing.trace_report_span(
|
tracing.trace_report_span(
|
||||||
@@ -932,12 +1007,26 @@ class TokenProcessor:
|
|||||||
# Print combined log with all required information
|
# Print combined log with all required information
|
||||||
ttft = task.metrics.first_token_time if task.metrics.first_token_time else 0
|
ttft = task.metrics.first_token_time if task.metrics.first_token_time else 0
|
||||||
ttft_s = ttft + task.metrics.time_in_queue
|
ttft_s = ttft + task.metrics.time_in_queue
|
||||||
llm_logger.info(
|
log_request(
|
||||||
f"Request={task_id}, InputToken={task.prompt_token_ids_len}, "
|
RequestLogLevel.LIFECYCLE,
|
||||||
f"CachedDetail={cached_detail}, OutputToken={self.tokens_counter[task_id]}, "
|
message=(
|
||||||
f"TokenRatio={token_ratio:.2f}, TTFT={ttft:.2f}, TTFT_S={ttft_s:.2f}, "
|
"Request={request_id}, InputToken={input_tokens}, "
|
||||||
f"E2E={e2e_time:.2f}, IsPrefill={is_prefill}, RecoveryStop={recovery_stop}, "
|
"CachedDetail={cached_detail}, OutputToken={output_tokens}, "
|
||||||
f"PreemptedCount={getattr(task.metrics, 'preempted_count', 0)}"
|
"TokenRatio={token_ratio}, TTFT={ttft}, TTFT_S={ttft_s}, "
|
||||||
|
"E2E={e2e_time}, IsPrefill={is_prefill}, RecoveryStop={recovery_stop}, "
|
||||||
|
"PreemptedCount={preempted_count}"
|
||||||
|
),
|
||||||
|
request_id=task_id,
|
||||||
|
input_tokens=task.prompt_token_ids_len,
|
||||||
|
cached_detail=cached_detail,
|
||||||
|
output_tokens=self.tokens_counter[task_id],
|
||||||
|
token_ratio=f"{token_ratio:.2f}",
|
||||||
|
ttft=f"{ttft:.2f}",
|
||||||
|
ttft_s=f"{ttft_s:.2f}",
|
||||||
|
e2e_time=f"{e2e_time:.2f}",
|
||||||
|
is_prefill=is_prefill,
|
||||||
|
recovery_stop=recovery_stop,
|
||||||
|
preempted_count=getattr(task.metrics, "preempted_count", 0),
|
||||||
)
|
)
|
||||||
|
|
||||||
main_process_metrics.request_token_ratio.observe(token_ratio)
|
main_process_metrics.request_token_ratio.observe(token_ratio)
|
||||||
@@ -946,8 +1035,11 @@ class TokenProcessor:
|
|||||||
self._compute_speculative_status(result)
|
self._compute_speculative_status(result)
|
||||||
if not is_prefill:
|
if not is_prefill:
|
||||||
self._record_completion_metrics(task, current_time)
|
self._record_completion_metrics(task, current_time)
|
||||||
llm_logger.info(f"task {task_id} received eos token. Recycling.")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="task {request_id} received eos token. Recycling.",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
if (
|
if (
|
||||||
envs.ENABLE_V1_KVCACHE_SCHEDULER
|
envs.ENABLE_V1_KVCACHE_SCHEDULER
|
||||||
and self.cfg.cache_config.enable_prefix_caching
|
and self.cfg.cache_config.enable_prefix_caching
|
||||||
@@ -957,7 +1049,11 @@ class TokenProcessor:
|
|||||||
task
|
task
|
||||||
) # when enable prefix caching, cache kv cache for output tokens
|
) # when enable prefix caching, cache kv cache for output tokens
|
||||||
self._recycle_resources(task_id, i, task, result, is_prefill)
|
self._recycle_resources(task_id, i, task, result, is_prefill)
|
||||||
llm_logger.info(f"eos token {task_id} Recycle end.")
|
log_request(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="eos token {request_id} Recycle end.",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
llm_logger.debug(f"get response from infer: {result}")
|
llm_logger.debug(f"get response from infer: {result}")
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ import time
|
|||||||
from typing import Dict, List, Optional
|
from typing import Dict, List, Optional
|
||||||
|
|
||||||
from fastdeploy.engine.request import Request, RequestOutput
|
from fastdeploy.engine.request import Request, RequestOutput
|
||||||
|
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
|
||||||
from fastdeploy.scheduler.data import ScheduledResponse
|
from fastdeploy.scheduler.data import ScheduledResponse
|
||||||
from fastdeploy.scheduler.local_scheduler import LocalScheduler
|
from fastdeploy.scheduler.local_scheduler import LocalScheduler
|
||||||
from fastdeploy.utils import get_logger
|
from fastdeploy.utils import get_logger
|
||||||
@@ -58,7 +59,11 @@ class DPLocalScheduler(LocalScheduler):
|
|||||||
|
|
||||||
finished_responses = [response.request_id for response in responses if response.finished]
|
finished_responses = [response.request_id for response in responses if response.finished]
|
||||||
if len(finished_responses) > 0:
|
if len(finished_responses) > 0:
|
||||||
self.scheduler_logger.info(f"Scheduler has received some finished responses: {finished_responses}")
|
log_request(
|
||||||
|
RequestLogLevel.CONTENT,
|
||||||
|
message="Scheduler has received some finished responses: {request_ids}",
|
||||||
|
request_ids=finished_responses,
|
||||||
|
)
|
||||||
|
|
||||||
with self.mutex:
|
with self.mutex:
|
||||||
self.batch_responses_per_step.append([response.raw for response in responses])
|
self.batch_responses_per_step.append([response.raw for response in responses])
|
||||||
@@ -146,8 +151,10 @@ class DPLocalScheduler(LocalScheduler):
|
|||||||
self.ids_read_cursor += 1
|
self.ids_read_cursor += 1
|
||||||
|
|
||||||
if len(requests) > 0:
|
if len(requests) > 0:
|
||||||
self.scheduler_logger.info(
|
log_request(
|
||||||
f"Scheduler has pulled some request: {[request.request_id for request in requests]}"
|
RequestLogLevel.CONTENT,
|
||||||
|
message="Scheduler has pulled some request: {request_ids}",
|
||||||
|
request_ids=[request.request_id for request in requests],
|
||||||
)
|
)
|
||||||
|
|
||||||
return requests
|
return requests
|
||||||
@@ -195,7 +202,11 @@ class DPScheduler:
|
|||||||
def _put_requests_to_local(self):
|
def _put_requests_to_local(self):
|
||||||
while True:
|
while True:
|
||||||
request = self.request_queues.get()
|
request = self.request_queues.get()
|
||||||
self.scheduler_logger.info(f"Receive request from puller, request_id: {request.request_id}")
|
log_request(
|
||||||
|
RequestLogLevel.CONTENT,
|
||||||
|
message="Receive request from puller, request_id: {request_id}",
|
||||||
|
request_id=request.request_id,
|
||||||
|
)
|
||||||
self._scheduler.put_requests([request])
|
self._scheduler.put_requests([request])
|
||||||
|
|
||||||
def _get_response_from_local(self):
|
def _get_response_from_local(self):
|
||||||
|
|||||||
@@ -25,6 +25,11 @@ import crcmod
|
|||||||
from redis import ConnectionPool
|
from redis import ConnectionPool
|
||||||
|
|
||||||
from fastdeploy.engine.request import Request, RequestOutput
|
from fastdeploy.engine.request import Request, RequestOutput
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
from fastdeploy.scheduler import utils
|
from fastdeploy.scheduler import utils
|
||||||
from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse
|
from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse
|
||||||
from fastdeploy.scheduler.storage import AdaptedRedis
|
from fastdeploy.scheduler.storage import AdaptedRedis
|
||||||
@@ -370,7 +375,11 @@ class GlobalScheduler:
|
|||||||
rem_amount=0,
|
rem_amount=0,
|
||||||
ttl=self.ttl,
|
ttl=self.ttl,
|
||||||
)
|
)
|
||||||
scheduler_logger.info(f"Scheduler has enqueued some requests: {requests}")
|
log_request(
|
||||||
|
RequestLogLevel.CONTENT,
|
||||||
|
message="Scheduler has enqueued some requests: {request_ids}",
|
||||||
|
request_ids=[request.request_id for request in requests],
|
||||||
|
)
|
||||||
|
|
||||||
if duplicate:
|
if duplicate:
|
||||||
scheduler_logger.warning(
|
scheduler_logger.warning(
|
||||||
@@ -573,7 +582,9 @@ class GlobalScheduler:
|
|||||||
self.stolen_requests[request.request_id] = request
|
self.stolen_requests[request.request_id] = request
|
||||||
continue
|
continue
|
||||||
|
|
||||||
scheduler_logger.error(f"Scheduler has received a duplicate request from others: {request}")
|
log_request_error(
|
||||||
|
message="Scheduler has received a duplicate request from others: {request}", request=request
|
||||||
|
)
|
||||||
|
|
||||||
requests: List[Request] = [request.raw for request in scheduled_requests]
|
requests: List[Request] = [request.raw for request in scheduled_requests]
|
||||||
if len(remaining_request) > 0:
|
if len(remaining_request) > 0:
|
||||||
@@ -603,7 +614,11 @@ class GlobalScheduler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
if len(requests) > 0:
|
if len(requests) > 0:
|
||||||
scheduler_logger.info(f"Scheduler has pulled some request: {[request.request_id for request in requests]}")
|
log_request(
|
||||||
|
RequestLogLevel.CONTENT,
|
||||||
|
message="Scheduler has pulled some request: {request_ids}",
|
||||||
|
request_ids=[request.request_id for request in requests],
|
||||||
|
)
|
||||||
return requests
|
return requests
|
||||||
|
|
||||||
def _put_results_worker(self, tasks: List[Task]):
|
def _put_results_worker(self, tasks: List[Task]):
|
||||||
@@ -649,7 +664,9 @@ class GlobalScheduler:
|
|||||||
stolen_responses[response_queue_name].append(response.serialize())
|
stolen_responses[response_queue_name].append(response.serialize())
|
||||||
continue
|
continue
|
||||||
|
|
||||||
scheduler_logger.error(f"Scheduler has received a non-existent response from engine: {[response]}")
|
log_request_error(
|
||||||
|
message="Scheduler has received a non-existent response from engine: {response}", response=[response]
|
||||||
|
)
|
||||||
|
|
||||||
with self.mutex:
|
with self.mutex:
|
||||||
for request_id, responses in local_responses.items():
|
for request_id, responses in local_responses.items():
|
||||||
@@ -664,7 +681,11 @@ class GlobalScheduler:
|
|||||||
self.local_response_not_empty.notify_all()
|
self.local_response_not_empty.notify_all()
|
||||||
|
|
||||||
if len(finished_request_ids) > 0:
|
if len(finished_request_ids) > 0:
|
||||||
scheduler_logger.info(f"Scheduler has received some finished responses: {finished_request_ids}")
|
log_request(
|
||||||
|
RequestLogLevel.CONTENT,
|
||||||
|
message="Scheduler has received some finished responses: {request_ids}",
|
||||||
|
request_ids=finished_request_ids,
|
||||||
|
)
|
||||||
|
|
||||||
for response_queue_name, responses in stolen_responses.items():
|
for response_queue_name, responses in stolen_responses.items():
|
||||||
self.client.rpush(response_queue_name, *responses, ttl=self.ttl)
|
self.client.rpush(response_queue_name, *responses, ttl=self.ttl)
|
||||||
@@ -793,7 +814,11 @@ class GlobalScheduler:
|
|||||||
|
|
||||||
if finished:
|
if finished:
|
||||||
del self.local_responses[request_id]
|
del self.local_responses[request_id]
|
||||||
scheduler_logger.info(f"Scheduler has pulled a finished response: {[request_id]}")
|
log_request(
|
||||||
|
RequestLogLevel.CONTENT,
|
||||||
|
message="Scheduler has pulled a finished response: {request_ids}",
|
||||||
|
request_ids=[request_id],
|
||||||
|
)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
@@ -824,7 +849,7 @@ class GlobalScheduler:
|
|||||||
self.client.zrem(self._load_table_name(), self.name)
|
self.client.zrem(self._load_table_name(), self.name)
|
||||||
self.local_responses = dict()
|
self.local_responses = dict()
|
||||||
self.stolen_requests = dict()
|
self.stolen_requests = dict()
|
||||||
scheduler_logger.info("Scheduler has been reset")
|
log_request(RequestLogLevel.LIFECYCLE, message="Scheduler has been reset")
|
||||||
|
|
||||||
def update_config(self, load_shards_num: Optional[int], reallocate: Optional[bool]):
|
def update_config(self, load_shards_num: Optional[int], reallocate: Optional[bool]):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -19,6 +19,7 @@ import time
|
|||||||
from typing import Dict, List, Optional, Tuple
|
from typing import Dict, List, Optional, Tuple
|
||||||
|
|
||||||
from fastdeploy.engine.request import Request, RequestOutput
|
from fastdeploy.engine.request import Request, RequestOutput
|
||||||
|
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
|
||||||
from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse
|
from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse
|
||||||
from fastdeploy.utils import envs, scheduler_logger
|
from fastdeploy.utils import envs, scheduler_logger
|
||||||
|
|
||||||
@@ -116,7 +117,7 @@ class LocalScheduler:
|
|||||||
self.ids = list()
|
self.ids = list()
|
||||||
self.requests = dict()
|
self.requests = dict()
|
||||||
self.responses = dict()
|
self.responses = dict()
|
||||||
scheduler_logger.info("Scheduler has been reset")
|
log_request(RequestLogLevel.LIFECYCLE, message="Scheduler has been reset")
|
||||||
|
|
||||||
def _recycle(self, request_id: Optional[str] = None):
|
def _recycle(self, request_id: Optional[str] = None):
|
||||||
"""
|
"""
|
||||||
@@ -191,7 +192,12 @@ class LocalScheduler:
|
|||||||
|
|
||||||
self.ids += valid_ids
|
self.ids += valid_ids
|
||||||
self.requests_not_empty.notify_all()
|
self.requests_not_empty.notify_all()
|
||||||
scheduler_logger.info(f"Scheduler has enqueued some requests: {valid_ids}")
|
if len(valid_ids) > 0:
|
||||||
|
log_request(
|
||||||
|
RequestLogLevel.CONTENT,
|
||||||
|
message="Scheduler has enqueued some requests: {request_ids}",
|
||||||
|
request_ids=valid_ids,
|
||||||
|
)
|
||||||
|
|
||||||
if len(duplicated_ids) > 0:
|
if len(duplicated_ids) > 0:
|
||||||
scheduler_logger.warning(f"Scheduler has received some duplicated requests: {duplicated_ids}")
|
scheduler_logger.warning(f"Scheduler has received some duplicated requests: {duplicated_ids}")
|
||||||
@@ -300,7 +306,11 @@ class LocalScheduler:
|
|||||||
scheduler_logger.debug(f"Scheduler has put all just-pulled request into the queue: {len(batch_ids)}")
|
scheduler_logger.debug(f"Scheduler has put all just-pulled request into the queue: {len(batch_ids)}")
|
||||||
|
|
||||||
if len(requests) > 0:
|
if len(requests) > 0:
|
||||||
scheduler_logger.info(f"Scheduler has pulled some request: {[request.request_id for request in requests]}")
|
log_request(
|
||||||
|
RequestLogLevel.CONTENT,
|
||||||
|
message="Scheduler has pulled some request: {request_ids}",
|
||||||
|
request_ids=[request.request_id for request in requests],
|
||||||
|
)
|
||||||
|
|
||||||
return requests
|
return requests
|
||||||
|
|
||||||
@@ -316,7 +326,11 @@ class LocalScheduler:
|
|||||||
|
|
||||||
finished_responses = [response.request_id for response in responses if response.finished]
|
finished_responses = [response.request_id for response in responses if response.finished]
|
||||||
if len(finished_responses) > 0:
|
if len(finished_responses) > 0:
|
||||||
scheduler_logger.info(f"Scheduler has received some finished responses: {finished_responses}")
|
log_request(
|
||||||
|
RequestLogLevel.CONTENT,
|
||||||
|
message="Scheduler has received some finished responses: {request_ids}",
|
||||||
|
request_ids=finished_responses,
|
||||||
|
)
|
||||||
|
|
||||||
with self.mutex:
|
with self.mutex:
|
||||||
self.batch_responses_per_step.append([response.raw for response in responses])
|
self.batch_responses_per_step.append([response.raw for response in responses])
|
||||||
@@ -381,7 +395,11 @@ class LocalScheduler:
|
|||||||
|
|
||||||
if finished:
|
if finished:
|
||||||
self._recycle(request_id)
|
self._recycle(request_id)
|
||||||
scheduler_logger.info(f"Scheduler has pulled a finished response: {[request_id]}")
|
log_request(
|
||||||
|
RequestLogLevel.CONTENT,
|
||||||
|
message="Scheduler has pulled a finished response: {request_ids}",
|
||||||
|
request_ids=[request_id],
|
||||||
|
)
|
||||||
|
|
||||||
if results:
|
if results:
|
||||||
scheduler_logger.debug(f"get responses, {results}")
|
scheduler_logger.debug(f"get responses, {results}")
|
||||||
|
|||||||
@@ -33,6 +33,7 @@ from fastdeploy.engine.request import (
|
|||||||
RequestMetrics,
|
RequestMetrics,
|
||||||
RequestOutput,
|
RequestOutput,
|
||||||
)
|
)
|
||||||
|
from fastdeploy.logger.request_logger import log_request_error
|
||||||
from fastdeploy.utils import scheduler_logger as logger
|
from fastdeploy.utils import scheduler_logger as logger
|
||||||
|
|
||||||
|
|
||||||
@@ -240,7 +241,12 @@ class NodeInfo:
|
|||||||
for req_id, pairs in self.reqs.items():
|
for req_id, pairs in self.reqs.items():
|
||||||
load, arrival_time = pairs
|
load, arrival_time = pairs
|
||||||
if cur_time - arrival_time > ttl:
|
if cur_time - arrival_time > ttl:
|
||||||
logger.error(f"InferScheduler Expire Reqs({req_id}), arrival({arrival_time}), ttl({ttl})")
|
log_request_error(
|
||||||
|
message="InferScheduler Expire Reqs({req_id}), arrival({arrival_time}), ttl({ttl})",
|
||||||
|
req_id=req_id,
|
||||||
|
arrival_time=arrival_time,
|
||||||
|
ttl=ttl,
|
||||||
|
)
|
||||||
expire_reqs.add((req_id, load))
|
expire_reqs.add((req_id, load))
|
||||||
for req_id, load in expire_reqs:
|
for req_id, load in expire_reqs:
|
||||||
if req_id in self.reqs:
|
if req_id in self.reqs:
|
||||||
@@ -378,7 +384,7 @@ class ResultReader:
|
|||||||
)
|
)
|
||||||
self.data.appendleft(result)
|
self.data.appendleft(result)
|
||||||
|
|
||||||
logger.error(f"Req({req_id}) is expired({self.ttl})")
|
log_request_error(message="Req({req_id}) is expired({ttl})", req_id=req_id, ttl=self.ttl)
|
||||||
expired_reqs.add(req_id)
|
expired_reqs.add(req_id)
|
||||||
continue
|
continue
|
||||||
keys.append(req_id)
|
keys.append(req_id)
|
||||||
@@ -511,7 +517,11 @@ class APIScheduler:
|
|||||||
except IndexError:
|
except IndexError:
|
||||||
continue
|
continue
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"APIScheduler Schedule req error: {e!s}, {str(traceback.format_exc())}")
|
log_request_error(
|
||||||
|
message="APIScheduler Schedule req error: {error}, {traceback}",
|
||||||
|
error=str(e),
|
||||||
|
traceback=traceback.format_exc(),
|
||||||
|
)
|
||||||
|
|
||||||
def schedule(self, req, pnodes, dnodes, mnodes, group=""):
|
def schedule(self, req, pnodes, dnodes, mnodes, group=""):
|
||||||
"""
|
"""
|
||||||
@@ -841,7 +851,11 @@ class InferScheduler:
|
|||||||
|
|
||||||
req = self.reqs_queue.popleft()
|
req = self.reqs_queue.popleft()
|
||||||
if cur_time - req.metrics.arrival_time > self.ttl:
|
if cur_time - req.metrics.arrival_time > self.ttl:
|
||||||
logger.error(f"req({req.request_id}) is expired({self.ttl}) when InferScheduler Get Requests")
|
log_request_error(
|
||||||
|
message="req({request_id}) is expired({ttl}) when InferScheduler Get Requests",
|
||||||
|
request_id=req.request_id,
|
||||||
|
ttl=self.ttl,
|
||||||
|
)
|
||||||
self.node.finish_req(req.request_id)
|
self.node.finish_req(req.request_id)
|
||||||
continue
|
continue
|
||||||
current_prefill_tokens += req.prompt_token_ids_len
|
current_prefill_tokens += req.prompt_token_ids_len
|
||||||
|
|||||||
+54
-158
@@ -16,15 +16,12 @@
|
|||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
import asyncio
|
||||||
import codecs
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import importlib
|
import importlib
|
||||||
import json
|
import json
|
||||||
import logging
|
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
import random
|
import random
|
||||||
import re
|
|
||||||
import socket
|
import socket
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
@@ -36,8 +33,6 @@ from enum import Enum
|
|||||||
from functools import cache
|
from functools import cache
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
from importlib.metadata import PackageNotFoundError, distribution
|
from importlib.metadata import PackageNotFoundError, distribution
|
||||||
from logging.handlers import BaseRotatingHandler
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Literal, TypeVar, Union
|
from typing import Any, Literal, TypeVar, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -53,7 +48,7 @@ from typing_extensions import TypeIs, assert_never
|
|||||||
|
|
||||||
from fastdeploy import envs
|
from fastdeploy import envs
|
||||||
from fastdeploy.entrypoints.openai.protocol import ErrorInfo, ErrorResponse
|
from fastdeploy.entrypoints.openai.protocol import ErrorInfo, ErrorResponse
|
||||||
from fastdeploy.logger.logger import FastDeployLogger
|
from fastdeploy.logger.request_logger import log_request_error
|
||||||
from fastdeploy.worker.output import PromptLogprobs
|
from fastdeploy.worker.output import PromptLogprobs
|
||||||
|
|
||||||
T = TypeVar("T")
|
T = TypeVar("T")
|
||||||
@@ -193,6 +188,19 @@ class ExceptionHandler:
|
|||||||
loc = first_error.get("loc", [])
|
loc = first_error.get("loc", [])
|
||||||
param = loc[-1] if loc else None
|
param = loc[-1] if loc else None
|
||||||
message = first_error.get("msg", str(exc))
|
message = first_error.get("msg", str(exc))
|
||||||
|
|
||||||
|
# Try to extract request_id from request body
|
||||||
|
request_id = None
|
||||||
|
try:
|
||||||
|
body = await request.body()
|
||||||
|
if body:
|
||||||
|
import json
|
||||||
|
|
||||||
|
body_json = json.loads(body)
|
||||||
|
request_id = body_json.get("request_id")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
err = ErrorResponse(
|
err = ErrorResponse(
|
||||||
error=ErrorInfo(
|
error=ErrorInfo(
|
||||||
message=message,
|
message=message,
|
||||||
@@ -201,7 +209,13 @@ class ExceptionHandler:
|
|||||||
param=param,
|
param=param,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
api_server_logger.error(f"invalid_request_error: {request.url} {param} {message}")
|
log_request_error(
|
||||||
|
message="request[{request_id}] invalid_request_error: {url} {param} {msg}",
|
||||||
|
request_id=request_id or "unknown",
|
||||||
|
url=str(request.url),
|
||||||
|
param=param,
|
||||||
|
msg=message,
|
||||||
|
)
|
||||||
return JSONResponse(content=err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)
|
return JSONResponse(content=err.model_dump(), status_code=HTTPStatus.BAD_REQUEST)
|
||||||
|
|
||||||
|
|
||||||
@@ -224,139 +238,18 @@ class ErrorCode(str, Enum):
|
|||||||
CLIENT_ABORTED = "client_aborted"
|
CLIENT_ABORTED = "client_aborted"
|
||||||
|
|
||||||
|
|
||||||
class ColoredFormatter(logging.Formatter):
|
# Backward compatibility: logger classes have been moved to fastdeploy.logger module
|
||||||
"""自定义日志格式器,用于控制台输出带颜色"""
|
# Use lazy import to avoid circular dependencies
|
||||||
|
def __getattr__(name):
|
||||||
|
if name == "ColoredFormatter":
|
||||||
|
from fastdeploy.logger.formatters import ColoredFormatter
|
||||||
|
|
||||||
COLOR_CODES = {
|
return ColoredFormatter
|
||||||
logging.WARNING: 33, # 黄色
|
elif name == "DailyRotatingFileHandler":
|
||||||
logging.ERROR: 31, # 红色
|
from fastdeploy.logger.handlers import DailyRotatingFileHandler
|
||||||
logging.CRITICAL: 31, # 红色
|
|
||||||
}
|
|
||||||
|
|
||||||
def format(self, record):
|
return DailyRotatingFileHandler
|
||||||
color_code = self.COLOR_CODES.get(record.levelno, 0)
|
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
||||||
prefix = f"\033[{color_code}m"
|
|
||||||
suffix = "\033[0m"
|
|
||||||
message = super().format(record)
|
|
||||||
if color_code:
|
|
||||||
message = f"{prefix}{message}{suffix}"
|
|
||||||
return message
|
|
||||||
|
|
||||||
|
|
||||||
class DailyRotatingFileHandler(BaseRotatingHandler):
|
|
||||||
"""
|
|
||||||
like `logging.TimedRotatingFileHandler`, but this class support multi-process
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
filename,
|
|
||||||
backupCount=0,
|
|
||||||
encoding="utf-8",
|
|
||||||
delay=False,
|
|
||||||
utc=False,
|
|
||||||
**kwargs,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
初始化 RotatingFileHandler 对象。
|
|
||||||
|
|
||||||
Args:
|
|
||||||
filename (str): 日志文件的路径,可以是相对路径或绝对路径。
|
|
||||||
backupCount (int, optional, default=0): 保存的备份文件数量,默认为 0,表示不保存备份文件。
|
|
||||||
encoding (str, optional, default='utf-8'): 编码格式,默认为 'utf-8'。
|
|
||||||
delay (bool, optional, default=False): 是否延迟写入,默认为 False,表示立即写入。
|
|
||||||
utc (bool, optional, default=False): 是否使用 UTC 时区,默认为 False,表示不使用 UTC 时区。
|
|
||||||
kwargs (dict, optional): 其他参数将被传递给 BaseRotatingHandler 类的 init 方法。
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
TypeError: 如果 filename 不是 str 类型。
|
|
||||||
ValueError: 如果 backupCount 小于等于 0。
|
|
||||||
"""
|
|
||||||
self.backup_count = backupCount
|
|
||||||
self.utc = utc
|
|
||||||
self.suffix = "%Y-%m-%d"
|
|
||||||
self.base_log_path = Path(filename)
|
|
||||||
self.base_filename = self.base_log_path.name
|
|
||||||
self.current_filename = self._compute_fn()
|
|
||||||
self.current_log_path = self.base_log_path.with_name(self.current_filename)
|
|
||||||
BaseRotatingHandler.__init__(self, filename, "a", encoding, delay)
|
|
||||||
|
|
||||||
def shouldRollover(self, record):
|
|
||||||
"""
|
|
||||||
check scroll through the log
|
|
||||||
"""
|
|
||||||
if self.current_filename != self._compute_fn():
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def doRollover(self):
|
|
||||||
"""
|
|
||||||
scroll log
|
|
||||||
"""
|
|
||||||
if self.stream:
|
|
||||||
self.stream.close()
|
|
||||||
self.stream = None
|
|
||||||
|
|
||||||
self.current_filename = self._compute_fn()
|
|
||||||
self.current_log_path = self.base_log_path.with_name(self.current_filename)
|
|
||||||
|
|
||||||
if not self.delay:
|
|
||||||
self.stream = self._open()
|
|
||||||
|
|
||||||
self.delete_expired_files()
|
|
||||||
|
|
||||||
def _compute_fn(self):
|
|
||||||
"""
|
|
||||||
Calculate the log file name corresponding current time
|
|
||||||
"""
|
|
||||||
return self.base_filename + "." + time.strftime(self.suffix, time.localtime())
|
|
||||||
|
|
||||||
def _open(self):
|
|
||||||
"""
|
|
||||||
open new log file
|
|
||||||
"""
|
|
||||||
if self.encoding is None:
|
|
||||||
stream = open(str(self.current_log_path), self.mode)
|
|
||||||
else:
|
|
||||||
stream = codecs.open(str(self.current_log_path), self.mode, self.encoding)
|
|
||||||
|
|
||||||
if self.base_log_path.exists():
|
|
||||||
try:
|
|
||||||
if not self.base_log_path.is_symlink() or os.readlink(self.base_log_path) != self.current_filename:
|
|
||||||
os.remove(self.base_log_path)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
try:
|
|
||||||
os.symlink(self.current_filename, str(self.base_log_path))
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
return stream
|
|
||||||
|
|
||||||
def delete_expired_files(self):
|
|
||||||
"""
|
|
||||||
delete expired log files
|
|
||||||
"""
|
|
||||||
if self.backup_count <= 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
file_names = os.listdir(str(self.base_log_path.parent))
|
|
||||||
result = []
|
|
||||||
prefix = self.base_filename + "."
|
|
||||||
plen = len(prefix)
|
|
||||||
for file_name in file_names:
|
|
||||||
if file_name[:plen] == prefix:
|
|
||||||
suffix = file_name[plen:]
|
|
||||||
if re.match(r"^\d{4}-\d{2}-\d{2}(\.\w+)?$", suffix):
|
|
||||||
result.append(file_name)
|
|
||||||
if len(result) < self.backup_count:
|
|
||||||
result = []
|
|
||||||
else:
|
|
||||||
result.sort()
|
|
||||||
result = result[: len(result) - self.backup_count]
|
|
||||||
|
|
||||||
for file_name in result:
|
|
||||||
os.remove(str(self.base_log_path.with_name(file_name)))
|
|
||||||
|
|
||||||
|
|
||||||
def chunk_list(lst: list[T], chunk_size: int):
|
def chunk_list(lst: list[T], chunk_size: int):
|
||||||
@@ -1054,12 +947,6 @@ def parse_quantization(value: str):
|
|||||||
return {"quantization": value}
|
return {"quantization": value}
|
||||||
|
|
||||||
|
|
||||||
# 日志使用全局访问点(兼容原有使用方式)
|
|
||||||
def get_logger(name, file_name=None, without_formater=False, print_to_console=False):
|
|
||||||
"""全局函数包装器,保持向后兼容"""
|
|
||||||
return FastDeployLogger().get_logger(name, file_name, without_formater, print_to_console)
|
|
||||||
|
|
||||||
|
|
||||||
def check_download_links(bos_client, links, timeout=1):
|
def check_download_links(bos_client, links, timeout=1):
|
||||||
"""
|
"""
|
||||||
check bos download links
|
check bos download links
|
||||||
@@ -1156,20 +1043,6 @@ def download_from_bos(bos_client, bos_links, retry: int = 0):
|
|||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
llm_logger = get_logger("fastdeploy", "fastdeploy.log")
|
|
||||||
data_processor_logger = get_logger("data_processor", "data_processor.log")
|
|
||||||
scheduler_logger = get_logger("scheduler", "scheduler.log")
|
|
||||||
api_server_logger = get_logger("api_server", "api_server.log")
|
|
||||||
console_logger = get_logger("console", "console.log", print_to_console=True)
|
|
||||||
spec_logger = get_logger("speculate", "speculate.log")
|
|
||||||
zmq_client_logger = get_logger("zmq_client", "zmq_client.log")
|
|
||||||
trace_logger = FastDeployLogger().get_trace_logger("trace", "trace.log")
|
|
||||||
router_logger = get_logger("router", "router.log")
|
|
||||||
fmq_logger = get_logger("fmq", "fmq.log")
|
|
||||||
obj_logger = get_logger("obj", "obj.log") # debug内存问题
|
|
||||||
register_manager_logger = get_logger("register_manager", "register_manager.log")
|
|
||||||
|
|
||||||
|
|
||||||
def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
|
def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
|
||||||
|
|
||||||
def _parse_type(val: str) -> T:
|
def _parse_type(val: str) -> T:
|
||||||
@@ -1338,3 +1211,26 @@ def all_gather_values(value: int | float | bool, group: paddle.distributed.commu
|
|||||||
paddle.distributed.all_gather(_global, _local, group)
|
paddle.distributed.all_gather(_global, _local, group)
|
||||||
_results = [_type(t.item()) for t in _global]
|
_results = [_type(t.item()) for t in _global]
|
||||||
return _results
|
return _results
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Logger re-export (backward compatibility)
|
||||||
|
# Actual implementation is in fastdeploy.logger module, re-exported here to
|
||||||
|
# support existing import patterns
|
||||||
|
# NOTE: Must be at the end of file to avoid circular imports
|
||||||
|
# =============================================================================
|
||||||
|
from fastdeploy.logger import ( # noqa: F401
|
||||||
|
api_server_logger,
|
||||||
|
console_logger,
|
||||||
|
data_processor_logger,
|
||||||
|
fmq_logger,
|
||||||
|
get_logger,
|
||||||
|
llm_logger,
|
||||||
|
obj_logger,
|
||||||
|
register_manager_logger,
|
||||||
|
router_logger,
|
||||||
|
scheduler_logger,
|
||||||
|
spec_logger,
|
||||||
|
trace_logger,
|
||||||
|
zmq_client_logger,
|
||||||
|
)
|
||||||
|
|||||||
@@ -756,8 +756,8 @@ class TestAsyncLLMOpenAIServingCompletionPreprocess(unittest.IsolatedAsyncioTest
|
|||||||
expected_id = f"{request_id}_{i}" if request_id else f"_{i}"
|
expected_id = f"{request_id}_{i}" if request_id else f"_{i}"
|
||||||
self.assertEqual(ctx.preprocess_requests[i]["request_id"], expected_id)
|
self.assertEqual(ctx.preprocess_requests[i]["request_id"], expected_id)
|
||||||
|
|
||||||
@patch("fastdeploy.entrypoints.openai.v1.serving_completion.api_server_logger")
|
@patch("fastdeploy.entrypoints.openai.v1.serving_completion.log_request_error")
|
||||||
async def test_preprocess_exception_logging(self, mock_logger):
|
async def test_preprocess_exception_logging(self, mock_log_request_error):
|
||||||
"""Test _preprocess logs exceptions properly"""
|
"""Test _preprocess logs exceptions properly"""
|
||||||
# Setup - create a request that will cause an exception
|
# Setup - create a request that will cause an exception
|
||||||
request = CompletionRequest(model="test_model", prompt="dummy", max_tokens=50)
|
request = CompletionRequest(model="test_model", prompt="dummy", max_tokens=50)
|
||||||
@@ -771,11 +771,10 @@ class TestAsyncLLMOpenAIServingCompletionPreprocess(unittest.IsolatedAsyncioTest
|
|||||||
|
|
||||||
# Assert
|
# Assert
|
||||||
self.assertIsInstance(result, ErrorResponse)
|
self.assertIsInstance(result, ErrorResponse)
|
||||||
mock_logger.error.assert_called_once()
|
mock_log_request_error.assert_called_once()
|
||||||
error_log = mock_logger.error.call_args[0][0]
|
error_msg = mock_log_request_error.call_args[1].get("message", "")
|
||||||
self.assertIn("OpenAIServingCompletion create_completion", error_log)
|
self.assertIn("OpenAIServingCompletion create_completion", error_msg)
|
||||||
self.assertIn("ValueError", error_log)
|
self.assertIn("ValueError", error_msg)
|
||||||
self.assertIn("Traceback", error_log) # Changed from "traceback" to "Traceback"
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -206,9 +206,9 @@ class TestEngineClientAbort(unittest.TestCase):
|
|||||||
mock_send_task.assert_called_once_with(expected_data)
|
mock_send_task.assert_called_once_with(expected_data)
|
||||||
|
|
||||||
@patch("fastdeploy.entrypoints.engine_client.envs.FD_ENABLE_REQUEST_DISCONNECT_STOP_INFERENCE", True)
|
@patch("fastdeploy.entrypoints.engine_client.envs.FD_ENABLE_REQUEST_DISCONNECT_STOP_INFERENCE", True)
|
||||||
@patch("fastdeploy.entrypoints.engine_client.api_server_logger")
|
@patch("fastdeploy.entrypoints.engine_client.log_request")
|
||||||
@patch.object(EngineClient, "_send_task")
|
@patch.object(EngineClient, "_send_task")
|
||||||
def test_abort_logging(self, mock_send_task, mock_logger):
|
def test_abort_logging(self, mock_send_task, mock_log_request):
|
||||||
"""Test that abort method logs correctly"""
|
"""Test that abort method logs correctly"""
|
||||||
request_id = "test_request"
|
request_id = "test_request"
|
||||||
n = 2
|
n = 2
|
||||||
@@ -216,18 +216,16 @@ class TestEngineClientAbort(unittest.TestCase):
|
|||||||
# Run the abort method
|
# Run the abort method
|
||||||
self.loop.run_until_complete(self.engine_client.abort(request_id, n=n))
|
self.loop.run_until_complete(self.engine_client.abort(request_id, n=n))
|
||||||
|
|
||||||
# Verify info log was called twice
|
# Verify log_request was called twice
|
||||||
self.assertEqual(mock_logger.info.call_count, 2)
|
self.assertEqual(mock_log_request.call_count, 2)
|
||||||
|
|
||||||
# Verify the first log message (abort start)
|
# Verify the first log message (abort start)
|
||||||
first_call = mock_logger.info.call_args_list[0]
|
first_call = mock_log_request.call_args_list[0]
|
||||||
self.assertEqual(first_call[0][0], "abort request_id:test_request")
|
self.assertIn("abort request_id", first_call[1].get("message", ""))
|
||||||
|
|
||||||
# Verify the second log message (abort completion with request IDs)
|
# Verify the second log message (abort completion with request IDs)
|
||||||
second_call = mock_logger.info.call_args_list[1]
|
second_call = mock_log_request.call_args_list[1]
|
||||||
expected_log_message = "Aborted request(s) %s."
|
self.assertIn("Aborted request(s)", second_call[1].get("message", ""))
|
||||||
self.assertEqual(second_call[0][0], expected_log_message)
|
|
||||||
self.assertEqual(second_call[0][1], "test_request_0,test_request_1")
|
|
||||||
|
|
||||||
@patch("fastdeploy.entrypoints.engine_client.envs.FD_ENABLE_REQUEST_DISCONNECT_STOP_INFERENCE", True)
|
@patch("fastdeploy.entrypoints.engine_client.envs.FD_ENABLE_REQUEST_DISCONNECT_STOP_INFERENCE", True)
|
||||||
@patch("fastdeploy.entrypoints.engine_client.api_server_logger")
|
@patch("fastdeploy.entrypoints.engine_client.api_server_logger")
|
||||||
|
|||||||
@@ -843,11 +843,11 @@ class TestEngineClientValidParameters(unittest.TestCase):
|
|||||||
"""Test valid_parameters adjusts reasoning_max_tokens when needed."""
|
"""Test valid_parameters adjusts reasoning_max_tokens when needed."""
|
||||||
data = {"max_tokens": 50, "reasoning_max_tokens": 100, "request_id": "test-id"} # Larger than max_tokens
|
data = {"max_tokens": 50, "reasoning_max_tokens": 100, "request_id": "test-id"} # Larger than max_tokens
|
||||||
|
|
||||||
with patch("fastdeploy.entrypoints.engine_client.api_server_logger") as mock_logger:
|
with patch("fastdeploy.entrypoints.engine_client.log_request") as mock_log_request:
|
||||||
self.engine_client.valid_parameters(data)
|
self.engine_client.valid_parameters(data)
|
||||||
|
|
||||||
self.assertEqual(data["reasoning_max_tokens"], 50)
|
self.assertEqual(data["reasoning_max_tokens"], 50)
|
||||||
mock_logger.warning.assert_called_once()
|
mock_log_request.assert_called_once()
|
||||||
|
|
||||||
def test_valid_parameters_reasoning_max_tokens_with_reasoning_effort(self):
|
def test_valid_parameters_reasoning_max_tokens_with_reasoning_effort(self):
|
||||||
"""Test valid_parameters when both reasoning_max_tokens and reasoning_effort are set."""
|
"""Test valid_parameters when both reasoning_max_tokens and reasoning_effort are set."""
|
||||||
@@ -858,14 +858,13 @@ class TestEngineClientValidParameters(unittest.TestCase):
|
|||||||
"request_id": "test-id",
|
"request_id": "test-id",
|
||||||
}
|
}
|
||||||
|
|
||||||
with patch("fastdeploy.entrypoints.engine_client.api_server_logger") as mock_logger:
|
with patch("fastdeploy.entrypoints.engine_client.log_request") as mock_log_request:
|
||||||
self.engine_client.valid_parameters(data)
|
self.engine_client.valid_parameters(data)
|
||||||
|
|
||||||
# When reasoning_effort is set, reasoning_max_tokens should be set to None
|
# When reasoning_effort is set, reasoning_max_tokens should be set to None
|
||||||
self.assertIsNone(data["reasoning_max_tokens"])
|
self.assertIsNone(data["reasoning_max_tokens"])
|
||||||
mock_logger.warning.assert_called_once()
|
# log_request is called once: for reasoning_effort conflict (reasoning_max_tokens=50 < max_tokens=100)
|
||||||
warning_call = mock_logger.warning.call_args[0][0]
|
mock_log_request.assert_called_once()
|
||||||
self.assertIn("reasoning_max_tokens and reasoning_effort are both set", warning_call)
|
|
||||||
|
|
||||||
def test_valid_parameters_temperature_zero_adjustment(self):
|
def test_valid_parameters_temperature_zero_adjustment(self):
|
||||||
"""Test valid_parameters adjusts zero temperature."""
|
"""Test valid_parameters adjusts zero temperature."""
|
||||||
@@ -1846,7 +1845,7 @@ def test_add_requests_objgraph_and_error_paths(minimal_engine_client):
|
|||||||
with (
|
with (
|
||||||
patch(
|
patch(
|
||||||
"fastdeploy.entrypoints.engine_client.os.getenv",
|
"fastdeploy.entrypoints.engine_client.os.getenv",
|
||||||
side_effect=lambda k: "1" if k == "FD_ENABLE_OBJGRAPH_DEBUG" else None,
|
side_effect=lambda k, default=None: "1" if k == "FD_ENABLE_OBJGRAPH_DEBUG" else default,
|
||||||
),
|
),
|
||||||
patch("fastdeploy.entrypoints.engine_client._has_objgraph", True),
|
patch("fastdeploy.entrypoints.engine_client._has_objgraph", True),
|
||||||
patch("fastdeploy.entrypoints.engine_client._has_psutil", False),
|
patch("fastdeploy.entrypoints.engine_client._has_psutil", False),
|
||||||
|
|||||||
@@ -148,7 +148,9 @@ def test_receive_output_merges():
|
|||||||
assert first.added is True
|
assert first.added is True
|
||||||
|
|
||||||
|
|
||||||
def test_receive_output_logs_exception(caplog):
|
def test_receive_output_logs_exception():
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
llm = _make_llm(_make_engine())
|
llm = _make_llm(_make_engine())
|
||||||
calls = iter([RuntimeError("boom"), SystemExit()])
|
calls = iter([RuntimeError("boom"), SystemExit()])
|
||||||
|
|
||||||
@@ -159,9 +161,14 @@ def test_receive_output_logs_exception(caplog):
|
|||||||
return nxt
|
return nxt
|
||||||
|
|
||||||
llm.llm_engine._get_generated_result = _get_generated_result
|
llm.llm_engine._get_generated_result = _get_generated_result
|
||||||
|
with patch("fastdeploy.entrypoints.llm.log_request_error") as mock_log:
|
||||||
with pytest.raises(SystemExit):
|
with pytest.raises(SystemExit):
|
||||||
llm._receive_output()
|
llm._receive_output()
|
||||||
assert "Unexpected error happened" in caplog.text
|
mock_log.assert_called_once()
|
||||||
|
call_kwargs = mock_log.call_args[1]
|
||||||
|
assert "Unexpected error happened" in call_kwargs.get(
|
||||||
|
"message", mock_log.call_args[0][0] if mock_log.call_args[0] else ""
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_generate_and_chat_branches():
|
def test_generate_and_chat_branches():
|
||||||
|
|||||||
@@ -264,7 +264,7 @@ class TestSampleFramesQwen(unittest.TestCase):
|
|||||||
|
|
||||||
def test_fps_warning_when_nframes_exceeds_total(self):
|
def test_fps_warning_when_nframes_exceeds_total(self):
|
||||||
"""fps so high that computed num_frames > total → warning logged."""
|
"""fps so high that computed num_frames > total → warning logged."""
|
||||||
with self.assertLogs(level="WARNING"):
|
with self.assertLogs(logger="fastdeploy.main", level="WARNING"):
|
||||||
sample_frames_qwen(2, 4, 100, {"num_of_frame": 10, "fps": 1.0}, fps=100.0)
|
sample_frames_qwen(2, 4, 100, {"num_of_frame": 10, "fps": 1.0}, fps=100.0)
|
||||||
|
|
||||||
def test_divisible_by_4_correction(self):
|
def test_divisible_by_4_correction(self):
|
||||||
|
|||||||
@@ -46,8 +46,12 @@ class LoggerTests(unittest.TestCase):
|
|||||||
shutil.rmtree(self.tmp_dir, ignore_errors=True)
|
shutil.rmtree(self.tmp_dir, ignore_errors=True)
|
||||||
|
|
||||||
def test_unified_logger(self):
|
def test_unified_logger(self):
|
||||||
"""Test _get_unified_logger through instance"""
|
"""Test _get_unified_logger through instance (uses main channel)"""
|
||||||
test_cases = [(None, "fastdeploy"), ("module", "fastdeploy.module"), ("fastdeploy.utils", "fastdeploy.utils")]
|
test_cases = [
|
||||||
|
(None, "fastdeploy.main"),
|
||||||
|
("module", "fastdeploy.main.module"),
|
||||||
|
("fastdeploy.utils", "fastdeploy.utils"), # 已有 fastdeploy. 前缀的保持不变
|
||||||
|
]
|
||||||
|
|
||||||
for name, expected in test_cases:
|
for name, expected in test_cases:
|
||||||
with self.subTest(name=name):
|
with self.subTest(name=name):
|
||||||
|
|||||||
@@ -0,0 +1,91 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from fastdeploy.logger.config import resolve_log_level, resolve_request_logging_defaults
|
||||||
|
|
||||||
|
|
||||||
|
class TestResolveLogLevel(unittest.TestCase):
    """Tests for the resolve_log_level function."""

    def test_explicit_info_level(self):
        """An explicitly requested INFO level is returned as "INFO"."""
        self.assertEqual(resolve_log_level(raw_level="INFO"), "INFO")

    def test_explicit_debug_level(self):
        """An explicitly requested DEBUG level is returned as "DEBUG"."""
        self.assertEqual(resolve_log_level(raw_level="DEBUG"), "DEBUG")

    def test_case_insensitive(self):
        """Level names are matched case-insensitively and returned upper-cased."""
        for lowered, expected in (("info", "INFO"), ("debug", "DEBUG")):
            self.assertEqual(resolve_log_level(raw_level=lowered), expected)

    def test_invalid_level_raises(self):
        """An unknown level name raises ValueError mentioning FD_LOG_LEVEL."""
        with self.assertRaises(ValueError) as raised:
            resolve_log_level(raw_level="INVALID")
        self.assertIn("Unsupported FD_LOG_LEVEL", str(raised.exception))

    def test_debug_enabled_fallback(self):
        """Without an explicit level, debug_enabled=1 resolves to DEBUG."""
        self.assertEqual(resolve_log_level(raw_level=None, debug_enabled=1), "DEBUG")

    def test_debug_disabled_fallback(self):
        """Without an explicit level, debug_enabled=0 resolves to INFO."""
        self.assertEqual(resolve_log_level(raw_level=None, debug_enabled=0), "INFO")

    def test_env_fd_log_level_priority(self):
        """The FD_LOG_LEVEL environment variable wins over FD_DEBUG."""
        with patch.dict("os.environ", {"FD_LOG_LEVEL": "INFO", "FD_DEBUG": "1"}):
            self.assertEqual(resolve_log_level(), "INFO")

    def test_env_fd_debug_fallback(self):
        """FD_DEBUG decides the level when FD_LOG_LEVEL is absent."""
        # clear=True empties the environment so no ambient FD_LOG_LEVEL can leak in.
        with patch.dict("os.environ", {"FD_DEBUG": "1"}, clear=True):
            self.assertEqual(resolve_log_level(), "DEBUG")
|
||||||
|
|
||||||
|
|
||||||
|
class TestResolveRequestLoggingDefaults(unittest.TestCase):
    """Tests for the resolve_request_logging_defaults function."""

    def test_default_values(self):
        """With an empty environment the defaults (1, 2, 2048) are returned."""
        # clear=True removes every variable so only the built-in defaults apply.
        with patch.dict("os.environ", {}, clear=True):
            result = resolve_request_logging_defaults()
            self.assertEqual(result["enabled"], 1)
            self.assertEqual(result["level"], 2)
            self.assertEqual(result["max_len"], 2048)

    def test_custom_values(self):
        """Values supplied through the environment override every default."""
        # Each value deliberately differs from its default (1 / 2 / 2048);
        # a value equal to the default could not reveal an ignored variable.
        custom_env = {
            "FD_LOG_REQUESTS": "0",
            "FD_LOG_REQUESTS_LEVEL": "3",
            "FD_LOG_MAX_LEN": "1024",
        }
        with patch.dict("os.environ", custom_env):
            result = resolve_request_logging_defaults()
            self.assertEqual(result["enabled"], 0)
            self.assertEqual(result["level"], 3)
            self.assertEqual(result["max_len"], 1024)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -0,0 +1,176 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from fastdeploy.logger.request_logger import (
|
||||||
|
RequestLogLevel,
|
||||||
|
_should_log,
|
||||||
|
_truncate,
|
||||||
|
log_request,
|
||||||
|
log_request_error,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRequestLogLevel(unittest.TestCase):
    """Checks the integer values of the RequestLogLevel enum members."""

    def test_level_values(self):
        """LIFECYCLE, STAGES, CONTENT and FULL convert to 0, 1, 2 and 3."""
        expected_values = (
            (RequestLogLevel.LIFECYCLE, 0),
            (RequestLogLevel.STAGES, 1),
            (RequestLogLevel.CONTENT, 2),
            (RequestLogLevel.FULL, 3),
        )
        for level, value in expected_values:
            self.assertEqual(int(level), value)
|
||||||
|
|
||||||
|
|
||||||
|
class TestShouldLog(unittest.TestCase):
    """Tests for the _should_log function."""

    # Dotted path of the envs object that the request logger module looks up.
    ENVS_TARGET = "fastdeploy.logger.request_logger.envs"

    def test_disabled_returns_false(self):
        """FD_LOG_REQUESTS=0 yields False even with the level setting at 3."""
        # Keyword arguments to patch() become attributes of the replacement mock.
        with patch(self.ENVS_TARGET, FD_LOG_REQUESTS=0, FD_LOG_REQUESTS_LEVEL=3):
            self.assertFalse(_should_log(RequestLogLevel.LIFECYCLE))

    def test_level_within_threshold(self):
        """Levels at or below the configured level yield True."""
        with patch(self.ENVS_TARGET, FD_LOG_REQUESTS=1, FD_LOG_REQUESTS_LEVEL=2):
            for level in (
                RequestLogLevel.LIFECYCLE,
                RequestLogLevel.STAGES,
                RequestLogLevel.CONTENT,
            ):
                self.assertTrue(_should_log(level))

    def test_level_above_threshold(self):
        """Levels above the configured level yield False."""
        with patch(self.ENVS_TARGET, FD_LOG_REQUESTS=1, FD_LOG_REQUESTS_LEVEL=1):
            for level in (RequestLogLevel.CONTENT, RequestLogLevel.FULL):
                self.assertFalse(_should_log(level))
|
||||||
|
|
||||||
|
|
||||||
|
class TestTruncate(unittest.TestCase):
    """Tests for the _truncate function."""

    # Dotted path of the envs object that the request logger module looks up.
    ENVS_TARGET = "fastdeploy.logger.request_logger.envs"

    def test_short_text_unchanged(self):
        """Text shorter than FD_LOG_MAX_LEN is returned as-is."""
        with patch(self.ENVS_TARGET, FD_LOG_MAX_LEN=100):
            self.assertEqual(_truncate("short text"), "short text")

    def test_long_text_truncated(self):
        """Text longer than FD_LOG_MAX_LEN is cut to exactly that many characters."""
        with patch(self.ENVS_TARGET, FD_LOG_MAX_LEN=10):
            shortened = _truncate("this is a very long text")
            self.assertEqual(shortened, "this is a ")
            self.assertEqual(len(shortened), 10)

    def test_non_string_converted(self):
        """A non-string argument comes back as its string form."""
        with patch(self.ENVS_TARGET, FD_LOG_MAX_LEN=100):
            self.assertEqual(_truncate(12345), "12345")
|
||||||
|
|
||||||
|
|
||||||
|
class TestLogRequest(unittest.TestCase):
    """Tests for the log_request function."""

    # Dotted path of the envs object that the request logger module looks up.
    ENVS_TARGET = "fastdeploy.logger.request_logger.envs"

    @patch("fastdeploy.logger._request_logger")
    def test_log_when_enabled(self, mock_logger):
        """An enabled LIFECYCLE message is formatted and passed to info()."""
        # Keyword arguments to patch() become attributes of the replacement mock.
        with patch(
            self.ENVS_TARGET,
            FD_LOG_REQUESTS=1,
            FD_LOG_REQUESTS_LEVEL=0,
            FD_LOG_MAX_LEN=2048,
        ):
            log_request(RequestLogLevel.LIFECYCLE, message="test {value}", value="hello")
            mock_logger.info.assert_called_once()
            self.assertEqual(mock_logger.info.call_args[0][0], "test hello")

    @patch("fastdeploy.logger._request_logger")
    def test_no_log_when_disabled(self, mock_logger):
        """Nothing reaches info() when FD_LOG_REQUESTS is 0."""
        with patch(self.ENVS_TARGET, FD_LOG_REQUESTS=0, FD_LOG_REQUESTS_LEVEL=3):
            log_request(RequestLogLevel.LIFECYCLE, message="test {value}", value="hello")
            mock_logger.info.assert_not_called()

    @patch("fastdeploy.logger._request_logger")
    def test_no_log_when_level_too_high(self, mock_logger):
        """Nothing reaches info() when the message level exceeds the configured level."""
        with patch(self.ENVS_TARGET, FD_LOG_REQUESTS=1, FD_LOG_REQUESTS_LEVEL=0):
            log_request(RequestLogLevel.CONTENT, message="test {value}", value="hello")
            mock_logger.info.assert_not_called()

    @patch("fastdeploy.logger._request_logger")
    def test_content_level_truncates_content(self, mock_logger):
        """At CONTENT level the interpolated value is cut to FD_LOG_MAX_LEN characters."""
        with patch(
            self.ENVS_TARGET,
            FD_LOG_REQUESTS=1,
            FD_LOG_REQUESTS_LEVEL=3,
            FD_LOG_MAX_LEN=5,
        ):
            log_request(RequestLogLevel.CONTENT, message="content: {data}", data="very long data")
            mock_logger.info.assert_called_once()
            self.assertEqual(mock_logger.info.call_args[0][0], "content: very ")

    @patch("fastdeploy.logger._request_logger")
    def test_lifecycle_level_no_truncation(self, mock_logger):
        """At LIFECYCLE level the interpolated value is logged in full despite FD_LOG_MAX_LEN."""
        with patch(
            self.ENVS_TARGET,
            FD_LOG_REQUESTS=1,
            FD_LOG_REQUESTS_LEVEL=3,
            FD_LOG_MAX_LEN=5,
        ):
            log_request(RequestLogLevel.LIFECYCLE, message="content: {data}", data="very long data")
            mock_logger.info.assert_called_once()
            self.assertEqual(mock_logger.info.call_args[0][0], "content: very long data")
|
||||||
|
|
||||||
|
|
||||||
|
class TestLogRequestError(unittest.TestCase):
|
||||||
|
"""Test log_request_error function"""
|
||||||
|
|
||||||
|
@patch("fastdeploy.logger._request_logger")
|
||||||
|
def test_error_with_fields(self, mock_logger):
|
||||||
|
"""Error log with fields should format message"""
|
||||||
|
log_request_error(message="request {request_id} failed: {error}", request_id="req-123", error="timeout")
|
||||||
|
mock_logger.error.assert_called_once()
|
||||||
|
call_args = mock_logger.error.call_args[0][0]
|
||||||
|
self.assertEqual(call_args, "request req-123 failed: timeout")
|
||||||
|
|
||||||
|
@patch("fastdeploy.logger._request_logger")
|
||||||
|
def test_error_without_fields(self, mock_logger):
|
||||||
|
"""Error log without fields should not call format"""
|
||||||
|
log_request_error(message="simple error message")
|
||||||
|
mock_logger.error.assert_called_once()
|
||||||
|
call_args = mock_logger.error.call_args[0][0]
|
||||||
|
self.assertEqual(call_args, "simple error message")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -43,58 +43,18 @@ class TestSetupLogging(unittest.TestCase):
|
|||||||
delattr(setup_logging, "_configured")
|
delattr(setup_logging, "_configured")
|
||||||
|
|
||||||
def test_log_dir_created(self):
|
def test_log_dir_created(self):
|
||||||
|
"""Log directory should be created"""
|
||||||
nested = os.path.join(self.temp_dir, "a", "b", "c")
|
nested = os.path.join(self.temp_dir, "a", "b", "c")
|
||||||
setup_logging(log_dir=nested)
|
setup_logging(log_dir=nested)
|
||||||
self.assertTrue(Path(nested).is_dir())
|
self.assertTrue(Path(nested).is_dir())
|
||||||
|
|
||||||
def test_default_config_fallback(self):
|
|
||||||
"""Pass a non-existent config_file to trigger default_config"""
|
|
||||||
fake_cfg = os.path.join(self.temp_dir, "no_such_cfg.json")
|
|
||||||
setup_logging(config_file=fake_cfg)
|
|
||||||
logger = logging.getLogger("fastdeploy")
|
|
||||||
self.assertTrue(logger.handlers)
|
|
||||||
handler_classes = [h.__class__.__name__ for h in logger.handlers]
|
|
||||||
self.assertIn("TimedRotatingFileHandler", handler_classes)
|
|
||||||
|
|
||||||
def test_debug_level_affects_handlers(self):
|
|
||||||
"""FD_DEBUG=1 should force DEBUG level"""
|
|
||||||
with patch("fastdeploy.envs.FD_DEBUG", 1):
|
|
||||||
with patch("logging.config.dictConfig") as mock_cfg:
|
|
||||||
setup_logging()
|
|
||||||
called_config = mock_cfg.call_args[0][0]
|
|
||||||
for handler in called_config["handlers"].values():
|
|
||||||
self.assertIn("formatter", handler)
|
|
||||||
self.assertEqual(called_config["handlers"]["console_stdout"]["level"], "DEBUG")
|
|
||||||
|
|
||||||
@patch("logging.config.dictConfig")
|
|
||||||
def test_custom_config_with_dailyrotating_and_debug(self, mock_dict):
|
|
||||||
custom_cfg = {
|
|
||||||
"version": 1,
|
|
||||||
"handlers": {
|
|
||||||
"daily": {
|
|
||||||
"class": "logging.handlers.DailyRotatingFileHandler",
|
|
||||||
"level": "INFO",
|
|
||||||
"formatter": "plain",
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"loggers": {"fastdeploy": {"handlers": ["daily"], "level": "INFO"}},
|
|
||||||
}
|
|
||||||
cfg_path = Path(self.temp_dir) / "cfg.json"
|
|
||||||
cfg_path.write_text(json.dumps(custom_cfg))
|
|
||||||
|
|
||||||
with patch("fastdeploy.envs.FD_DEBUG", 1):
|
|
||||||
setup_logging(config_file=str(cfg_path))
|
|
||||||
|
|
||||||
config_used = mock_dict.call_args[0][0]
|
|
||||||
self.assertIn("daily", config_used["handlers"])
|
|
||||||
self.assertEqual(config_used["handlers"]["daily"]["level"], "DEBUG")
|
|
||||||
self.assertIn("backupCount", config_used["handlers"]["daily"])
|
|
||||||
|
|
||||||
def test_configure_once(self):
|
def test_configure_once(self):
|
||||||
"""Ensure idempotent setup"""
|
"""Ensure idempotent setup - only configures once"""
|
||||||
l1 = setup_logging()
|
setup_logging()
|
||||||
l2 = setup_logging()
|
self.assertTrue(setup_logging._configured)
|
||||||
self.assertIs(l1, l2)
|
# Second call should not raise
|
||||||
|
setup_logging()
|
||||||
|
self.assertTrue(setup_logging._configured)
|
||||||
|
|
||||||
def test_envs_priority_used_for_log_dir(self):
|
def test_envs_priority_used_for_log_dir(self):
|
||||||
"""When log_dir=None, should use envs.FD_LOG_DIR"""
|
"""When log_dir=None, should use envs.FD_LOG_DIR"""
|
||||||
@@ -102,15 +62,45 @@ class TestSetupLogging(unittest.TestCase):
|
|||||||
setup_logging()
|
setup_logging()
|
||||||
self.assertTrue(os.path.exists(self.temp_dir))
|
self.assertTrue(os.path.exists(self.temp_dir))
|
||||||
|
|
||||||
@patch("logging.StreamHandler.emit")
|
def test_log_dir_stored(self):
|
||||||
def test_console_colored(self, mock_emit):
|
"""setup_logging should store log_dir for later use"""
|
||||||
setup_logging()
|
setup_logging(log_dir=self.temp_dir)
|
||||||
logger = logging.getLogger("fastdeploy")
|
self.assertEqual(setup_logging._log_dir, self.temp_dir)
|
||||||
logger.error("color test")
|
|
||||||
self.assertTrue(mock_emit.called)
|
|
||||||
|
|
||||||
@patch("logging.config.dictConfig")
|
def test_no_config_file_no_dictconfig(self):
|
||||||
def test_backup_count_merging(self, mock_dict):
|
"""When config_file is not provided, dictConfig should not be called"""
|
||||||
|
with patch("logging.config.dictConfig") as mock_dict:
|
||||||
|
setup_logging()
|
||||||
|
self.assertFalse(mock_dict.called)
|
||||||
|
|
||||||
|
def test_config_file_with_dictconfig(self):
|
||||||
|
"""When config_file is provided, dictConfig should be called"""
|
||||||
|
custom_cfg = {
|
||||||
|
"version": 1,
|
||||||
|
"handlers": {},
|
||||||
|
"loggers": {},
|
||||||
|
}
|
||||||
|
cfg_path = Path(self.temp_dir) / "cfg.json"
|
||||||
|
cfg_path.write_text(json.dumps(custom_cfg))
|
||||||
|
|
||||||
|
with patch("logging.config.dictConfig") as mock_dict:
|
||||||
|
setup_logging(config_file=str(cfg_path))
|
||||||
|
self.assertTrue(mock_dict.called)
|
||||||
|
|
||||||
|
def test_config_file_not_exists_uses_default(self):
|
||||||
|
"""When config_file doesn't exist, use default config"""
|
||||||
|
fake_cfg = os.path.join(self.temp_dir, "no_such_cfg.json")
|
||||||
|
|
||||||
|
with patch("logging.config.dictConfig") as mock_dict:
|
||||||
|
setup_logging(config_file=fake_cfg)
|
||||||
|
self.assertTrue(mock_dict.called)
|
||||||
|
# Should use default config
|
||||||
|
config_used = mock_dict.call_args[0][0]
|
||||||
|
self.assertIn("handlers", config_used)
|
||||||
|
self.assertIn("loggers", config_used)
|
||||||
|
|
||||||
|
def test_backup_count_merging(self):
|
||||||
|
"""backupCount should be merged into handler config"""
|
||||||
custom_cfg = {
|
custom_cfg = {
|
||||||
"version": 1,
|
"version": 1,
|
||||||
"handlers": {"daily": {"class": "logging.handlers.DailyRotatingFileHandler", "formatter": "plain"}},
|
"handlers": {"daily": {"class": "logging.handlers.DailyRotatingFileHandler", "formatter": "plain"}},
|
||||||
@@ -119,24 +109,68 @@ class TestSetupLogging(unittest.TestCase):
|
|||||||
cfg_path = Path(self.temp_dir) / "cfg.json"
|
cfg_path = Path(self.temp_dir) / "cfg.json"
|
||||||
cfg_path.write_text(json.dumps(custom_cfg))
|
cfg_path.write_text(json.dumps(custom_cfg))
|
||||||
|
|
||||||
|
with patch("logging.config.dictConfig") as mock_dict:
|
||||||
setup_logging(config_file=str(cfg_path))
|
setup_logging(config_file=str(cfg_path))
|
||||||
|
|
||||||
config_used = mock_dict.call_args[0][0]
|
config_used = mock_dict.call_args[0][0]
|
||||||
self.assertEqual(config_used["handlers"]["daily"]["backupCount"], 3)
|
self.assertEqual(config_used["handlers"]["daily"]["backupCount"], 3)
|
||||||
|
|
||||||
@patch("logging.config.dictConfig")
|
def test_debug_level_affects_handlers(self):
|
||||||
def test_error_logs_use_stderr_handler(self, mock_dict):
|
"""FD_DEBUG=1 should force DEBUG level in handlers"""
|
||||||
"""ERROR级别日志应该使用stderr输出"""
|
custom_cfg = {
|
||||||
setup_logging()
|
"version": 1,
|
||||||
|
"handlers": {"test": {"class": "logging.StreamHandler", "level": "INFO"}},
|
||||||
|
"loggers": {},
|
||||||
|
}
|
||||||
|
cfg_path = Path(self.temp_dir) / "cfg.json"
|
||||||
|
cfg_path.write_text(json.dumps(custom_cfg))
|
||||||
|
|
||||||
|
with patch("fastdeploy.envs.FD_DEBUG", 1):
|
||||||
|
with patch("logging.config.dictConfig") as mock_dict:
|
||||||
|
setup_logging(config_file=str(cfg_path))
|
||||||
config_used = mock_dict.call_args[0][0]
|
config_used = mock_dict.call_args[0][0]
|
||||||
|
self.assertEqual(config_used["handlers"]["test"]["level"], "DEBUG")
|
||||||
|
|
||||||
|
def test_default_config_has_channels(self):
|
||||||
|
"""Default config should have channel loggers configured"""
|
||||||
|
fake_cfg = os.path.join(self.temp_dir, "no_such_cfg.json")
|
||||||
|
|
||||||
|
with patch("logging.config.dictConfig") as mock_dict:
|
||||||
|
setup_logging(config_file=fake_cfg)
|
||||||
|
config_used = mock_dict.call_args[0][0]
|
||||||
|
# Check channel loggers exist
|
||||||
|
self.assertIn("fastdeploy.main", config_used["loggers"])
|
||||||
|
self.assertIn("fastdeploy.request", config_used["loggers"])
|
||||||
|
self.assertIn("fastdeploy.console", config_used["loggers"])
|
||||||
|
|
||||||
|
def test_default_config_has_handlers(self):
|
||||||
|
"""Default config should have file handlers configured"""
|
||||||
|
fake_cfg = os.path.join(self.temp_dir, "no_such_cfg.json")
|
||||||
|
|
||||||
|
with patch("logging.config.dictConfig") as mock_dict:
|
||||||
|
setup_logging(config_file=fake_cfg)
|
||||||
|
config_used = mock_dict.call_args[0][0]
|
||||||
|
# Check handlers exist
|
||||||
|
self.assertIn("main_file", config_used["handlers"])
|
||||||
|
self.assertIn("request_file", config_used["handlers"])
|
||||||
|
self.assertIn("error_file", config_used["handlers"])
|
||||||
self.assertIn("console_stderr", config_used["handlers"])
|
self.assertIn("console_stderr", config_used["handlers"])
|
||||||
|
|
||||||
|
def test_default_config_stderr_handler(self):
|
||||||
|
"""Default config console_stderr should output to stderr"""
|
||||||
|
fake_cfg = os.path.join(self.temp_dir, "no_such_cfg.json")
|
||||||
|
|
||||||
|
with patch("logging.config.dictConfig") as mock_dict:
|
||||||
|
setup_logging(config_file=fake_cfg)
|
||||||
|
config_used = mock_dict.call_args[0][0]
|
||||||
self.assertEqual(config_used["handlers"]["console_stderr"]["stream"], "ext://sys.stderr")
|
self.assertEqual(config_used["handlers"]["console_stderr"]["stream"], "ext://sys.stderr")
|
||||||
self.assertEqual(config_used["handlers"]["console_stderr"]["level"], "ERROR")
|
self.assertEqual(config_used["handlers"]["console_stderr"]["level"], "ERROR")
|
||||||
|
|
||||||
@patch("logging.config.dictConfig")
|
def test_default_config_stdout_filters_below_error(self):
|
||||||
def test_console_stdout_filters_below_error(self, mock_dict):
|
"""Default config console_stdout should filter below ERROR level"""
|
||||||
"""console_stdout应该只输出低于ERROR级别的日志"""
|
fake_cfg = os.path.join(self.temp_dir, "no_such_cfg.json")
|
||||||
setup_logging()
|
|
||||||
|
with patch("logging.config.dictConfig") as mock_dict:
|
||||||
|
setup_logging(config_file=fake_cfg)
|
||||||
config_used = mock_dict.call_args[0][0]
|
config_used = mock_dict.call_args[0][0]
|
||||||
self.assertIn("console_stdout", config_used["handlers"])
|
self.assertIn("console_stdout", config_used["handlers"])
|
||||||
self.assertIn("below_error", config_used["handlers"]["console_stdout"]["filters"])
|
self.assertIn("below_error", config_used["handlers"]["console_stdout"]["filters"])
|
||||||
@@ -145,7 +179,7 @@ class TestSetupLogging(unittest.TestCase):
|
|||||||
|
|
||||||
class TestMaxLevelFilter(unittest.TestCase):
|
class TestMaxLevelFilter(unittest.TestCase):
|
||||||
def test_filter_allows_below_level(self):
|
def test_filter_allows_below_level(self):
|
||||||
"""MaxLevelFilter应该允许低于指定级别的日志通过"""
|
"""MaxLevelFilter should allow logs below specified level"""
|
||||||
filter = MaxLevelFilter("ERROR")
|
filter = MaxLevelFilter("ERROR")
|
||||||
record = logging.LogRecord(
|
record = logging.LogRecord(
|
||||||
name="test", level=logging.INFO, pathname="", lineno=0, msg="test", args=(), exc_info=None
|
name="test", level=logging.INFO, pathname="", lineno=0, msg="test", args=(), exc_info=None
|
||||||
@@ -153,7 +187,7 @@ class TestMaxLevelFilter(unittest.TestCase):
|
|||||||
self.assertTrue(filter.filter(record))
|
self.assertTrue(filter.filter(record))
|
||||||
|
|
||||||
def test_filter_blocks_at_level(self):
|
def test_filter_blocks_at_level(self):
|
||||||
"""MaxLevelFilter应该阻止等于指定级别的日志"""
|
"""MaxLevelFilter should block logs at specified level"""
|
||||||
filter = MaxLevelFilter("ERROR")
|
filter = MaxLevelFilter("ERROR")
|
||||||
record = logging.LogRecord(
|
record = logging.LogRecord(
|
||||||
name="test", level=logging.ERROR, pathname="", lineno=0, msg="test", args=(), exc_info=None
|
name="test", level=logging.ERROR, pathname="", lineno=0, msg="test", args=(), exc_info=None
|
||||||
@@ -161,7 +195,7 @@ class TestMaxLevelFilter(unittest.TestCase):
|
|||||||
self.assertFalse(filter.filter(record))
|
self.assertFalse(filter.filter(record))
|
||||||
|
|
||||||
def test_filter_blocks_above_level(self):
|
def test_filter_blocks_above_level(self):
|
||||||
"""MaxLevelFilter应该阻止高于指定级别的日志"""
|
"""MaxLevelFilter should block logs above specified level"""
|
||||||
filter = MaxLevelFilter("ERROR")
|
filter = MaxLevelFilter("ERROR")
|
||||||
record = logging.LogRecord(
|
record = logging.LogRecord(
|
||||||
name="test", level=logging.CRITICAL, pathname="", lineno=0, msg="test", args=(), exc_info=None
|
name="test", level=logging.CRITICAL, pathname="", lineno=0, msg="test", args=(), exc_info=None
|
||||||
@@ -169,7 +203,7 @@ class TestMaxLevelFilter(unittest.TestCase):
|
|||||||
self.assertFalse(filter.filter(record))
|
self.assertFalse(filter.filter(record))
|
||||||
|
|
||||||
def test_filter_with_numeric_level(self):
|
def test_filter_with_numeric_level(self):
|
||||||
"""MaxLevelFilter应该支持数字级别"""
|
"""MaxLevelFilter should support numeric level"""
|
||||||
filter = MaxLevelFilter(logging.WARNING)
|
filter = MaxLevelFilter(logging.WARNING)
|
||||||
info_record = logging.LogRecord(
|
info_record = logging.LogRecord(
|
||||||
name="test", level=logging.INFO, pathname="", lineno=0, msg="test", args=(), exc_info=None
|
name="test", level=logging.INFO, pathname="", lineno=0, msg="test", args=(), exc_info=None
|
||||||
@@ -181,5 +215,95 @@ class TestMaxLevelFilter(unittest.TestCase):
|
|||||||
self.assertFalse(filter.filter(warning_record))
|
self.assertFalse(filter.filter(warning_record))
|
||||||
|
|
||||||
|
|
||||||
|
class TestChannelLoggers(unittest.TestCase):
|
||||||
|
"""Test channel logger configuration via get_logger"""
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.temp_dir = tempfile.mkdtemp(prefix="logger_channel_test_")
|
||||||
|
if hasattr(setup_logging, "_configured"):
|
||||||
|
delattr(setup_logging, "_configured")
|
||||||
|
# Clear channel configuration cache
|
||||||
|
from fastdeploy.logger.logger import FastDeployLogger
|
||||||
|
|
||||||
|
FastDeployLogger._configured_channels = set()
|
||||||
|
|
||||||
|
self.patches = [
|
||||||
|
patch("fastdeploy.envs.FD_LOG_DIR", self.temp_dir),
|
||||||
|
patch("fastdeploy.envs.FD_DEBUG", 0),
|
||||||
|
patch("fastdeploy.envs.FD_LOG_BACKUP_COUNT", "3"),
|
||||||
|
patch("fastdeploy.envs.FD_LOG_LEVEL", None),
|
||||||
|
]
|
||||||
|
[p.start() for p in self.patches]
|
||||||
|
|
||||||
|
def tearDown(self):
|
||||||
|
[p.stop() for p in self.patches]
|
||||||
|
shutil.rmtree(self.temp_dir, ignore_errors=True)
|
||||||
|
if hasattr(setup_logging, "_configured"):
|
||||||
|
delattr(setup_logging, "_configured")
|
||||||
|
# Clear channel configuration cache
|
||||||
|
from fastdeploy.logger.logger import FastDeployLogger
|
||||||
|
|
||||||
|
FastDeployLogger._configured_channels = set()
|
||||||
|
|
||||||
|
def test_main_channel_has_handlers(self):
|
||||||
|
"""main channel root logger should have handlers"""
|
||||||
|
from fastdeploy.logger import get_logger
|
||||||
|
|
||||||
|
get_logger("test", channel="main")
|
||||||
|
main_channel = logging.getLogger("fastdeploy.main")
|
||||||
|
self.assertTrue(len(main_channel.handlers) > 0)
|
||||||
|
|
||||||
|
def test_request_channel_has_handlers(self):
|
||||||
|
"""request channel root logger should have handlers"""
|
||||||
|
from fastdeploy.logger import get_logger
|
||||||
|
|
||||||
|
get_logger("test", channel="request")
|
||||||
|
request_channel = logging.getLogger("fastdeploy.request")
|
||||||
|
self.assertTrue(len(request_channel.handlers) > 0)
|
||||||
|
|
||||||
|
def test_console_channel_has_stdout_handler(self):
|
||||||
|
"""console channel should have stdout handler"""
|
||||||
|
from fastdeploy.logger import get_logger
|
||||||
|
|
||||||
|
get_logger("test", channel="console")
|
||||||
|
console_channel = logging.getLogger("fastdeploy.console")
|
||||||
|
handler_types = [type(h).__name__ for h in console_channel.handlers]
|
||||||
|
self.assertIn("StreamHandler", handler_types)
|
||||||
|
|
||||||
|
def test_child_logger_propagates_to_channel(self):
|
||||||
|
"""Child loggers should propagate to channel root logger"""
|
||||||
|
from fastdeploy.logger import get_logger
|
||||||
|
|
||||||
|
logger = get_logger("child_test", channel="main")
|
||||||
|
# Child logger should have no direct handlers (propagates to parent)
|
||||||
|
self.assertEqual(len(logger.handlers), 0)
|
||||||
|
self.assertEqual(logger.name, "fastdeploy.main.child_test")
|
||||||
|
|
||||||
|
def test_channel_file_mapping(self):
|
||||||
|
"""Each channel should write to correct log file"""
|
||||||
|
from fastdeploy.logger.logger import FastDeployLogger
|
||||||
|
|
||||||
|
expected_files = {
|
||||||
|
"main": "fastdeploy.log",
|
||||||
|
"request": "request.log",
|
||||||
|
"console": "console.log",
|
||||||
|
}
|
||||||
|
self.assertEqual(FastDeployLogger._channel_files, expected_files)
|
||||||
|
|
||||||
|
def test_multiple_loggers_same_channel(self):
|
||||||
|
"""Multiple loggers on same channel should share channel root handlers"""
|
||||||
|
from fastdeploy.logger import get_logger
|
||||||
|
|
||||||
|
logger1 = get_logger("test1", channel="main")
|
||||||
|
logger2 = get_logger("test2", channel="main")
|
||||||
|
|
||||||
|
main_channel = logging.getLogger("fastdeploy.main")
|
||||||
|
# Both child loggers should have no handlers
|
||||||
|
self.assertEqual(len(logger1.handlers), 0)
|
||||||
|
self.assertEqual(len(logger2.handlers), 0)
|
||||||
|
# Channel root should have handlers
|
||||||
|
self.assertTrue(len(main_channel.handlers) > 0)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ from unittest.mock import MagicMock, Mock, patch
|
|||||||
import paddle
|
import paddle
|
||||||
|
|
||||||
from fastdeploy.engine.request import RequestMetrics, RequestOutput
|
from fastdeploy.engine.request import RequestMetrics, RequestOutput
|
||||||
|
from fastdeploy.logger.request_logger import RequestLogLevel
|
||||||
from fastdeploy.output.token_processor import TokenProcessor
|
from fastdeploy.output.token_processor import TokenProcessor
|
||||||
|
|
||||||
paddle.set_device("cpu")
|
paddle.set_device("cpu")
|
||||||
@@ -295,9 +296,9 @@ class TestTokenProcessorProcessBatchOutput(unittest.TestCase):
|
|||||||
processor.tokens_counter[task_id] = 0
|
processor.tokens_counter[task_id] = 0
|
||||||
processor.tokens_counter[task2.request_id] = 0
|
processor.tokens_counter[task2.request_id] = 0
|
||||||
|
|
||||||
# Mock llm_logger to capture the log message and envs.ENABLE_V1_KVCACHE_SCHEDULER
|
# Mock log_request to capture the log message and envs.ENABLE_V1_KVCACHE_SCHEDULER
|
||||||
with (
|
with (
|
||||||
patch("fastdeploy.output.token_processor.llm_logger") as mock_logger,
|
patch("fastdeploy.output.token_processor.log_request") as mock_log_request,
|
||||||
patch("fastdeploy.output.token_processor.envs.ENABLE_V1_KVCACHE_SCHEDULER", 0),
|
patch("fastdeploy.output.token_processor.envs.ENABLE_V1_KVCACHE_SCHEDULER", 0),
|
||||||
):
|
):
|
||||||
# Call the method
|
# Call the method
|
||||||
@@ -306,7 +307,11 @@ class TestTokenProcessorProcessBatchOutput(unittest.TestCase):
|
|||||||
# In speculative decoding mode, when accept_num[i] == PREEMPTED_TOKEN_ID,
|
# In speculative decoding mode, when accept_num[i] == PREEMPTED_TOKEN_ID,
|
||||||
# the code logs "sync preemption" and continues without triggering abort recycling
|
# the code logs "sync preemption" and continues without triggering abort recycling
|
||||||
# This is the expected behavior for speculative decoding mode
|
# This is the expected behavior for speculative decoding mode
|
||||||
mock_logger.info.assert_any_call(f"sync preemption for request_id {task_id} done.")
|
mock_log_request.assert_any_call(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="sync preemption for request_id {request_id} done.",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
# Verify that _recycle_resources was NOT called for the aborted task
|
# Verify that _recycle_resources was NOT called for the aborted task
|
||||||
# (it may be called for other tasks like test_request_2 if they receive EOS tokens)
|
# (it may be called for other tasks like test_request_2 if they receive EOS tokens)
|
||||||
for call in processor._recycle_resources.call_args_list:
|
for call in processor._recycle_resources.call_args_list:
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ from unittest.mock import MagicMock, patch
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from fastdeploy.engine.request import CompletionOutput, RequestMetrics, RequestOutput
|
from fastdeploy.engine.request import CompletionOutput, RequestMetrics, RequestOutput
|
||||||
|
from fastdeploy.logger.request_logger import RequestLogLevel
|
||||||
from fastdeploy.output.token_processor import TokenProcessor
|
from fastdeploy.output.token_processor import TokenProcessor
|
||||||
from fastdeploy.worker.output import LogprobsLists
|
from fastdeploy.worker.output import LogprobsLists
|
||||||
|
|
||||||
@@ -171,16 +172,20 @@ class TestTokenProcessorLogprobs(unittest.TestCase):
|
|||||||
# Mock _recycle_resources to track if it's called
|
# Mock _recycle_resources to track if it's called
|
||||||
self.processor._recycle_resources = MagicMock()
|
self.processor._recycle_resources = MagicMock()
|
||||||
|
|
||||||
# Mock the llm_logger module and envs.ENABLE_V1_KVCACHE_SCHEDULER
|
# Mock the log_request function and envs.ENABLE_V1_KVCACHE_SCHEDULER
|
||||||
with (
|
with (
|
||||||
patch("fastdeploy.output.token_processor.llm_logger") as mock_logger,
|
patch("fastdeploy.output.token_processor.log_request") as mock_log_request,
|
||||||
patch("fastdeploy.output.token_processor.envs.ENABLE_V1_KVCACHE_SCHEDULER", 1),
|
patch("fastdeploy.output.token_processor.envs.ENABLE_V1_KVCACHE_SCHEDULER", 1),
|
||||||
):
|
):
|
||||||
# Call the method
|
# Call the method
|
||||||
result = self.processor._process_batch_output_use_zmq([stream_data])
|
result = self.processor._process_batch_output_use_zmq([stream_data])
|
||||||
|
|
||||||
# Verify the recycling logic was triggered
|
# Verify the recycling logic was triggered via log_request
|
||||||
mock_logger.info.assert_any_call(f"start to recycle abort request_id {task_id}")
|
mock_log_request.assert_any_call(
|
||||||
|
RequestLogLevel.STAGES,
|
||||||
|
message="start to recycle abort request_id {request_id}",
|
||||||
|
request_id=task_id,
|
||||||
|
)
|
||||||
self.processor.resource_manager.recycle_abort_task.assert_called_once_with(task_id)
|
self.processor.resource_manager.recycle_abort_task.assert_called_once_with(task_id)
|
||||||
self.assertNotIn(task_id, self.processor.resource_manager.to_be_aborted_req_id_set)
|
self.assertNotIn(task_id, self.processor.resource_manager.to_be_aborted_req_id_set)
|
||||||
self.assertEqual(len(result), 0) # Aborted task is skipped (continue)
|
self.assertEqual(len(result), 0) # Aborted task is skipped (continue)
|
||||||
|
|||||||
@@ -47,9 +47,13 @@ sys.modules["fastdeploy.engine.request"] = Mock()
|
|||||||
sys.modules["fastdeploy.scheduler"] = Mock()
|
sys.modules["fastdeploy.scheduler"] = Mock()
|
||||||
sys.modules["fastdeploy.scheduler.local_scheduler"] = Mock()
|
sys.modules["fastdeploy.scheduler.local_scheduler"] = Mock()
|
||||||
sys.modules["fastdeploy.scheduler.data"] = Mock()
|
sys.modules["fastdeploy.scheduler.data"] = Mock()
|
||||||
|
sys.modules["fastdeploy.logger"] = Mock()
|
||||||
|
sys.modules["fastdeploy.logger.request_logger"] = Mock()
|
||||||
|
|
||||||
# Mock the get_logger function
|
# Mock the get_logger function
|
||||||
sys.modules["fastdeploy.utils"].get_logger = Mock(return_value=mock_logger)
|
sys.modules["fastdeploy.utils"].get_logger = Mock(return_value=mock_logger)
|
||||||
|
# Mock the log_request function
|
||||||
|
sys.modules["fastdeploy.logger.request_logger"].log_request = Mock()
|
||||||
|
|
||||||
|
|
||||||
# Mock the Request, RequestOutput, and ScheduledResponse classes
|
# Mock the Request, RequestOutput, and ScheduledResponse classes
|
||||||
@@ -240,8 +244,9 @@ class TestDPLocalScheduler(unittest.TestCase):
|
|||||||
|
|
||||||
def test_put_results_with_finished_requests(self):
|
def test_put_results_with_finished_requests(self):
|
||||||
"""Test putting results with finished requests."""
|
"""Test putting results with finished requests."""
|
||||||
# Reset mock logger
|
# Get the mock log_request function
|
||||||
mock_logger.reset_mock()
|
mock_log_request = sys.modules["fastdeploy.logger.request_logger"].log_request
|
||||||
|
mock_log_request.reset_mock()
|
||||||
|
|
||||||
# Create mock request outputs
|
# Create mock request outputs
|
||||||
results = [
|
results = [
|
||||||
@@ -254,13 +259,12 @@ class TestDPLocalScheduler(unittest.TestCase):
|
|||||||
with patch.object(self.scheduler, "responses_not_empty"):
|
with patch.object(self.scheduler, "responses_not_empty"):
|
||||||
self.scheduler.put_results(results)
|
self.scheduler.put_results(results)
|
||||||
|
|
||||||
# Check that finished requests were logged - the logger should have been called
|
# Check that finished requests were logged via log_request
|
||||||
self.assertTrue(mock_logger.info.called)
|
self.assertTrue(mock_log_request.called)
|
||||||
# Get the actual call arguments to verify the message format
|
# Get the actual call arguments to verify the message format
|
||||||
call_args = mock_logger.info.call_args[0][0]
|
call_kwargs = mock_log_request.call_args[1]
|
||||||
self.assertIn("finished responses", call_args)
|
self.assertIn("finished responses", call_kwargs.get("message", ""))
|
||||||
self.assertIn("req1", call_args)
|
self.assertIn("req1", str(call_kwargs.get("request_ids", [])))
|
||||||
self.assertIn("req3", call_args)
|
|
||||||
|
|
||||||
def test_put_results_with_new_responses(self):
|
def test_put_results_with_new_responses(self):
|
||||||
"""Test putting results with new responses."""
|
"""Test putting results with new responses."""
|
||||||
|
|||||||
@@ -17,10 +17,11 @@ import unittest
|
|||||||
from unittest.mock import Mock, patch # noqa: F401
|
from unittest.mock import Mock, patch # noqa: F401
|
||||||
|
|
||||||
from fastdeploy.engine.request import Request, RequestMetrics, RequestOutput
|
from fastdeploy.engine.request import Request, RequestMetrics, RequestOutput
|
||||||
|
from fastdeploy.logger.request_logger import RequestLogLevel
|
||||||
|
|
||||||
# Real FastDeploy imports
|
# Real FastDeploy imports
|
||||||
from fastdeploy.scheduler.local_scheduler import LocalScheduler
|
from fastdeploy.scheduler.local_scheduler import LocalScheduler
|
||||||
from fastdeploy.utils import envs, scheduler_logger
|
from fastdeploy.utils import envs
|
||||||
|
|
||||||
|
|
||||||
class TestLocalScheduler(unittest.TestCase):
|
class TestLocalScheduler(unittest.TestCase):
|
||||||
@@ -221,9 +222,9 @@ class TestLocalScheduler(unittest.TestCase):
|
|||||||
|
|
||||||
def test_reset_logs_message(self):
|
def test_reset_logs_message(self):
|
||||||
"""Test that reset logs appropriate message."""
|
"""Test that reset logs appropriate message."""
|
||||||
with patch.object(scheduler_logger, "info") as mock_info:
|
with patch("fastdeploy.scheduler.local_scheduler.log_request") as mock_log:
|
||||||
self.scheduler.reset()
|
self.scheduler.reset()
|
||||||
mock_info.assert_called_once_with("Scheduler has been reset")
|
mock_log.assert_called_once_with(RequestLogLevel.LIFECYCLE, message="Scheduler has been reset")
|
||||||
|
|
||||||
def test_put_requests_duplicate_handling(self):
|
def test_put_requests_duplicate_handling(self):
|
||||||
"""Test handling of duplicate request IDs."""
|
"""Test handling of duplicate request IDs."""
|
||||||
@@ -449,12 +450,14 @@ class TestLocalScheduler(unittest.TestCase):
|
|||||||
# Add request first
|
# Add request first
|
||||||
self.scheduler.put_requests([self.mock_request_1])
|
self.scheduler.put_requests([self.mock_request_1])
|
||||||
|
|
||||||
with patch.object(scheduler_logger, "info") as mock_info:
|
with patch("fastdeploy.scheduler.local_scheduler.log_request") as mock_log_request:
|
||||||
mock_output = self._create_test_request_output("req_1", finished=True)
|
mock_output = self._create_test_request_output("req_1", finished=True)
|
||||||
self.scheduler.put_results([mock_output])
|
self.scheduler.put_results([mock_output])
|
||||||
|
|
||||||
# Should log finished response
|
# Should log finished response via log_request
|
||||||
self._assert_log_contains(mock_info, "finished responses")
|
mock_log_request.assert_called_once()
|
||||||
|
call_kwargs = mock_log_request.call_args[1]
|
||||||
|
self.assertIn("finished responses", call_kwargs.get("message", ""))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user