[Feature] implement log channel separation and request log level system (#7190)

* feat: implement log channel separation and request log level system

* fix: log system improvements based on review

* add request_id to error logs, use the RequestLogLevel enum, and consolidate the logger implementation from utils into the logger module
This commit is contained in:
zhouchong
2026-04-16 15:13:05 +08:00
committed by GitHub
parent 29495b2cf1
commit 6e16438a57
52 changed files with 1956 additions and 639 deletions
+15 -4
View File
@@ -21,6 +21,7 @@ import time
from typing import Dict, List, Optional
from fastdeploy.engine.request import Request, RequestOutput
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
from fastdeploy.scheduler.data import ScheduledResponse
from fastdeploy.scheduler.local_scheduler import LocalScheduler
from fastdeploy.utils import get_logger
@@ -58,7 +59,11 @@ class DPLocalScheduler(LocalScheduler):
finished_responses = [response.request_id for response in responses if response.finished]
if len(finished_responses) > 0:
self.scheduler_logger.info(f"Scheduler has received some finished responses: {finished_responses}")
log_request(
RequestLogLevel.CONTENT,
message="Scheduler has received some finished responses: {request_ids}",
request_ids=finished_responses,
)
with self.mutex:
self.batch_responses_per_step.append([response.raw for response in responses])
@@ -146,8 +151,10 @@ class DPLocalScheduler(LocalScheduler):
self.ids_read_cursor += 1
if len(requests) > 0:
self.scheduler_logger.info(
f"Scheduler has pulled some request: {[request.request_id for request in requests]}"
log_request(
RequestLogLevel.CONTENT,
message="Scheduler has pulled some request: {request_ids}",
request_ids=[request.request_id for request in requests],
)
return requests
@@ -195,7 +202,11 @@ class DPScheduler:
def _put_requests_to_local(self):
while True:
request = self.request_queues.get()
self.scheduler_logger.info(f"Receive request from puller, request_id: {request.request_id}")
log_request(
RequestLogLevel.CONTENT,
message="Receive request from puller, request_id: {request_id}",
request_id=request.request_id,
)
self._scheduler.put_requests([request])
def _get_response_from_local(self):
+32 -7
View File
@@ -25,6 +25,11 @@ import crcmod
from redis import ConnectionPool
from fastdeploy.engine.request import Request, RequestOutput
from fastdeploy.logger.request_logger import (
RequestLogLevel,
log_request,
log_request_error,
)
from fastdeploy.scheduler import utils
from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse
from fastdeploy.scheduler.storage import AdaptedRedis
@@ -370,7 +375,11 @@ class GlobalScheduler:
rem_amount=0,
ttl=self.ttl,
)
scheduler_logger.info(f"Scheduler has enqueued some requests: {requests}")
log_request(
RequestLogLevel.CONTENT,
message="Scheduler has enqueued some requests: {request_ids}",
request_ids=[request.request_id for request in requests],
)
if duplicate:
scheduler_logger.warning(
@@ -573,7 +582,9 @@ class GlobalScheduler:
self.stolen_requests[request.request_id] = request
continue
scheduler_logger.error(f"Scheduler has received a duplicate request from others: {request}")
log_request_error(
message="Scheduler has received a duplicate request from others: {request}", request=request
)
requests: List[Request] = [request.raw for request in scheduled_requests]
if len(remaining_request) > 0:
@@ -603,7 +614,11 @@ class GlobalScheduler:
)
if len(requests) > 0:
scheduler_logger.info(f"Scheduler has pulled some request: {[request.request_id for request in requests]}")
log_request(
RequestLogLevel.CONTENT,
message="Scheduler has pulled some request: {request_ids}",
request_ids=[request.request_id for request in requests],
)
return requests
def _put_results_worker(self, tasks: List[Task]):
@@ -649,7 +664,9 @@ class GlobalScheduler:
stolen_responses[response_queue_name].append(response.serialize())
continue
scheduler_logger.error(f"Scheduler has received a non-existent response from engine: {[response]}")
log_request_error(
message="Scheduler has received a non-existent response from engine: {response}", response=[response]
)
with self.mutex:
for request_id, responses in local_responses.items():
@@ -664,7 +681,11 @@ class GlobalScheduler:
self.local_response_not_empty.notify_all()
if len(finished_request_ids) > 0:
scheduler_logger.info(f"Scheduler has received some finished responses: {finished_request_ids}")
log_request(
RequestLogLevel.CONTENT,
message="Scheduler has received some finished responses: {request_ids}",
request_ids=finished_request_ids,
)
for response_queue_name, responses in stolen_responses.items():
self.client.rpush(response_queue_name, *responses, ttl=self.ttl)
@@ -793,7 +814,11 @@ class GlobalScheduler:
if finished:
del self.local_responses[request_id]
scheduler_logger.info(f"Scheduler has pulled a finished response: {[request_id]}")
log_request(
RequestLogLevel.CONTENT,
message="Scheduler has pulled a finished response: {request_ids}",
request_ids=[request_id],
)
return results
def reset(self):
@@ -824,7 +849,7 @@ class GlobalScheduler:
self.client.zrem(self._load_table_name(), self.name)
self.local_responses = dict()
self.stolen_requests = dict()
scheduler_logger.info("Scheduler has been reset")
log_request(RequestLogLevel.LIFECYCLE, message="Scheduler has been reset")
def update_config(self, load_shards_num: Optional[int], reallocate: Optional[bool]):
"""
+23 -5
View File
@@ -19,6 +19,7 @@ import time
from typing import Dict, List, Optional, Tuple
from fastdeploy.engine.request import Request, RequestOutput
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse
from fastdeploy.utils import envs, scheduler_logger
@@ -116,7 +117,7 @@ class LocalScheduler:
self.ids = list()
self.requests = dict()
self.responses = dict()
scheduler_logger.info("Scheduler has been reset")
log_request(RequestLogLevel.LIFECYCLE, message="Scheduler has been reset")
def _recycle(self, request_id: Optional[str] = None):
"""
@@ -191,7 +192,12 @@ class LocalScheduler:
self.ids += valid_ids
self.requests_not_empty.notify_all()
scheduler_logger.info(f"Scheduler has enqueued some requests: {valid_ids}")
if len(valid_ids) > 0:
log_request(
RequestLogLevel.CONTENT,
message="Scheduler has enqueued some requests: {request_ids}",
request_ids=valid_ids,
)
if len(duplicated_ids) > 0:
scheduler_logger.warning(f"Scheduler has received some duplicated requests: {duplicated_ids}")
@@ -300,7 +306,11 @@ class LocalScheduler:
scheduler_logger.debug(f"Scheduler has put all just-pulled request into the queue: {len(batch_ids)}")
if len(requests) > 0:
scheduler_logger.info(f"Scheduler has pulled some request: {[request.request_id for request in requests]}")
log_request(
RequestLogLevel.CONTENT,
message="Scheduler has pulled some request: {request_ids}",
request_ids=[request.request_id for request in requests],
)
return requests
@@ -316,7 +326,11 @@ class LocalScheduler:
finished_responses = [response.request_id for response in responses if response.finished]
if len(finished_responses) > 0:
scheduler_logger.info(f"Scheduler has received some finished responses: {finished_responses}")
log_request(
RequestLogLevel.CONTENT,
message="Scheduler has received some finished responses: {request_ids}",
request_ids=finished_responses,
)
with self.mutex:
self.batch_responses_per_step.append([response.raw for response in responses])
@@ -381,7 +395,11 @@ class LocalScheduler:
if finished:
self._recycle(request_id)
scheduler_logger.info(f"Scheduler has pulled a finished response: {[request_id]}")
log_request(
RequestLogLevel.CONTENT,
message="Scheduler has pulled a finished response: {request_ids}",
request_ids=[request_id],
)
if results:
scheduler_logger.debug(f"get responses, {results}")
+18 -4
View File
@@ -33,6 +33,7 @@ from fastdeploy.engine.request import (
RequestMetrics,
RequestOutput,
)
from fastdeploy.logger.request_logger import log_request_error
from fastdeploy.utils import scheduler_logger as logger
@@ -240,7 +241,12 @@ class NodeInfo:
for req_id, pairs in self.reqs.items():
load, arrival_time = pairs
if cur_time - arrival_time > ttl:
logger.error(f"InferScheduler Expire Reqs({req_id}), arrival({arrival_time}), ttl({ttl})")
log_request_error(
message="InferScheduler Expire Reqs({req_id}), arrival({arrival_time}), ttl({ttl})",
req_id=req_id,
arrival_time=arrival_time,
ttl=ttl,
)
expire_reqs.add((req_id, load))
for req_id, load in expire_reqs:
if req_id in self.reqs:
@@ -378,7 +384,7 @@ class ResultReader:
)
self.data.appendleft(result)
logger.error(f"Req({req_id}) is expired({self.ttl})")
log_request_error(message="Req({req_id}) is expired({ttl})", req_id=req_id, ttl=self.ttl)
expired_reqs.add(req_id)
continue
keys.append(req_id)
@@ -511,7 +517,11 @@ class APIScheduler:
except IndexError:
continue
except Exception as e:
logger.error(f"APIScheduler Schedule req error: {e!s}, {str(traceback.format_exc())}")
log_request_error(
message="APIScheduler Schedule req error: {error}, {traceback}",
error=str(e),
traceback=traceback.format_exc(),
)
def schedule(self, req, pnodes, dnodes, mnodes, group=""):
"""
@@ -841,7 +851,11 @@ class InferScheduler:
req = self.reqs_queue.popleft()
if cur_time - req.metrics.arrival_time > self.ttl:
logger.error(f"req({req.request_id}) is expired({self.ttl}) when InferScheduler Get Requests")
log_request_error(
message="req({request_id}) is expired({ttl}) when InferScheduler Get Requests",
request_id=req.request_id,
ttl=self.ttl,
)
self.node.finish_req(req.request_id)
continue
current_prefill_tokens += req.prompt_token_ids_len