[Feature] implement log channel separation and request log level system (#7190)

* feat: implement log channel separation and request log level system

* fix: log system improvements based on review

* add request_id to error logs, use RequestLogLevel enum, and unify logger implementation from utils to logger module
zhouchong
2026-04-16 15:13:05 +08:00
committed by GitHub
parent 29495b2cf1
commit 6e16438a57
52 changed files with 1956 additions and 639 deletions
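The hunks below migrate call sites from direct api_server_logger f-strings to a request-scoped logging API in fastdeploy.logger.request_logger. For orientation, here is a minimal sketch of that API as inferred from these call sites; the level ordering, channel name, and gating logic are assumptions, not the shipped implementation.

import logging
from enum import IntEnum


class RequestLogLevel(IntEnum):
    # The four tiers used at the call sites in this diff; their relative
    # ordering here is an assumption.
    LIFECYCLE = 1  # request created, first/last streaming chunk
    STAGES = 2     # semaphore acquire/release, preprocessing milestones
    CONTENT = 3    # full response payloads
    FULL = 4       # raw request/output dumps


_request_logger = logging.getLogger("fastdeploy.request")
_enabled_level = RequestLogLevel.STAGES  # assumed to be set from configuration


def log_request(level: RequestLogLevel, message: str, **fields) -> None:
    """Emit `message` (a str.format template filled from `fields`) on the
    request channel when `level` is within the enabled verbosity."""
    if level <= _enabled_level:
        text = message.format(**fields) if fields else message
        _request_logger.info(text)


def log_request_error(message: str, **fields) -> None:
    """Errors bypass the level gate; callers pass request_id as a field so
    every error line carries it."""
    text = message.format(**fields) if fields else message
    _request_logger.error(text)


# Example call, mirroring the call sites below:
log_request(
    RequestLogLevel.LIFECYCLE,
    message="create chat completion request: {request_id}",
    request_id="chatcmpl-1234",
)

Passing a template plus keyword fields, rather than a pre-built f-string, lets the logger attach fields such as request_id uniformly and defer formatting until a level is actually enabled.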
+19 -4
@@ -73,6 +73,11 @@ from fastdeploy.entrypoints.openai.v1.serving_completion import (
OpenAIServingCompletion as OpenAIServingCompletionV1,
)
from fastdeploy.envs import environment_variables
from fastdeploy.logger.request_logger import (
RequestLogLevel,
log_request,
log_request_error,
)
from fastdeploy.metrics.metrics import get_filtered_metrics
from fastdeploy.utils import (
ExceptionHandler,
@@ -325,7 +330,11 @@ async def connection_manager():
await asyncio.wait_for(connection_semaphore.acquire(), timeout=0.001)
yield
except asyncio.TimeoutError:
api_server_logger.info(f"Reach max request concurrency, semaphore status: {connection_semaphore.status()}")
log_request(
level=RequestLogLevel.LIFECYCLE,
message="Reach max request concurrency, semaphore status: {status}",
status=connection_semaphore.status(),
)
raise HTTPException(
status_code=429, detail=f"Too many requests, current max concurrency is {args.max_concurrency}"
)
@@ -545,7 +554,7 @@ async def create_chat_completion(request: ChatCompletionRequest, req: Request):
"""
Create a chat completion for the provided prompt and parameters.
"""
api_server_logger.debug(f"Chat Received request: {request.model_dump_json()}")
log_request(RequestLogLevel.FULL, message="Chat Received request: {request}", request=request.model_dump_json())
if envs.TRACES_ENABLE:
if req.headers:
headers = dict(req.headers)
@@ -572,7 +581,11 @@ async def create_chat_completion(request: ChatCompletionRequest, req: Request):
return StreamingResponse(content=wrapped_generator(), media_type="text/event-stream")
except HTTPException as e:
api_server_logger.error(f"Error in chat completion: {str(e)}")
log_request_error(
message="request[{request_id}] Error in chat completion: {error}",
request_id=getattr(request, "request_id", None),
error=str(e),
)
return JSONResponse(status_code=e.status_code, content={"detail": e.detail})
@@ -582,7 +595,9 @@ async def create_completion(request: CompletionRequest, req: Request):
"""
Create a completion for the provided prompt and parameters.
"""
api_server_logger.info(f"Completion Received request: {request.model_dump_json()}")
log_request(
RequestLogLevel.FULL, message="Completion Received request: {request}", request=request.model_dump_json()
)
if envs.TRACES_ENABLE:
if req.headers:
headers = dict(req.headers)
+2 -3
@@ -31,6 +31,7 @@ from pydantic import (
)
from fastdeploy.engine.pooling_params import PoolingParams
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
from fastdeploy.worker.output import PromptLogprobs, SpeculateMetrics
@@ -758,9 +759,7 @@ class ChatCompletionRequest(BaseModel):
), "The parameter `raw_request` is not supported now, please use completion api instead."
for key, value in self.metadata.items():
req_dict[key] = value
from fastdeploy.utils import api_server_logger
api_server_logger.warning("The parameter metadata is obsolete.")
log_request(RequestLogLevel.STAGES, message="The parameter metadata is obsolete.")
for key, value in self.dict().items():
if value is not None:
req_dict[key] = value
+59 -18
@@ -44,6 +44,11 @@ from fastdeploy.entrypoints.openai.protocol import (
UsageInfo,
)
from fastdeploy.entrypoints.openai.response_processors import ChatResponseProcessor
from fastdeploy.logger.request_logger import (
RequestLogLevel,
log_request,
log_request_error,
)
from fastdeploy.metrics.metrics import main_process_metrics
from fastdeploy.trace.constants import LoggingEventName
from fastdeploy.trace.trace_logger import print as trace_print
@@ -112,14 +117,16 @@ class OpenAIServingChat:
err_msg = (
f"Only master node can accept completion request, please send request to master node: {self.master_ip}"
)
api_server_logger.error(err_msg)
log_request_error(message="request[{request_id}] {error}", request_id=request.request_id, error=err_msg)
return ErrorResponse(error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR))
if self.models:
is_supported, request.model = self.models.is_supported_model(request.model)
if not is_supported:
err_msg = f"Unsupported model: [{request.model}], support [{', '.join([x.name for x in self.models.model_paths])}] or default"
api_server_logger.error(err_msg)
log_request_error(
message="request[{request_id}] {error}", request_id=request.request_id, error=err_msg
)
return ErrorResponse(
error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT)
)
@@ -129,7 +136,11 @@ class OpenAIServingChat:
await self.engine_client.semaphore.acquire()
else:
await asyncio.wait_for(self.engine_client.semaphore.acquire(), timeout=self.max_waiting_time)
api_server_logger.info(f"current {self.engine_client.semaphore.status()}")
log_request(
RequestLogLevel.STAGES,
message="semaphore status: {status}",
status=self.engine_client.semaphore.status(),
)
if request.request_id is not None:
request_id = request.request_id
@@ -141,7 +152,11 @@ class OpenAIServingChat:
request_id = f"chatcmpl-{uuid.uuid4()}"
tracing.trace_req_start(rid=request_id, trace_content=request.trace_context, role="FastDeploy")
del request.trace_context
api_server_logger.info(f"create chat completion request: {request_id}")
log_request(
level=RequestLogLevel.LIFECYCLE,
message="create chat completion request: {request_id}",
request_id=request_id,
)
prompt_tokens = None
max_tokens = None
try:
@@ -156,14 +171,19 @@ class OpenAIServingChat:
if isinstance(prompt_token_ids, np.ndarray):
prompt_token_ids = prompt_token_ids.tolist()
except ParameterError as e:
api_server_logger.error(f"request[{request_id}] generator error: {str(e)}, {e.message}")
log_request_error(
message="request[{request_id}] generator error: {error}, {error_message}",
request_id=request_id,
error=str(e),
error_message=e.message,
)
self.engine_client.semaphore.release()
return ErrorResponse(
error=ErrorInfo(message=str(e.message), type=ErrorType.INVALID_REQUEST_ERROR, param=e.param)
)
except Exception as e:
error_msg = f"request[{request_id}] generator error: {str(e)}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
self.engine_client.semaphore.release()
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INVALID_REQUEST_ERROR))
@@ -178,12 +198,12 @@ class OpenAIServingChat:
)
except Exception as e:
error_msg = f"request[{request_id}]full generator error: {str(e)}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
except asyncio.CancelledError as e:
await self.engine_client.abort(f"{request_id}_0", 1 if request.n is None else request.n)
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
return ErrorResponse(
error=ErrorInfo(message=error_msg, type=ErrorType.INVALID_REQUEST_ERROR, code=ErrorCode.CLIENT_ABORTED)
)
@@ -192,13 +212,13 @@ class OpenAIServingChat:
f"request[{request_id}] waiting error: {str(e)}, {str(traceback.format_exc())}, "
f"max waiting time: {self.max_waiting_time}"
)
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
return ErrorResponse(
error=ErrorInfo(message=error_msg, type=ErrorType.TIMEOUT_ERROR, code=ErrorCode.TIMEOUT)
)
def _create_streaming_error_response(self, message: str) -> str:
api_server_logger.error(message)
log_request_error(message=message)
error_response = ErrorResponse(error=ErrorInfo(message=message, type=ErrorType.INTERNAL_ERROR))
return error_response.model_dump_json()
@@ -249,7 +269,9 @@ class OpenAIServingChat:
choices=[],
model=model_name,
)
api_server_logger.info(f"create chat completion request: {request_id}")
log_request(
RequestLogLevel.LIFECYCLE, message="create chat completion request: {request_id}", request_id=request_id
)
try:
dealer, response_queue = await self.engine_client.connection_manager.get_connection(
@@ -372,7 +394,12 @@ class OpenAIServingChat:
completion_tokens_details=CompletionTokenUsageInfo(reasoning_tokens=0),
)
yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n"
api_server_logger.info(f"Chat Streaming response send_idx 0: {chunk.model_dump_json()}")
log_request(
level=RequestLogLevel.LIFECYCLE,
message="Chat Streaming response send_idx 0: request_id={request_id}, completion_tokens={completion_tokens}",
request_id=request_id,
completion_tokens=0,
)
first_iteration = False
output = res["outputs"]
@@ -497,7 +524,14 @@ class OpenAIServingChat:
chunk.choices = choices
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
if res["finished"]:
api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
log_request(
level=RequestLogLevel.LIFECYCLE,
message="Chat Streaming response last send: request_id={request_id}, finish_reason={finish_reason}, completion_tokens={completion_tokens}, logprobs={logprobs}",
request_id=request_id,
finish_reason=choice.finish_reason,
completion_tokens=previous_num_tokens[idx],
logprobs=logprobs_res,
)
choices = []
if include_usage:
@@ -525,7 +559,7 @@ class OpenAIServingChat:
except asyncio.CancelledError as e:
await self.engine_client.abort(f"{request_id}_0", 1 if request.n is None else request.n)
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
except Exception as e:
error_data = self._create_streaming_error_response(
f"request[{request_id}] generate stream error: {str(e)}, {str(traceback.format_exc())}"
@@ -536,7 +570,12 @@ class OpenAIServingChat:
tracing.trace_req_finish(request_id)
await self.engine_client.connection_manager.cleanup_request(request_id)
self.engine_client.semaphore.release()
api_server_logger.info(f"release {request_id} {self.engine_client.semaphore.status()}")
log_request(
level=RequestLogLevel.STAGES,
message="release {request_id} {status}",
request_id=request_id,
status=self.engine_client.semaphore.status(),
)
yield "data: [DONE]\n\n"
async def chat_completion_full_generator(
@@ -704,7 +743,9 @@ class OpenAIServingChat:
tracing.trace_req_finish(request_id)
await self.engine_client.connection_manager.cleanup_request(request_id)
self.engine_client.semaphore.release()
api_server_logger.info(f"release {self.engine_client.semaphore.status()}")
log_request(
RequestLogLevel.STAGES, message="release {status}", status=self.engine_client.semaphore.status()
)
num_prompt_tokens = len(prompt_token_ids)
num_generated_tokens = sum(previous_num_tokens)
@@ -731,7 +772,7 @@ class OpenAIServingChat:
choices=choices,
usage=usage,
)
api_server_logger.info(f"Chat response: {res.model_dump_json()}")
log_request(RequestLogLevel.CONTENT, message="Chat response: {response}", response=res.model_dump_json())
return res
async def _create_chat_completion_choice(
@@ -904,7 +945,7 @@ class OpenAIServingChat:
except Exception as e:
error_msg = f"Error in _build_logprobs_response: {e}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
return None
def _build_prompt_logprobs(
@@ -41,6 +41,11 @@ from fastdeploy.entrypoints.openai.protocol import (
PromptTokenUsageInfo,
UsageInfo,
)
from fastdeploy.logger.request_logger import (
RequestLogLevel,
log_request,
log_request_error,
)
from fastdeploy.trace.constants import LoggingEventName
from fastdeploy.trace.trace_logger import print as trace_print
from fastdeploy.utils import (
@@ -91,13 +96,15 @@ class OpenAIServingCompletion:
err_msg = (
f"Only master node can accept completion request, please send request to master node: {self.master_ip}"
)
api_server_logger.error(err_msg)
log_request_error(message="request[{request_id}] {error}", request_id=request.request_id, error=err_msg)
return ErrorResponse(error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR))
if self.models:
is_supported, request.model = self.models.is_supported_model(request.model)
if not is_supported:
err_msg = f"Unsupported model: [{request.model}], support [{', '.join([x.name for x in self.models.model_paths])}] or default"
api_server_logger.error(err_msg)
log_request_error(
message="request[{request_id}] {error}", request_id=request.request_id, error=err_msg
)
return ErrorResponse(
error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR, code=ErrorCode.MODEL_NOT_SUPPORT)
)
@@ -110,7 +117,11 @@ class OpenAIServingCompletion:
request_id = f"cmpl-{request.user}-{uuid.uuid4()}"
else:
request_id = f"cmpl-{uuid.uuid4()}"
api_server_logger.info(f"Initialize request {request_id}: {request}")
log_request(
level=RequestLogLevel.LIFECYCLE,
message="Initialize request {request_id}",
request_id=request_id,
)
tracing.trace_req_start(rid=request_id, trace_content=request.trace_context, role="FastDeploy")
del request.trace_context
request_prompt_ids = None
@@ -147,15 +158,20 @@ class OpenAIServingCompletion:
else:
raise ValueError("Prompt type must be one of: str, list[str], list[int], list[list[int]]")
except Exception as e:
error_msg = f"OpenAIServingCompletion create_completion: {e}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
error_msg = f"request[{request_id}] create_completion: {e}, {str(traceback.format_exc())}"
log_request_error(message=error_msg)
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
if request_prompt_ids is not None:
request_prompts = request_prompt_ids
num_choices = len(request_prompts) * (1 if request.n is None else request.n)
api_server_logger.info(f"Start preprocessing request: req_id={request_id}), num_choices={num_choices}")
log_request(
RequestLogLevel.STAGES,
message="Start preprocessing request: req_id={request_id}), num_choices={num_choices}",
request_id=request_id,
num_choices=num_choices,
)
prompt_batched_token_ids = []
prompt_tokens_list = []
max_tokens_list = []
@@ -169,7 +185,7 @@ class OpenAIServingCompletion:
f"OpenAIServingCompletion waiting error: {e}, {str(traceback.format_exc())}, "
f"max waiting time: {self.max_waiting_time}"
)
api_server_logger.error(error_msg)
log_request_error(message="request[{request_id}] {error}", request_id=request_id, error=error_msg)
return ErrorResponse(
error=ErrorInfo(message=error_msg, code=ErrorCode.TIMEOUT, type=ErrorType.TIMEOUT_ERROR)
)
@@ -188,14 +204,19 @@ class OpenAIServingCompletion:
max_tokens_list.append(current_req_dict.get("max_tokens"))
del current_req_dict
except ParameterError as e:
api_server_logger.error(f"OpenAIServingCompletion format error: {e}, {e.message}")
log_request_error(
message="request[{request_id}] format error: {error}, {error_message}",
request_id=request_id,
error=e,
error_message=e.message,
)
self.engine_client.semaphore.release()
return ErrorResponse(
error=ErrorInfo(code="400", message=str(e.message), type="invalid_request", param=e.param)
)
except Exception as e:
error_msg = f"OpenAIServingCompletion format error: {e}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
error_msg = f"request[{request_id}] format error: {e}, {str(traceback.format_exc())}"
log_request_error(message=error_msg)
self.engine_client.semaphore.release()
return ErrorResponse(
error=ErrorInfo(message=str(e), code=ErrorCode.INVALID_VALUE, type=ErrorType.INVALID_REQUEST_ERROR)
@@ -226,20 +247,20 @@ class OpenAIServingCompletion:
)
except Exception as e:
error_msg = (
f"OpenAIServingCompletion completion_full_generator error: {e}, {str(traceback.format_exc())}"
f"request[{request_id}] completion_full_generator error: {e}, {str(traceback.format_exc())}"
)
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
except asyncio.CancelledError as e:
await self.engine_client.abort(f"{request_id}_0", num_choices)
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
return ErrorResponse(
error=ErrorInfo(message=error_msg, type=ErrorType.INVALID_REQUEST_ERROR, code=ErrorCode.CLIENT_ABORTED)
)
except Exception as e:
error_msg = f"OpenAIServingCompletion create_completion error: {e}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
error_msg = f"request[{request_id}] create_completion error: {e}, {str(traceback.format_exc())}"
log_request_error(message=error_msg)
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
async def completion_full_generator(
@@ -368,10 +389,16 @@ class OpenAIServingCompletion:
prompt_tokens_list=prompt_tokens_list,
max_tokens_list=max_tokens_list,
)
api_server_logger.info(f"Completion response: {res.model_dump_json()}")
log_request(
RequestLogLevel.CONTENT, message="Completion response: {response}", response=res.model_dump_json()
)
return res
except Exception as e:
api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
log_request_error(
message="request[{request_id}] error in completion_full_generator: {error}",
request_id=request_id,
error=e,
)
finally:
trace_print(LoggingEventName.POSTPROCESSING_END, request_id, getattr(request, "user", ""))
tracing.trace_req_finish(request_id)
@@ -514,8 +541,11 @@ class OpenAIServingCompletion:
],
)
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
api_server_logger.info(
f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}"
log_request(
level=RequestLogLevel.LIFECYCLE,
message="Completion Streaming response send_idx 0: request_id={request_id}, completion_tokens={completion_tokens}",
request_id=request_id,
completion_tokens=0,
)
first_iteration[idx] = False
@@ -592,8 +622,11 @@ class OpenAIServingCompletion:
if send_idx == 0 and not request.return_token_ids:
chunk_temp = chunk
chunk_temp.choices = choices
api_server_logger.info(
f"Completion Streaming response send_idx 0: {chunk_temp.model_dump_json()}"
log_request(
level=RequestLogLevel.LIFECYCLE,
message="Completion Streaming response send_idx 0: request_id={request_id}, completion_tokens={completion_tokens}",
request_id=request_id,
completion_tokens=output_tokens[idx],
)
del chunk_temp
@@ -646,14 +679,26 @@ class OpenAIServingCompletion:
metrics=res["metrics"] if request.collect_metrics else None,
)
yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
log_request(
level=RequestLogLevel.LIFECYCLE,
message="Completion Streaming response last send: request_id={request_id}, finish_reason={finish_reason}, completion_tokens={completion_tokens}, logprobs={logprobs}",
request_id=request_id,
finish_reason=chunk.choices[-1].finish_reason if chunk.choices else None,
completion_tokens=output_tokens[idx],
logprobs=logprobs_res,
)
except asyncio.CancelledError as e:
await self.engine_client.abort(f"{request_id}_0", num_choices)
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
except Exception as e:
api_server_logger.error(f"Error in completion_stream_generator: {e}, {str(traceback.format_exc())}")
log_request_error(
message="request[{request_id}] error in completion_stream_generator: {error}, {traceback}",
request_id=request_id,
error=e,
traceback=traceback.format_exc(),
)
yield f"data: {ErrorResponse(error=ErrorInfo(message=str(e), code='400', type=ErrorType.INTERNAL_ERROR)).model_dump_json(exclude_unset=True)}\n\n"
finally:
trace_print(LoggingEventName.POSTPROCESSING_END, request_id, getattr(request, "user", ""))
@@ -887,7 +932,11 @@ class OpenAIServingCompletion:
)
except Exception as e:
api_server_logger.error(f"Error in _build_logprobs_response: {str(e)}, {str(traceback.format_exc())}")
log_request_error(
message="Error in _build_logprobs_response: {error}, {traceback}",
error=str(e),
traceback=traceback.format_exc(),
)
return None
def _build_prompt_logprobs(
@@ -35,7 +35,7 @@ from fastdeploy.entrypoints.openai.protocol import (
UsageInfo,
)
from fastdeploy.entrypoints.openai.serving_engine import ServeContext, ZmqOpenAIServing
from fastdeploy.utils import api_server_logger
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
def _get_embedding(
@@ -140,7 +140,12 @@ class OpenAIServingEmbedding(ZmqOpenAIServing):
@override
def _build_response(self, ctx: ServeContext, request_output: dict):
"""Generate final embedding response"""
api_server_logger.info(f"[{ctx.request_id}] Embedding RequestOutput received:{request_output}")
log_request(
level=RequestLogLevel.CONTENT,
message="[{request_id}] Embedding RequestOutput received:{request_output}",
request_id=ctx.request_id,
request_output=request_output,
)
base = PoolingRequestOutput.from_dict(request_output)
embedding_res = EmbeddingRequestOutput.from_base(base)
+50 -11
@@ -33,6 +33,11 @@ from fastdeploy.entrypoints.openai.protocol import (
InvalidParameterException,
)
from fastdeploy.envs import FD_SUPPORT_MAX_CONNECTIONS
from fastdeploy.logger.request_logger import (
RequestLogLevel,
log_request,
log_request_error,
)
from fastdeploy.utils import ErrorCode, ErrorType, StatefulSemaphore, api_server_logger
RequestT = TypeVar("RequestT")
@@ -96,13 +101,18 @@ class OpenAIServing(ABC, Generic[RequestT]):
is_supported, adjusted_name = self.models.is_supported_model(model_name)
if not is_supported:
err_msg = f"Unsupported model: [{model_name}]"
api_server_logger.error(err_msg)
log_request_error(message=err_msg)
return is_supported, adjusted_name
async def _acquire_semaphore(self, request_id: str) -> bool:
"""Acquire engine client semaphore with timeout"""
try:
api_server_logger.info(f"Acquire request:{request_id} status:{self._get_semaphore().status()}")
log_request(
level=RequestLogLevel.STAGES,
message="Acquire request:{request_id} status:{status}",
request_id=request_id,
status=self._get_semaphore().status(),
)
if self.max_waiting_time < 0:
await self._get_semaphore().acquire()
else:
@@ -111,13 +121,18 @@ class OpenAIServing(ABC, Generic[RequestT]):
except asyncio.TimeoutError:
self._release_semaphore(request_id)
error_msg = f"Request waiting timeout, request:{request_id} max waiting time:{self.max_waiting_time}"
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
return False
def _release_semaphore(self, request_id: str) -> None:
"""Release engine client semaphore"""
self._get_semaphore().release()
api_server_logger.info(f"Release request:{request_id} status:{self._get_semaphore().status()}")
log_request(
level=RequestLogLevel.STAGES,
message="Release request:{request_id} status:{status}",
request_id=request_id,
status=self._get_semaphore().status(),
)
def _create_error_response(
self,
@@ -128,7 +143,7 @@ class OpenAIServing(ABC, Generic[RequestT]):
) -> ErrorResponse:
"""Create standardized error response"""
traceback.print_exc()
api_server_logger.error(message)
log_request_error(message=message)
return ErrorResponse(error=ErrorInfo(message=message, type=error_type, code=code, param=param))
def _generate_request_id(self, request: RequestT) -> str:
@@ -193,7 +208,12 @@ class OpenAIServing(ABC, Generic[RequestT]):
request_id = self._generate_request_id(request)
ctx.request_id = request_id
api_server_logger.info(f"Initialize request {request_id}: {request}")
log_request(
level=RequestLogLevel.LIFECYCLE,
message="Initialize request {request_id}: {request}",
request_id=request_id,
request=request,
)
# Step 2: Semaphore acquisition
if not await self._acquire_semaphore(request_id):
@@ -252,7 +272,12 @@ class ZmqOpenAIServing(OpenAIServing):
request_dicts = self._request_to_batch_dicts(ctx)
ctx.preprocess_requests = request_dicts
for request_dict in request_dicts:
api_server_logger.info(f"batch add request_id: {request_dict['request_id']}, request: {request_dict}")
log_request(
level=RequestLogLevel.CONTENT,
message="batch add request_id: {request_id}, request: {request}",
request_id=request_dict["request_id"],
request=request_dict,
)
await self.engine_client.format_and_add_data(request_dict)
def _process_chat_template_kwargs(self, request_dict):
@@ -283,7 +308,11 @@ class ZmqOpenAIServing(OpenAIServing):
while num_choices > 0:
request_output_dicts = await asyncio.wait_for(request_output_queue.get(), timeout=60)
for request_output_dict in request_output_dicts:
api_server_logger.debug(f"Received RequestOutput: {request_output_dict}")
log_request(
level=RequestLogLevel.FULL,
message="Received RequestOutput: {request_output}",
request_output=request_output_dict,
)
if request_output_dict["finished"] is True:
num_choices -= 1
yield request_output_dict
@@ -301,7 +330,12 @@ class ZmqOpenAIServing(OpenAIServing):
async def _acquire_semaphore(self, request_id: str) -> bool:
"""Acquire engine client semaphore with timeout"""
try:
api_server_logger.info(f"Acquire request:{request_id} status:{self._get_semaphore().status()}")
log_request(
level=RequestLogLevel.STAGES,
message="Acquire request:{request_id} status:{status}",
request_id=request_id,
status=self._get_semaphore().status(),
)
if self.max_waiting_time < 0:
await self._get_semaphore().acquire()
else:
@@ -310,14 +344,19 @@ class ZmqOpenAIServing(OpenAIServing):
except asyncio.TimeoutError:
self._release_semaphore(request_id)
error_msg = f"Request waiting timeout, request:{request_id} max waiting time:{self.max_waiting_time}"
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
return False
@override
def _release_semaphore(self, request_id: str) -> None:
"""Release engine client semaphore"""
self._get_semaphore().release()
api_server_logger.info(f"Release request:{request_id} status:{self._get_semaphore().status()}")
log_request(
level=RequestLogLevel.STAGES,
message="Release request:{request_id} status:{status}",
request_id=request_id,
status=self._get_semaphore().status(),
)
@override
def _check_master(self) -> bool:
@@ -24,7 +24,8 @@ from fastdeploy.entrypoints.openai.protocol import (
ModelList,
ModelPermission,
)
from fastdeploy.utils import ErrorType, api_server_logger, get_host_ip
from fastdeploy.logger.request_logger import log_request_error
from fastdeploy.utils import ErrorType, get_host_ip
@dataclass
@@ -86,7 +87,7 @@ class OpenAIServingModels:
err_msg = (
f"Only master node can accept models request, please send request to master node: {self.master_ip}"
)
api_server_logger.error(err_msg)
log_request_error(message=err_msg)
return ErrorResponse(error=ErrorInfo(message=err_msg, type=ErrorType.INTERNAL_ERROR))
model_infos = [
ModelInfo(
@@ -28,7 +28,7 @@ from fastdeploy.entrypoints.openai.protocol import (
UsageInfo,
)
from fastdeploy.entrypoints.openai.serving_engine import ServeContext, ZmqOpenAIServing
from fastdeploy.utils import api_server_logger
from fastdeploy.logger.request_logger import RequestLogLevel, log_request
class OpenAIServingReward(ZmqOpenAIServing):
@@ -77,7 +77,7 @@ class OpenAIServingReward(ZmqOpenAIServing):
response: ChatRewardResponse = None
generators: AsyncGenerator[ChatRewardResponse, None] = self.handle(ctx)
async for r in generators:
api_server_logger.info(f"engine pooling result:{r}")
log_request(RequestLogLevel.CONTENT, message="engine pooling result: {result}", result=r)
r.data[0].index = idx
idx += 1
if response is None or isinstance(r, ErrorResponse):
@@ -93,7 +93,12 @@ class OpenAIServingReward(ZmqOpenAIServing):
@override
def _build_response(self, ctx: ServeContext, request_output: dict):
"""Generate final reward response"""
api_server_logger.info(f"[{ctx.request_id}] Reward RequestOutput received:{request_output}")
log_request(
level=RequestLogLevel.CONTENT,
message="Reward RequestOutput received: request_id={request_id}, output={request_output}",
request_id=ctx.request_id,
request_output=request_output,
)
base = PoolingRequestOutput.from_dict(request_output)
reward_res = RewardRequestOutput.from_base(base)
@@ -41,6 +41,7 @@ from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
ToolParserManager,
)
from fastdeploy.logger.request_logger import log_request_error
from fastdeploy.utils import data_processor_logger
@@ -184,7 +185,9 @@ class Ernie45VLThinkingToolParser(ToolParser):
continue
if not function_call_arr:
data_processor_logger.error("No valid tool calls found")
log_request_error(
message="request[{request_id}] No valid tool calls found", request_id=request.request_id
)
return ExtractedToolCallInformation(tools_called=False, content=model_output)
tool_calls = []
@@ -226,7 +229,11 @@ class Ernie45VLThinkingToolParser(ToolParser):
)
except Exception as e:
data_processor_logger.error(f"Error in extracting tool call from response: {str(e)}")
log_request_error(
message="request[{request_id}] Error in extracting tool call from response: {error}",
request_id=request.request_id,
error=str(e),
)
return ExtractedToolCallInformation(tools_called=False, tool_calls=None, content=model_output)
def extract_tool_calls_streaming(
@@ -343,7 +350,11 @@ class Ernie45VLThinkingToolParser(ToolParser):
)
return delta
except Exception as e:
data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}")
log_request_error(
message="request[{request_id}] Error in streaming tool call extraction: {error}",
request_id=request.get("request_id"),
error=str(e),
)
return None
if "</tool_call>" in self.buffer:
end_pos = self.buffer.find("</tool_call>")
@@ -354,5 +365,9 @@ class Ernie45VLThinkingToolParser(ToolParser):
return delta
except Exception as e:
data_processor_logger.error(f"Error in streaming tool call extraction: {str(e)}")
log_request_error(
message="request[{request_id}] Error in streaming tool call extraction: {error}",
request_id=request.get("request_id"),
error=str(e),
)
return None
@@ -41,6 +41,7 @@ from fastdeploy.entrypoints.openai.tool_parsers.abstract_tool_parser import (
ToolParser,
ToolParserManager,
)
from fastdeploy.logger.request_logger import log_request_error
from fastdeploy.utils import data_processor_logger as logger
@@ -254,8 +255,11 @@ class ErnieX1ToolParser(ToolParser):
logger.debug("Skipping text %s - no arguments", delta_text)
delta = None
elif cur_arguments is None and prev_arguments is not None:
logger.error("should be impossible to have arguments reset " "mid-call. skipping streaming anything.")
elif not cur_arguments and prev_arguments:
log_request_error(
message="request[{request_id}] should be impossible to have arguments reset mid-call. skipping streaming anything.",
request_id=request.request_id,
)
delta = None
elif cur_arguments is not None and prev_arguments is None:
@@ -45,8 +45,12 @@ from fastdeploy.entrypoints.openai.v1.serving_base import (
ServingResponseContext,
)
from fastdeploy.input.tokenizer_client import AsyncTokenizerClient, ImageDecodeRequest
from fastdeploy.logger.request_logger import (
RequestLogLevel,
log_request,
log_request_error,
)
from fastdeploy.metrics.metrics import main_process_metrics
from fastdeploy.utils import api_server_logger
from fastdeploy.worker.output import LogprobsLists
@@ -178,7 +182,7 @@ class OpenAIServingChat(OpenAiServingBase):
except Exception as e:
error_msg = f"Error in _build_logprobs_response: {e}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
return None
@override
@@ -302,7 +306,14 @@ class OpenAIServingChat(OpenAiServingBase):
max_tokens = request.max_completion_tokens or request.max_tokens
choice_completion_tokens = response_ctx.choice_completion_tokens_dict[output.index]
choice.finish_reason = self._calc_finish_reason(request_output, max_tokens, choice_completion_tokens)
api_server_logger.info(f"Chat Streaming response last send: {chunk.model_dump_json()}")
log_request(
level=RequestLogLevel.LIFECYCLE,
message="Chat Streaming response last send: request_id={request_id}, finish_reason={finish_reason}, completion_tokens={completion_tokens}, logprobs={logprobs}",
request_id=request_id,
finish_reason=choice.finish_reason,
completion_tokens=choice_completion_tokens,
logprobs=choice.logprobs,
)
yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
if request_output.finished and response_ctx.remain_choices == 0:
@@ -339,7 +350,11 @@ class OpenAIServingChat(OpenAiServingBase):
res = ChatCompletionResponse(
id=ctx.request_id, model=request.model, choices=choices, created=ctx.created_time, usage=response_ctx.usage
)
api_server_logger.info(f"Chat response: {res.model_dump_json()}")
log_request(
level=RequestLogLevel.CONTENT,
message="Chat response: {response}",
response=res.model_dump_json(),
)
return res
async def _create_chat_completion_choice(
@@ -38,7 +38,12 @@ from fastdeploy.entrypoints.openai.v1.serving_base import (
ServeContext,
ServingResponseContext,
)
from fastdeploy.utils import ErrorType, api_server_logger
from fastdeploy.logger.request_logger import (
RequestLogLevel,
log_request,
log_request_error,
)
from fastdeploy.utils import ErrorType
from fastdeploy.worker.output import LogprobsLists
@@ -94,7 +99,7 @@ class OpenAIServingCompletion(OpenAiServingBase):
raise ValueError("Prompt type must be one of: str, list[str], list[int], list[list[int]]")
except Exception as e:
error_msg = f"OpenAIServingCompletion create_completion: {e}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
log_request_error(message=error_msg)
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
if request_prompt_ids is not None:
@@ -199,7 +204,11 @@ class OpenAIServingCompletion(OpenAiServingBase):
)
except Exception as e:
api_server_logger.error(f"Error in _build_logprobs_response: {str(e)}, {str(traceback.format_exc())}")
log_request_error(
message="Error in _build_logprobs_response: {error}, {traceback}",
error=str(e),
traceback=traceback.format_exc(),
)
return None
async def _build_stream_response(
@@ -271,9 +280,21 @@ class OpenAIServingCompletion(OpenAiServingBase):
choice.finish_reason = self._calc_finish_reason(
request_output, request.max_tokens, choice_completion_tokens
)
api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
log_request(
level=RequestLogLevel.LIFECYCLE,
message="Completion Streaming response last send: request_id={request_id}, finish_reason={finish_reason}, completion_tokens={completion_tokens}, logprobs={logprobs}",
request_id=request_id,
finish_reason=choice.finish_reason,
completion_tokens=choice_completion_tokens,
logprobs=choice.logprobs,
)
if send_idx == 0 and not request.return_token_ids:
api_server_logger.info(f"Completion Streaming response send_idx 0: {chunk.model_dump_json()}")
log_request(
level=RequestLogLevel.LIFECYCLE,
message="Completion Streaming response send_idx 0: request_id={request_id}, completion_tokens={completion_tokens}",
request_id=request_id,
completion_tokens=response_ctx.choice_completion_tokens_dict[output.index],
)
yield f"data: {chunk.model_dump_json()}\n\n"
if request_output.finished and response_ctx.remain_choices == 0:
if include_usage:
@@ -287,7 +308,12 @@ class OpenAIServingCompletion(OpenAiServingBase):
yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
yield "data: [DONE]\n\n"
except Exception as e:
api_server_logger.error(f"Error in completion_stream_generator: {e}, {str(traceback.format_exc())}")
log_request_error(
message="request[{request_id}] Error in completion_stream_generator: {error}, {traceback}",
request_id=request_id,
error=e,
traceback=traceback.format_exc(),
)
yield f"data: {ErrorResponse(error=ErrorInfo(message=str(e), code='400', type=ErrorType.INTERNAL_ERROR)).model_dump_json(exclude_unset=True)}\n\n"
async def _build_full_response(
@@ -321,10 +347,18 @@ class OpenAIServingCompletion(OpenAiServingBase):
choices=choices,
usage=response_ctx.usage,
)
api_server_logger.info(f"Completion response: {res.model_dump_json()}")
log_request(
level=RequestLogLevel.FULL,
message="Completion response: {response}",
response=res.model_dump_json(),
)
return res
except Exception as e:
api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True)
log_request_error(
message="request[{request_id}] Error in completion_full_generator: {error}",
request_id=ctx.request_id,
error=e,
)
return self._create_error_response(str(e))
def build_completion_choice(