[Feature] Support stopping the inference for the corresponding request in the online service after a disconnection request. (#5320)

* request disconnect

* request disconnect

* fix bug

* fix bug--amend

---------

Co-authored-by: root <root@yq01-sys-rpm26xc1knu.yq01.baidu.com>
This commit is contained in:
qwes5s5
2026-01-16 11:46:13 +08:00
committed by GitHub
parent 8f035101ad
commit b2a2e11551
25 changed files with 1339 additions and 63 deletions
@@ -230,7 +230,13 @@ class OpenAIServingCompletion:
)
api_server_logger.error(error_msg)
return ErrorResponse(error=ErrorInfo(message=error_msg, type=ErrorType.INTERNAL_ERROR))
except asyncio.CancelledError as e:
await self.engine_client.abort(f"{request_id}_0", num_choices)
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
return ErrorResponse(
error=ErrorInfo(message=error_msg, type=ErrorType.INVALID_REQUEST_ERROR, code=ErrorCode.CLIENT_ABORTED)
)
except Exception as e:
error_msg = f"OpenAIServingCompletion create_completion error: {e}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
@@ -285,7 +291,9 @@ class OpenAIServingCompletion:
except asyncio.TimeoutError:
current_waiting_time += 10
if current_waiting_time == 300:
status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT)
status, msg = self.engine_client.check_health(
time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT
)
if not status:
raise ValueError(f"Engine is not healthy: {msg}")
else:
@@ -455,7 +463,9 @@ class OpenAIServingCompletion:
except asyncio.TimeoutError:
current_waiting_time += 10
if current_waiting_time == 300:
status, msg = self.engine_client.check_health(time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT)
status, msg = self.engine_client.check_health(
time_interval_threashold=envs.FD_WORKER_ALIVE_TIMEOUT
)
if not status:
raise ValueError(f"Engine is not healthy: {msg}")
else:
@@ -634,6 +644,10 @@ class OpenAIServingCompletion:
yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n"
api_server_logger.info(f"Completion Streaming response last send: {chunk.model_dump_json()}")
except asyncio.CancelledError as e:
await self.engine_client.abort(f"{request_id}_0", num_choices)
error_msg = f"request[{request_id}_0] client disconnected: {str(e)}, {str(traceback.format_exc())}"
api_server_logger.error(error_msg)
except Exception as e:
api_server_logger.error(f"Error in completion_stream_generator: {e}, {str(traceback.format_exc())}")
yield f"data: {ErrorResponse(error=ErrorInfo(message=str(e), code='400', type=ErrorType.INTERNAL_ERROR)).model_dump_json(exclude_unset=True)}\n\n"