mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[RL] add pause, update_weights, resume interface for async RL (#6052)
* support dynamic run_control_request through zmq from apiserver to common_engine * support pause/resume/is_paused/update_weights in apiserver->common_engine by common run_control_method * change /is_paused from HTTP POST method to GET method * add pause, resume, is_paused implementation * support engine <==> worker communication (request & response) * support sync weights through RDMA from checkpoint_transfer * support specified version, rsync_config in update_weights rpc call * add pause, update_weights, resume interface for async RL * bug fix: update_weights support using default arguments * fix typo * typo fix * typo fix * typo fix * add unit tests for control request/response, localscheduler.get_inflight_requests, resource_manager_v1.preempted_all * add "rsync" to LoadConfig.load_strategy Literal type hints Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * typo fix * typo fix * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * check version/rsync params * add error log when version.txt does not exist Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * raise specified ValueError when parameters check failed Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * tp barrier after run_control_method * encode 'engine_worker_queue_port' to unique name of worker2engine fmq queue * typo fix * typo fix --------- Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -14,6 +14,7 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import inspect
|
||||
import os
|
||||
import re
|
||||
@@ -29,7 +30,12 @@ from filelock import FileLock
|
||||
import fastdeploy.metrics.trace as tracing
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.config import FDConfig
|
||||
from fastdeploy.engine.request import Request, RequestStatus
|
||||
from fastdeploy.engine.request import (
|
||||
ControlRequest,
|
||||
ControlResponse,
|
||||
Request,
|
||||
RequestStatus,
|
||||
)
|
||||
from fastdeploy.entrypoints.openai.utils import DealerConnectionManager
|
||||
from fastdeploy.envs import FD_SUPPORT_MAX_CONNECTIONS
|
||||
from fastdeploy.eplb.utils import RedundantExpertWorkload
|
||||
@@ -526,6 +532,23 @@ class EngineClient:
|
||||
|
||||
return True, ""
|
||||
|
||||
async def run_control_method(self, request: ControlRequest, timeout: float = 600):
    """
    Send a control request to the engine and wait for its response.

    The request is serialized with ``request.to_dict()`` and sent through
    ``self.zmq_client``. The reply is then awaited on the response queue that
    ``self.connection_manager`` returns for ``request.request_id``.

    Args:
        request: The control request to execute.
        timeout: Maximum time in seconds to wait for the response. Defaults
            to 600, which is expected to be enough for most control cases.

    Returns:
        ControlResponse: The response built from the first element of the
        queued reply, or a ``ControlResponse(request_id, 500, ...)`` carrying
        a timeout message when no reply arrives within ``timeout`` seconds.
    """
    api_server_logger.info(f"Start Run Control Method: {request}")
    self.zmq_client.send_json(request.to_dict())

    request_id = request.request_id
    dealer, response_queue = await self.connection_manager.get_connection(request_id)
    # Write the request id to the dealer so the reply for this request can be
    # delivered to response_queue (presumably a routing registration -- confirm
    # against DealerConnectionManager).
    dealer.write([b"", request_id.encode("utf-8")])

    # Only the wait itself can time out, so keep the try body minimal.
    try:
        raw_response = await asyncio.wait_for(response_queue.get(), timeout=timeout)
    except asyncio.TimeoutError:
        error_response = ControlResponse(request_id, 500, "Timeout waiting for control method response")
        api_server_logger.error(f"Error Run Control Method: {error_response}")
        return error_response

    response = ControlResponse.from_dict(raw_response[0])
    api_server_logger.info(f"End Run Control Method: {response}")
    return response
|
||||
|
||||
def is_workers_alive(self):
|
||||
"""
|
||||
Check the health of the model server by checking whether all workers are alive.
|
||||
|
||||
Reference in New Issue
Block a user