[RL] add pause, update_weights, resume interface for async RL (#6052)

* support dynamic run_control_request through zmq from apiserver to common_engine * support pause/resume/is_paused/update_weights in apiserver->common_engine by common run_control_method * change /is_puased from HTTP POST method to GET method * add pause、resume、is_paused implementation * support engine <==> worker communication(request&response) * support sync weights through RDMA from checkpoint_transfer * support specified version, rsync_config in update_weights rpc call * add pause, update_weights, resume interface for async RL * bug fix: update_weights support using default arguments * fix typo * typo fix * typo fix * typo fix * add unitest for control request/response, localscheduler.get_inflight_requests, resource_manager_v1.preempted_all * add "rsync" to LoadConfig.load_strategy Literal type hints Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * typo fix * typo fix * Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * check version/rsync params * add error log when version.txt not exists Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * raise specified ValueError when paramters check failed Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * tp barrier after run_control_method * encode 'engine_worker_queue_port' to unique name of worker2engine fmq queue * typo fix * typo fix --------- Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-04-23 00:17:25 +08:00 · 2026-01-23 10:18:07 +08:00
parent 96b2cf2c20
commit b7c5daa316
18 changed files with 1170 additions and 16 deletions
@@ -1225,7 +1225,8 @@ class LoadConfig:
    ):
        self.load_choices: Union[str, LoadChoices] = LoadChoices.DEFAULT.value
        self.dynamic_load_weight: bool = False
-        self.load_strategy: Optional[Literal["ipc", "ipc_snapshot", "meta", "normal"]] = "normal"
+        self.load_strategy: Optional[Literal["ipc", "ipc_snapshot", "meta", "normal", "rsync"]] = "normal"
        self.rsync_config: Optional[Dict[str, Any]] = None
        for key, value in args.items():
            if hasattr(self, key):
                setattr(self, key, value)
@@ -195,6 +195,10 @@ class EngineArgs:
    """
    dynamic load weight strategy
    """
    rsync_config: Optional[Dict[str, Any]] = None
    """
    rsync weights config info
    """
    quantization: Optional[Dict[str, Any]] = None
    guided_decoding_backend: str = "off"
    """
@@ -812,6 +816,12 @@ class EngineArgs:
            default=EngineArgs.load_strategy,
            help="Flag to dynamic load strategy.",
        )
        model_group.add_argument(
            "--rsync-config",
            type=json.loads,
            default=EngineArgs.rsync_config,
            help="Rsync weights config",
        )
        model_group.add_argument(
            "--engine-worker-queue-port",
            type=lambda s: s.split(",") if s else None,
@@ -16,6 +16,7 @@
 from __future__ import annotations
 import asyncio
 import copy
 import json
 import multiprocessing
@@ -38,7 +39,14 @@ import zmq
 from tqdm import tqdm
 import fastdeploy.metrics.trace as tracing
-from fastdeploy.engine.request import Request, RequestOutput, RequestStatus, RequestType
+from fastdeploy.engine.request import (
    ControlRequest,
    ControlResponse,
    Request,
    RequestOutput,
    RequestStatus,
    RequestType,
 )
 from fastdeploy.engine.resource_manager import ResourceManager
 from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
 from fastdeploy.eplb.utils import init_eplb_signals
@@ -50,6 +58,7 @@ from fastdeploy.inter_communicator import (
    ZmqIpcServer,
    ZmqTcpServer,
 )
 from fastdeploy.inter_communicator.fmq import FMQ
 from fastdeploy.metrics.metrics import main_process_metrics
 from fastdeploy.model_executor.guided_decoding import schema_checker
 from fastdeploy.plugins.token_processor import load_token_processor_plugins
@@ -89,6 +98,18 @@ class EngineService:
        else:
            self.llm_logger = llm_logger
        self.is_paused = False  # pause request generation
        self._pause_cond = threading.Condition()
        self._ctrl_worker_output_queues = []
        tp_size = cfg.parallel_config.tensor_parallel_size
        dp_index = cfg.parallel_config.local_data_parallel_id
        for rank in range(tp_size):
            engine_worker_queue_port = self.cfg.parallel_config.local_engine_worker_queue_port
            name = f"ctrl_w2e_rank{rank+tp_size*dp_index}_{engine_worker_queue_port}"
            self.llm_logger.info(f"Init Worker Control Output Queue: {name}(consumer)")
            self._ctrl_worker_output_queues.append(FMQ().queue(name, "consumer"))
        self.scheduler = cfg.scheduler_config.scheduler()
        self.enable_decode_cache_task = envs.FD_ENABLE_CACHE_TASK == "1"
@@ -758,6 +779,8 @@ class EngineService:
        def _fetch_request():
            try:
                with self._pause_cond:
                    self._pause_cond.wait_for(lambda: not self.is_paused)
                nonlocal is_fetching
                num_prefill_batch = min(
                    int(self.resource_manager.available_batch()),
@@ -922,6 +945,8 @@ class EngineService:
                is_fetching = False
        while self.running:
            with self._pause_cond:
                self._pause_cond.wait_for(lambda: not self.is_paused)
            try:
                if self.engine_worker_queue.exist_tasks():
                    time.sleep(0.001)
@@ -1065,6 +1090,17 @@ class EngineService:
                        self.recv_request_server = ZmqIpcServer(name=self.api_server_pid, mode=zmq.PULL)
                    continue
                if ControlRequest.is_control_request(data):
                    try:  # todo: run control request async, do not block request generation
                        control_req = ControlRequest.from_dict(data)
                        self.run_control_method(control_req)
                    except Exception as e:
                        self.llm_logger.error(
                            f"Failed to process control request {data.get('request_id')}: "
                            f"{e}, {traceback.format_exc()}"
                        )
                    continue
                request, insert_task = data, []
                results: List[Tuple[str, Optional[str]]] = list()
                if data:
@@ -1096,6 +1132,13 @@ class EngineService:
                        trace_print(LoggingEventName.REQUEST_SCHEDULE_START, data["request_id"], data.get("user", ""))
                        trace_print(LoggingEventName.REQUEST_QUEUE_START, data["request_id"], data.get("user", ""))
                        self.llm_logger.debug(f"Receive request from api server: {request}")
                        if self.is_paused:
                            self.llm_logger.warning(f"Engine is paused, drop request: {request}")
                            self._send_error_response(
                                request.request_id, "Request is aborted since LLM Engine is paused."
                            )
                            continue
                    except Exception as e:
                        self.llm_logger.error(f"Receive request error: {e}, {traceback.format_exc()!s}")
                        err_msg = str(e)
@@ -1135,6 +1178,200 @@ class EngineService:
                    f"traceback={traceback.format_exc()}"
                )
    def run_control_method(self, control_req: ControlRequest):
        """
        Execute control method, process control request and return response.
        This method is responsible for handling control requests, calling the corresponding
        handler function based on the method name in the request. If the method doesn't exist
        or is not callable, it returns an error response; otherwise executes the method and
        returns a success response.
        Args:
            control_req (ControlRequest): Control request object containing request ID,
                method name and parameters.
        Returns:
            None: No return value, sends ControlResponse through send_response_server.
        """
        method = control_req.get_method()
        request_id = control_req.request_id
        try:
            self.llm_logger.info(f"START run control method {request_id}: {method}")
            handler_name = f"_control_{method}"
            handler = getattr(self, handler_name, None)
            if handler is None or not callable(handler):
                error_result = ControlResponse(request_id, 400, f"unknown control method:{method}")
                self.llm_logger.error(str(error_result))
                self.send_response_server.send_response(request_id, [error_result])
                return
            result = handler(control_req)
            self.llm_logger.info(f"SUCCESS run control method {method}.")
            succ_result = ControlResponse(request_id, 200, "Success", result)
            self.send_response_server.send_response(request_id, [succ_result])
        except Exception as e:
            error_msg = f"Failed run control method {method}: {str(e)}"
            self.llm_logger.error(f"{error_msg}\n{traceback.format_exc()}")
            error_result = ControlResponse(request_id, 500, error_msg)
            self.send_response_server.send_response(request_id, [error_result])
    def _control_pause(self, control_request: ControlRequest):
        """Pauses the LLM engine and aborts all running/inflight requests.
        Args:
            control_request: The control request containing pause command
        Raises:
            Exception: If pause is not supported in current configuration
            Exception: If engine worker queue cleanup times out
        Returns:
            None
        """
        if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
            raise Exception("pause only supported in ENABLE_V1_KVCACHE_SCHEDULER")
        if self.cfg.scheduler_config.name != "local":
            raise Exception(f"pause only supported in local scheduler, current {self.cfg.scheduler_config.name}")
        with self._pause_cond:
            if self.is_paused:
                self.llm_logger.info("Pause Request Generation: already paused.")
            self.is_paused = True
        self.llm_logger.info("Start Abort Running Requests")
        self.resource_manager.log_status()
        # preempted all running reqs. preempted reqs will be append to ResourceManager.waiting queue
        timeout, count = 60, 0
        while self.engine_worker_queue.exist_tasks():
            time.sleep(0.001)
            count += 1
            if count >= timeout * 1000:
                break
        if count >= timeout * 1000:
            error_msg = f"wait engine_worker_queue tasks empty timeout after {timeout} seconds, worker may Hanged"
            self.llm_logger.error(error_msg)
            raise Exception(error_msg)
        running_reqs = self.resource_manager.preempted_all()
        if len(running_reqs) > 0:
            self.llm_logger.info(f"Total {len(running_reqs)} requests need to be aborted.")
            self.resource_manager.get_real_bsz()
            self.engine_worker_queue.put_tasks((running_reqs, self.resource_manager.real_bsz))
            self.resource_manager.wait_worker_inflight_requests_finish(timeout=60)
        # self.engine_worker_queue.clear_data()
        self.token_processor.clear_data()
        self.resource_manager.log_status()
        # abort inflight requests to user
        inflight_requests = self.scheduler.get_inflight_requests()
        self.llm_logger.info(f"Start Abort Inflight Requests, total {len(inflight_requests)} waiting requests")
        for req in inflight_requests:
            self._send_error_response(req.request_id, "Request is aborted since LLM Engine is paused.")
        self.scheduler.reset()
        self.resource_manager.cache_manager.reset()
        return None
    def _control_resume(self, control_request: ControlRequest) -> Optional[dict]:
        """Control function for resuming request generation.
        This method resumes the paused request generation process by setting the pause flag
        and notifying all waiting threads. It logs the start and end of the resume operation.
        Args:
            control_request: Control request object containing resume operation information
        """
        self.llm_logger.info("START Resume Request Generation")
        with self._pause_cond:
            if not self.is_paused:
                self.llm_logger.info("Resume Request Generation: not paused.")
                return None
            self.is_paused = False
            self._pause_cond.notify_all()
        self.llm_logger.info("END Resume Request Generation")
        return None
    def _control_is_paused(self, control_request: ControlRequest) -> bool:
        """
        Check if the LLM engine is in paused state.
        Args:
            control_request: Control request object.
        Returns:
            dict: Dictionary containing pause status information, {'is_paused': bool}
        """
        self.llm_logger.info(f"LLM Engine request generation is paused: {self.is_paused}")
        with self._pause_cond:
            return {"is_paused": self.is_paused}
    def _control_update_weights(self, control_request: ControlRequest) -> Optional[dict]:
        """Update model weights
        Args:
            control_request: Control request object containing parameters for weight updates
        Returns:
            Optional[dict]: Returns the result dictionary if update succeeds, None otherwise
        Raises:
            Exception: Raised when the engine is not in paused state
        """
        self.llm_logger.info("Update Model Weights")
        with self._pause_cond:
            if self.is_paused is False:
                error_msg = "Pause LLM Engine first before calling updating weights"
                self.llm_logger.error(error_msg)
                raise Exception(error_msg)
        return self._call_worker(control_request, 60)
    async def _wait_all_control_responses(self, request_id: str, timeout: int):
        """Wait for control responses from all workers with a global timeout.
        This method concurrently waits for responses from all control workers
        and enforces an overall timeout to avoid leaking pending tasks.
        """
        timeout_ms = timeout * 1000
        # Create one get() coroutine per worker output queue
        tasks = [output_queue.get(timeout=timeout_ms) for output_queue in self._ctrl_worker_output_queues]
        try:
            results = await asyncio.wait_for(
                asyncio.gather(*tasks, return_exceptions=True),
                timeout=timeout,
            )
        except asyncio.TimeoutError:
            # Keep the error message consistent with previous behavior
            raise Exception("Worker Update Weights Timeouted after 600s")
        responses = []
        for output_queue, msg in zip(self._ctrl_worker_output_queues, results):
            if isinstance(msg, Exception):
                self.llm_logger.error(f"Call Worker Failed: {output_queue.name} {repr(msg)}")
                raise Exception(f"Call Worker error: {repr(msg)}")
            if msg is None:
                # Preserve original semantics when no message is received
                raise Exception("Worker Update Weights Timeouted after 600s")
            response: ControlResponse = msg.payload
            if response.request_id != request_id:
                self.llm_logger.info(f"ignore old control response from worker:{output_queue.name} {response}")
                continue
            if response.error_code != 200:
                self.llm_logger.info(f"Call Worker Failed: {output_queue.name} {response.error_message}")
                raise Exception(f"Call Worker error: {response.error_message}")
            self.llm_logger.info(f"Call Worker Succeed: {output_queue.name} {response.result}")
            responses.append(response.result)
        return responses
    def _call_worker(self, control_request: ControlRequest, timeout: int):
        request_id = control_request.request_id
        self.engine_worker_queue.put_tasks(([control_request], 1))
        # Use a single asyncio.run() to concurrently wait for all worker responses.
        return asyncio.run(self._wait_all_control_responses(request_id, timeout))
    def _send_error_response(self, request_id, error_msg, error_code: int = 500):
        self.llm_logger.error(
            f"Send error response to client, request_id: {request_id}, error_msg: {error_msg}, error_code: {error_code}"
@@ -1708,6 +1945,7 @@ class EngineService:
            f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"
            f" --guided_decoding_backend {self.cfg.structured_outputs_config.guided_decoding_backend}"
            f" --load_strategy {self.cfg.load_config.load_strategy}"
            f" --rsync_config '{json.dumps(self.cfg.load_config.rsync_config)}'"
            f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
            f" --reasoning_parser {self.cfg.structured_outputs_config.reasoning_parser}"
            f" --load_choices {self.cfg.load_config.load_choices}"
@@ -556,6 +556,7 @@ class LLMEngine:
            f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"
            f" --guided_decoding_backend {self.cfg.structured_outputs_config.guided_decoding_backend}"
            f" --load_strategy {self.cfg.load_config.load_strategy}"
            f" --rsync_config '{json.dumps(self.cfg.load_config.rsync_config)}'"
            f" --early_stop_config '{self.cfg.early_stop_config.to_json_string()}'"
            f" --reasoning_parser {self.cfg.structured_outputs_config.reasoning_parser}"
            f" --load_choices {self.cfg.load_config.load_choices}"
@@ -26,6 +26,7 @@ from typing import TypeVar as TypingTypeVar
 from typing import Union
 import numpy as np
 from fastapi.responses import JSONResponse
 from pydantic import BaseModel
 from typing_extensions import TypeVar
@@ -99,7 +100,7 @@ class Request:
        guided_json_object: Optional[bool] = None,
        enable_thinking: Optional[bool] = None,
        reasoning_max_tokens: Optional[int] = None,
-        trace_carrier: dict = dict(),
+        trace_carrier: Optional[Dict[str, Any]] = None,
        dp_rank: Optional[int] = None,
        chat_template: Optional[str] = None,
        image_start: int = 0,
@@ -544,6 +545,157 @@ class Request:
        return hasattr(self, key)
 class ControlRequest:
    """A generic control request that supports method and args for control operations.
    This request type is used for system-level control operations rather than
    typical inference requests. It enables dynamic control of engine behavior,
    resource management, and system configuration via a flexible method-args interface.
    """
    def __init__(
        self,
        request_id: str,
        method: str,
        args: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Args:
            request_id: Unique identifier for the control request.
            method: The control method to execute (e.g., "reset_scheduler", "get_metrics").
            args: Optional arguments for the control method.
        """
        self.request_id = request_id
        self.method = method
        self.args = args or {}
    @classmethod
    def from_dict(cls, d: dict):
        """Create ControlRequest instance from dictionary."""
        return cls(request_id=d["request_id"], method=d["method"], args=d.get("args", {}))
    def to_dict(self) -> dict:
        """Convert ControlRequest into a serializable dict."""
        return {"request_id": self.request_id, "method": self.method, "args": self.args}
    def __repr__(self) -> str:
        """Provide a clean representation of the control request."""
        try:
            if not envs.FD_DEBUG:
                return f"ControlRequest(request_id={self.request_id}, method={self.method})"
            else:
                return (
                    f"ControlRequest("
                    f"request_id={self.request_id}, "
                    f"method={self.method}, "
                    f"args={self.args}"
                    f")"
                )
        except Exception as e:
            return f"<ControlRequest repr failed: {e}>"
    def get_method(self) -> str:
        """Get the control method name."""
        return self.method
    def get_args(self) -> Dict[str, Any]:
        """Get the control method arguments."""
        return self.args.copy()
    @staticmethod
    def is_control_request(d: dict) -> bool:
        """
        Check if a dictionary represents a valid ControlRequest.
        Args:
            d: Dictionary to check
        Returns:
            bool: True if the dictionary contains the required fields for a ControlRequest
        """
        # Check if all required fields are present and have correct types
        if not isinstance(d, dict):
            return False
        # Check field types
        if "request_id" not in d or not isinstance(d.get("request_id"), str):
            return False
        if "method" not in d or not isinstance(d.get("method"), str):
            return False
        # Args is optional, but if present should be a dict
        if "args" in d and not isinstance(d["args"], dict):
            return False
        return True
 class ControlResponse:
    """
    Response for control operations
    """
    def __init__(
        self,
        request_id: str,
        error_code: int = 200,
        error_message: Optional[str] = None,
        result: Optional[dict] = None,
        finished: bool = True,
    ) -> None:
        self.request_id = request_id
        self.finished = finished
        self.error_message = error_message
        self.result = result
        self.error_code = error_code
    def to_dict(self) -> dict:
        """Convert ControlResponse into a serializable dict."""
        return {
            "request_id": self.request_id,
            "finished": self.finished,
            "error_code": self.error_code,
            "error_message": self.error_message,
            "result": self.result,
        }
    @classmethod
    def from_dict(cls, d: dict):
        """Create ControlResponse instance from dictionary."""
        return cls(
            request_id=d["request_id"],
            finished=d.get("finished", True),
            error_code=d.get("error_code", 200),
            error_message=d.get("error_message"),
            result=d.get("result"),
        )
    def to_api_json_response(self) -> JSONResponse:
        """Convert ControlResponse into a JSONResponse."""
        status = "success" if self.error_code == 200 else "error"
        content = {
            "request_id": self.request_id,
            "status": status,
            "error_message": self.error_message,
            "result": self.result,
        }
        return JSONResponse(status_code=self.error_code, content=content)
    def __repr__(self) -> str:
        """Provide a clean representation of the control response."""
        return (
            f"ControlResponse("
            f"request_id={self.request_id}, "
            f"finished={self.finished}, "
            f"error_code={self.error_code}, "
            f"error_message={self.error_message}, "
            f"result={self.result}"
            f")"
        )
@dataclass(slots=True)
 class CompletionOutput:
    """The output data of one completion output of a request.
@@ -16,6 +16,7 @@
 import copy
 import threading
 import time
 import traceback
 from collections import deque
 from collections.abc import Iterable
@@ -240,6 +241,7 @@ class ResourceManagerV1(ResourceManager):
    def reschedule_preempt_task(self, request_id, process_func=None):
        with self.lock:
            llm_logger.debug(f"reschedule {request_id} into waiting queue")
            if request_id in self.to_be_rescheduled_request_id_set and request_id in self.requests:
                request = self.requests[request_id]
                if process_func is not None:
@@ -266,6 +268,39 @@ class ResourceManagerV1(ResourceManager):
                return True
        return False
    def preempted_all(self):
        with self.lock:
            preempted_reqs = []
            for i in range(len(self.running)):
                req = self.running.pop()
                # txt2image: req.use_extend_tables is True, req can not be preempted. txt2image is not used in RL.
                if req.use_extend_tables:
                    self.running.insert(0, req)
                    continue
                req.status = RequestStatus.PREEMPTED
                req.num_computed_tokens = 0
                self._free_blocks(req)
                req.cached_block_num = 0
                self.to_be_rescheduled_request_id_set.add(req.request_id)
                preempted_reqs.append(self._prepare_preempt_task(req))
            return preempted_reqs
    def wait_worker_inflight_requests_finish(self, timeout=60):
        count = 0
        while count < timeout * 1000:
            # wait ongoing running and rescheduled requests finished in worker
            running_reqs_count = len(self.to_be_rescheduled_request_id_set) + len(self.running)
            if running_reqs_count == 0:
                break
            count += 1
            time.sleep(0.001)
        if count >= timeout * 1000:
            llm_logger.info(
                f"wait_inflight_requests_finish timeout after {timeout} seconds, "
                f"still {len(self.to_be_rescheduled_request_id_set)} requests running"
            )
    def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_reqs):
        """
        If the request cannot be scheduled, preempt the running request one by one until it can be scheduled. Last in, first out.
@@ -1347,3 +1382,16 @@ class ResourceManagerV1(ResourceManager):
        main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
        main_process_metrics.num_requests_running.set(len(self.running))
        main_process_metrics.num_requests_waiting.set(num_tasks - len(self.running))
    def log_status(self):
        llm_logger.info(
            f"ResourceManagerV1( "
            f"waiting={len(self.waiting)}, "
            f"running={len(self.running)}, "
            f"preempted={len(self.to_be_rescheduled_request_id_set)}, "
            f"tasks_list={self.tasks_list}, "
            f"stop_flags={self.stop_flags}, "
            f"req_dict={self.req_dict}, "
            f"requests={self.requests}, "
            f")"
        )
@@ -14,6 +14,7 @@
 # limitations under the License.
 """
 import asyncio
 import inspect
 import os
 import re
@@ -29,7 +30,12 @@ from filelock import FileLock
 import fastdeploy.metrics.trace as tracing
 from fastdeploy import envs
 from fastdeploy.config import FDConfig
-from fastdeploy.engine.request import Request, RequestStatus
+from fastdeploy.engine.request import (
    ControlRequest,
    ControlResponse,
    Request,
    RequestStatus,
 )
 from fastdeploy.entrypoints.openai.utils import DealerConnectionManager
 from fastdeploy.envs import FD_SUPPORT_MAX_CONNECTIONS
 from fastdeploy.eplb.utils import RedundantExpertWorkload
@@ -526,6 +532,23 @@ class EngineClient:
        return True, ""
    async def run_control_method(self, request: ControlRequest):
        api_server_logger.info(f"Start Run Control Method: {request}")
        self.zmq_client.send_json(request.to_dict())
        request_id = request.request_id
        dealer, response_queue = await self.connection_manager.get_connection(request_id)
        dealer.write([b"", request_id.encode("utf-8")])
        try:
            # todo: support user specified timeout. default 600s is enough for most control cases
            response = await asyncio.wait_for(response_queue.get(), timeout=600)
            response = ControlResponse.from_dict(response[0])
            api_server_logger.info(f"End Run Control Method: {response}")
            return response
        except asyncio.TimeoutError:
            error_response = ControlResponse(request_id, 500, "Timeout waiting for control method response")
            api_server_logger.error(f"Error Run Control Method: {error_response}")
            return error_response
    def is_workers_alive(self):
        """
        Check the health of the model server by checking whether all workers are alive.
@@ -20,6 +20,7 @@ import signal
 import threading
 import time
 import traceback
 import uuid
 from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
@@ -38,6 +39,7 @@ from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.async_llm import AsyncLLM
 from fastdeploy.engine.engine import LLMEngine
 from fastdeploy.engine.expert_service import ExpertService
 from fastdeploy.engine.request import ControlRequest
 from fastdeploy.entrypoints.chat_utils import load_chat_template
 from fastdeploy.entrypoints.engine_client import EngineClient
 from fastdeploy.entrypoints.openai.middleware import AuthenticationMiddleware
@@ -370,6 +372,66 @@ def ping(raw_request: Request) -> Response:
    return health(raw_request)
@app.post("/v1/pause")
 async def pause(request: Request) -> Response:
    # todo: support wait_for_inflight_requests(default False), clear_cache(default True) arguments
    request_id = f"control-{uuid.uuid4()}"
    control_request = ControlRequest(request_id, "pause")
    control_response = await app.state.engine_client.run_control_method(control_request)
    return control_response.to_api_json_response()
@app.post("/v1/resume")
 async def resume(request: Request) -> Response:
    request_id = f"control-{uuid.uuid4()}"
    control_request = ControlRequest(request_id, "resume")
    control_response = await app.state.engine_client.run_control_method(control_request)
    return control_response.to_api_json_response()
@app.get("/v1/is_paused")
 async def is_paused(request: Request) -> Response:
    request_id = f"control-{uuid.uuid4()}"
    control_request = ControlRequest(request_id, "is_paused")
    control_response = await app.state.engine_client.run_control_method(control_request)
    return control_response.to_api_json_response()
@app.post("/v1/update_weights")
 async def update_weights(request: Request) -> Response:
    request_id = f"control-{uuid.uuid4()}"
    request_data = await request.json() if await request.body() else {}
    args = {}
    # Validate and extract version parameter
    if "version" in request_data and request_data["version"] is not None:
        if not isinstance(request_data["version"], str):
            return JSONResponse(
                status_code=400, content={"error": "Invalid parameter type", "message": "version must be a string"}
            )
        args["version"] = request_data["version"]
    # Validate and extract rsync_config parameter
    if "rsync_config" in request_data and request_data["rsync_config"] is not None:
        if not isinstance(request_data["rsync_config"], dict):
            return JSONResponse(
                status_code=400,
                content={"error": "Invalid parameter type", "message": "rsync_config must be a dictionary"},
            )
        if "etcd_server" not in request_data["rsync_config"]:
            return JSONResponse(
                status_code=400,
                content={"error": "Invalid parameter type", "message": "rsync_config must contain etcd_server"},
            )
        args["rsync_config"] = request_data["rsync_config"]
    control_request = ControlRequest(request_id, "update_weights", args)
    control_response = await app.state.engine_client.run_control_method(control_request)
    return control_response.to_api_json_response()
 def wrap_streaming_generator(original_generator: AsyncGenerator):
    """
    Wrap an async generator to release the connection semaphore when the generator is finished.
@@ -214,7 +214,7 @@ class Queue(BaseComponent):
        else:
            self.socket.bind(full_ep)
-        fmq_logger.info(f"Queue {name} initialized on {full_ep}")
+        fmq_logger.info(f"Queue {name}({role}) initialized on {full_ep}")
    async def put(self, data: Any, shm_threshold: int = 1024 * 1024):
        """
@@ -1034,7 +1034,7 @@ class TokenProcessor:
                finished=True,
                metrics=RequestMetrics(
                    arrival_time=time.time(),
-                    request_start_time=task.arrival_time,
+                    request_start_time=task.metrics.arrival_time,
                ),
            )
            is_prefill = task.disaggregate_info is not None and task.disaggregate_info["role"] == "prefill"
@@ -14,6 +14,7 @@
 # limitations under the License.
 """
 import io
 import os
 import time
 from multiprocessing.shared_memory import SharedMemory
@@ -27,13 +28,38 @@ from fastdeploy.config import FDConfig
 from fastdeploy.inter_communicator import KVCacheStatus, ModelWeightsStatus
 def sync_weights_by_rdma(config, step, rank):
    from checkpoint_transfer.core import RDMAWeightsDownloader
    downloader = RDMAWeightsDownloader(config)
    downloader.initialize()
    logger.info(f"Fetching weights for step:{step}, rank:{rank}...")
    data = downloader.get_weights(step, rank)
    if data is None:
        logger.error("Failed to get weights!")
        raise Exception("Failed to rsync weights through checkpoint_transfer")
    logger.info(f"Successfully retrieved data. Type: {type(data)}")
    if isinstance(data, np.ndarray):
        data_bytes = data.tobytes()
    elif isinstance(data, (bytes, bytearray)):
        data_bytes = data
    else:
        data_bytes = bytes(data)
    logger.info(f"Data size: {len(data_bytes)} bytes")
    buffer = io.BytesIO(data_bytes)
    new_state_dict = paddle.load(buffer)
    return new_state_dict
 class DynamicWeightManager:
    """Manages model weights loading, updating and shared state across processes."""
-    def __init__(self, fd_config: FDConfig, models):
+    def __init__(self, fd_config: FDConfig, models, local_rank: int):
        """Initialize with config and model instances."""
        self.fd_config = fd_config
        self.load_config = fd_config.load_config
        self.local_rank = local_rank
        self.parallel_config = fd_config.parallel_config
        self.state_dict: Dict[str, paddle.Tensor] = {}
        self.rank = fd_config.parallel_config.tensor_parallel_rank
@@ -46,7 +72,10 @@ class DynamicWeightManager:
        else:
            self.model_list = models
        self._capture_model_state()
-        self.update_parameters()
+        if self.load_config.load_strategy == "rsync":
            self.update_weights_by_rdma()
        else:
            self.update_parameters()
        self.finalize_update()
        logger.info(
@@ -62,6 +91,74 @@ class DynamicWeightManager:
                logger.info(f"Model param: {name}, shape={param.shape}, dtype={param.dtype}")
                self.state_dict[name] = param
    def update_weights_by_rdma(self, version: str = None, rsync_config: Dict[str, Any] = None):
        def valid_parameters(old_state_dict, new_state_dict):
            is_valid = True
            for key in old_state_dict:
                if key not in new_state_dict:
                    is_valid = False
                    logger.error(f"Invalid parameter: {key} not in new_state_dict")
                elif old_state_dict[key].shape != new_state_dict[key].shape:
                    is_valid = False
                    logger.error(
                        f"Invalid parameter: {key} shape mismatch, "
                        f"new shape:{new_state_dict[key].shape}, "
                        f"old shape:{old_state_dict[key].shape}"
                    )
                elif old_state_dict[key].dtype != new_state_dict[key].dtype:
                    is_valid = False
                    logger.error(f"Invalid parameter: {key} dtype mismatch")
            return is_valid
        if rsync_config is None:
            rsync_config = self.fd_config.load_config.rsync_config
        if rsync_config is None or len(rsync_config) == 0:
            raise Exception(
                "rsync config not set, please set it in 1) launch arguments '--rsync-config' "
                "or 2) interface arguments 'rsync_config'"
            )
        if version is None or version == "":
            version = self.read_model_version_from_file()
        if version is None or version == "":
            raise Exception(
                "rsync model version not set, please set it in 1) {model_version}/version.txt "
                "or 2) interface arguments 'version'"
            )
        logger.info(f"START update_weights_by_rdma, version:{version}, rsync_config:{rsync_config}")
        rank = self.local_rank
        sync_start = time.perf_counter()
        new_state_dict = sync_weights_by_rdma(rsync_config, version, rank)
        sync_cost = time.perf_counter() - sync_start
        logger.info(f"weights sync cost {sync_cost:.2f} seconds")
        old_state_dict = self.state_dict
        if not valid_parameters(old_state_dict, new_state_dict):
            error_msg = "Invalid new_state_dict, update parameters failed"
            logger.error(error_msg)
            raise ValueError(error_msg)
        update_start = time.perf_counter()
        for name, param in old_state_dict.items():
            param.set_value(new_state_dict[name])
        update_cost = time.perf_counter() - update_start
        logger.info(f"params set value cost {update_cost:.2f} seconds")
        total_cost = time.perf_counter() - sync_start
        logger.info(
            f"END update_weights_by_rdma, cost {total_cost:.2f} seconds"
            f" version:{version}, rsync_config: {rsync_config}",
        )
        return {
            "sync_cost": sync_cost,
            "update_cost": update_cost,
            "total_cost": total_cost,
            "version": version,
            "rank": rank,
        }
    def update_parameters(self, pid: int = 0, restart_process_group=False) -> None:
        """Core method to update model parameters based on strategy."""
        start_time = time.perf_counter()
@@ -257,6 +354,17 @@ class DynamicWeightManager:
        if self.rank == 0:
            value[self.rank] = status
    def read_model_version_from_file(self):
        model_dir = self.fd_config.model_config.model
        version_file = os.path.join(model_dir, "version.txt")
        try:
            with open(version_file, "r", encoding="utf-8") as f:
                version = f.read().strip()
            return version
        except (FileNotFoundError, OSError, IOError) as e:
            logger.error(f"Failed to read model version file '{version_file}': {e}")
            return None
    @staticmethod
    def check_model_weights_status(model_weights_status, kv_cache_status, model_runner, pid, block):
        """
@@ -158,6 +158,10 @@ class LocalScheduler:
            else:
                self.ids_read_cursor -= len(expired_ids)
    def get_inflight_requests(self) -> List[Request]:
        with self.mutex:
            return [request.raw for request in self.requests.values()]
    def put_requests(self, requests: List[Request]) -> List[Tuple[str, Optional[str]]]:
        """
        Add new requests to the scheduler queue.
@@ -20,7 +20,7 @@ import queue
 import time
 from concurrent.futures import Future
 from threading import Thread
-from typing import List, Optional, cast
+from typing import Any, Dict, List, Optional, cast
 import numpy as np
 import paddle
@@ -1518,7 +1518,7 @@ class GPUModelRunner(ModelRunnerBase):
        if self.fd_config.load_config.dynamic_load_weight:
            from fastdeploy.rl.dynamic_weight_manager import DynamicWeightManager
-            self.dynamic_weight_manager = DynamicWeightManager(self.fd_config, self.model)
+            self.dynamic_weight_manager = DynamicWeightManager(self.fd_config, self.model, self.local_rank)
        # 2. Load lora model
@@ -2798,6 +2798,9 @@ class GPUModelRunner(ModelRunnerBase):
        self.dynamic_weight_manager._log_memory("dynamic weight manager update all memory")
    def update_weights(self, version: str = None, rsync_config: Dict[str, Any] = None):
        return self.dynamic_weight_manager.update_weights_by_rdma(version, rsync_config)
    def padding_cudagraph_inputs(self) -> None:
        """
        Clean buffers used for the CUDA graph when replaying the CUDA graph with the padded batch.
@@ -16,7 +16,7 @@
 import gc
 import time
-from typing import List, Optional
+from typing import Any, Dict, List, Optional
 import paddle
 import pynvml
@@ -188,6 +188,10 @@ class GpuWorker(WorkerBase):
        # accurate cache size
        self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks)
    def update_weights(self, version: str = None, rsync_config: Dict[str, Any] = None):
        """update weights in place"""
        return self.model_runner.update_weights(version, rsync_config)
    def execute_model(
        self,
        model_forward_batch: Optional[List[Request]] = None,
@@ -15,9 +15,11 @@
 """
 import argparse
 import asyncio
 import json
 import os
 import time
 import traceback
 from typing import Tuple
 import numpy as np
@@ -42,7 +44,7 @@ from fastdeploy.config import (
    SpeculativeConfig,
    StructuredOutputsConfig,
 )
-from fastdeploy.engine.request import RequestType
+from fastdeploy.engine.request import ControlRequest, ControlResponse, RequestType
 from fastdeploy.eplb.async_expert_loader import (
    MODEL_MAIN_NAME,
    REARRANGE_EXPERT_MAGIC_NUM,
@@ -57,6 +59,7 @@ from fastdeploy.inter_communicator import (
    ModelWeightsStatus,
    RearrangeExpertStatus,
 )
 from fastdeploy.inter_communicator.fmq import FMQ
 from fastdeploy.model_executor.layers.quantization import parse_quant_config
 from fastdeploy.model_executor.utils import v1_loader_support
 from fastdeploy.platforms import current_platform
@@ -164,6 +167,12 @@ class PaddleDisWorkerProc:
        self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
    def init_control(self):
        engine_worker_queue_port = self.parallel_config.local_engine_worker_queue_port
        queue_name = f"ctrl_w2e_rank{self.local_rank}_{engine_worker_queue_port}"
        logger.info(f"Init Control Output Queue: {queue_name}(producer)")
        self._ctrl_output = FMQ().queue(queue_name, "producer")
    def init_health_status(self) -> None:
        """
        Initialize the health status of the worker.
@@ -513,10 +522,20 @@ class PaddleDisWorkerProc:
                    else:
                        self.exist_task_signal.value[0] = ExistTaskStatus.EMPTY
-                req_dicts = []
+                req_dicts, control_reqs = [], []
                for req_dict, bsz in tasks:
-                    max_occupied_batch_index = int(bsz)
+                    if len(req_dict) > 0 and isinstance(req_dict[0], ControlRequest):
-                    req_dicts.extend(req_dict)
+                        control_reqs.append(req_dict[0])
                    else:
                        max_occupied_batch_index = int(bsz)
                        req_dicts.extend(req_dict)
                # todo: run control request async
                if len(control_reqs) > 0:
                    logger.info(f"Rank: {self.local_rank} received {len(control_reqs)} control request.")
                    for control_req in control_reqs:
                        self.run_control_method(control_req)
                        self._tp_barrier_wait() if tp_size > 1 else None
                # Count prefill requests in current batch
                num_prefill_requests = sum(1 for req in req_dicts if req.task_type == RequestType.PREFILL)
@@ -655,6 +674,32 @@ class PaddleDisWorkerProc:
            paddle.distributed.barrier()
        self.loaded_model_signal.value[0] = 1
    def run_control_method(self, control_request: ControlRequest) -> None:
        logger.info(f"Start run control request: {control_request}")
        request_id = control_request.request_id
        method = control_request.method
        kwargs = control_request.args
        handler = getattr(self.worker, method, None)
        if handler is None or not callable(handler):
            error_msg = f"Rank-{self.local_rank}: Unknown control method {method}"
            error_result = ControlResponse(request_id, 400, error_msg)
            asyncio.run(self._ctrl_output.put(error_result))
            return
        try:
            result = handler(**kwargs)
            succ_result = ControlResponse(request_id, 200, "Success", result)
            logger.info(
                f"Rank-{self.local_rank} Success run control request: {control_request}, response: {succ_result}"
            )
            asyncio.run(self._ctrl_output.put(succ_result, shm_threshold=100 * 1024 * 1024))
        except Exception as e:
            error_msg = f"Rank-{self.local_rank} Failed run control method {method}: {str(e)}"
            logger.error(f"{error_msg}\n{traceback.format_exc()}")
            error_result = ControlResponse(request_id, 500, error_msg)
            asyncio.run(self._ctrl_output.put(error_result))
 def parse_args():
    """
@@ -813,12 +858,18 @@ def parse_args():
    parser.add_argument(
        "--load_strategy",
        type=str,
-        choices=["ipc", "ipc_snapshot", "meta", "normal"],
+        choices=["ipc", "ipc_snapshot", "meta", "normal", "rsync"],
        default="ipc_snapshot",
        help="Weight loading method when dynamic loading is enabled: "
        "'ipc': real-time IPC streaming with automatic resharding, "
        "'ipc_snapshot': load from disk snapshot of IPC weights.",
    )
    parser.add_argument(
        "--rsync_config",
        type=json.loads,
        default=None,
        help="Rsync weights config",
    )
    parser.add_argument(
        "--enable_logprob",
        action="store_true",
@@ -1045,6 +1096,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
    logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}")
    logger.info(f"- Load strategy: {load_config.load_strategy}")
    logger.info(f"- Rsync config: {load_config.rsync_config}, {type(load_config.rsync_config)}")
    if not (
        current_platform.is_cuda()
@@ -1112,6 +1164,7 @@ def run_worker_proc() -> None:
        worker_proc = IluvatarPaddleDisWorkerProc(fd_config, ranks, local_rank)
    else:
        worker_proc = PaddleDisWorkerProc(fd_config, ranks, local_rank)
        worker_proc.init_control()
    # Initialize device and create model runner
    worker_proc.init_device()
@@ -0,0 +1,329 @@
 # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import unittest
 from unittest.mock import patch
 from fastapi.responses import JSONResponse
 from fastdeploy.engine.request import ControlRequest, ControlResponse
 class TestControlRequest(unittest.TestCase):
    """Test cases for ControlRequest class."""
    def test_initialization_basic(self):
        """Test basic initialization of ControlRequest."""
        request_id = "test_request_123"
        method = "get_metrics"
        request = ControlRequest(request_id=request_id, method=method)
        self.assertEqual(request.request_id, request_id)
        self.assertEqual(request.method, method)
        self.assertEqual(request.args, {})
    def test_initialization_with_args(self):
        """Test initialization with arguments."""
        request_id = "test_request_456"
        method = "reset_scheduler"
        args = {"force": True, "timeout": 30}
        request = ControlRequest(request_id=request_id, method=method, args=args)
        self.assertEqual(request.request_id, request_id)
        self.assertEqual(request.method, method)
        self.assertEqual(request.args, args)
    def test_from_dict_basic(self):
        """Test creating ControlRequest from dictionary (basic case)."""
        data = {"request_id": "test_from_dict", "method": "status_check"}
        request = ControlRequest.from_dict(data)
        self.assertEqual(request.request_id, data["request_id"])
        self.assertEqual(request.method, data["method"])
        self.assertEqual(request.args, {})
    def test_from_dict_with_args(self):
        """Test creating ControlRequest from dictionary with arguments."""
        data = {
            "request_id": "test_from_dict_args",
            "method": "configure",
            "args": {"max_requests": 100, "queue_timeout": 60},
        }
        request = ControlRequest.from_dict(data)
        self.assertEqual(request.request_id, data["request_id"])
        self.assertEqual(request.method, data["method"])
        self.assertEqual(request.args, data["args"])
    def test_to_dict_basic(self):
        """Test converting ControlRequest to dictionary (basic case)."""
        request = ControlRequest(request_id="test_to_dict", method="health_check")
        result = request.to_dict()
        expected = {"request_id": "test_to_dict", "method": "health_check", "args": {}}
        self.assertEqual(result, expected)
    def test_to_dict_with_args(self):
        """Test converting ControlRequest to dictionary with arguments."""
        args = {"setting1": "value1", "setting2": 42}
        request = ControlRequest(request_id="test_to_dict_args", method="update_settings", args=args)
        result = request.to_dict()
        expected = {"request_id": "test_to_dict_args", "method": "update_settings", "args": args}
        self.assertEqual(result, expected)
    def test_get_method(self):
        """Test get_method method."""
        method = "custom_operation"
        request = ControlRequest(request_id="test", method=method)
        self.assertEqual(request.get_method(), method)
    def test_get_args(self):
        """Test get_args method."""
        args = {"param1": "value1", "param2": 123}
        request = ControlRequest(request_id="test", method="test", args=args)
        result_args = request.get_args()
        self.assertEqual(result_args, args)
        # Ensure it returns a copy, not the original dict
        self.assertIsNot(result_args, args)
    def test_is_control_request_valid(self):
        """Test is_control_request method with valid data."""
        valid_data = [
            {"request_id": "test1", "method": "method1"},
            {"request_id": "test2", "method": "method2", "args": {}},
            {"request_id": "test3", "method": "method3", "args": {"key": "value"}},
        ]
        for data in valid_data:
            with self.subTest(data=data):
                self.assertTrue(ControlRequest.is_control_request(data))
    def test_is_control_request_invalid(self):
        """Test is_control_request method with invalid data."""
        invalid_data = [
            # Missing required fields
            {"method": "test"},  # missing request_id
            {"request_id": "test"},  # missing method
            # Wrong field types
            {"request_id": 123, "method": "test"},  # request_id not string
            {"request_id": "test", "method": 456},  # method not string
            {"request_id": "test", "method": "test", "args": "not_a_dict"},  # args not dict
            # Not a dict
            "not_a_dict",
            123,
            None,
        ]
        for data in invalid_data:
            with self.subTest(data=data):
                self.assertFalse(ControlRequest.is_control_request(data))
    def test_repr_simple(self):
        """Test __repr__ method in simple mode."""
        with patch("fastdeploy.envs.FD_DEBUG", False):
            request = ControlRequest(request_id="test_repr", method="test_method")
            repr_str = repr(request)
            self.assertIn("ControlRequest", repr_str)
            self.assertIn("test_repr", repr_str)
            self.assertIn("test_method", repr_str)
            self.assertNotIn("args", repr_str)  # Args not shown in simple mode
    def test_repr_debug_mode(self):
        """Test __repr__ method in debug mode."""
        with patch("fastdeploy.envs.FD_DEBUG", True):
            args = {"debug_param": "debug_value"}
            request = ControlRequest(request_id="test_repr", method="test_method", args=args)
            repr_str = repr(request)
            self.assertIn("ControlRequest", repr_str)
            self.assertIn("test_repr", repr_str)
            self.assertIn("test_method", repr_str)
            self.assertIn("debug_param", repr_str)  # Args shown in debug mode
 class TestControlResponse(unittest.TestCase):
    """Test cases for ControlResponse class."""
    def test_initialization_basic(self):
        """Test basic initialization of ControlResponse."""
        request_id = "test_response_123"
        response = ControlResponse(request_id=request_id)
        self.assertEqual(response.request_id, request_id)
        self.assertEqual(response.error_code, 200)
        self.assertIsNone(response.error_message)
        self.assertIsNone(response.result)
        self.assertTrue(response.finished)
    def test_initialization_with_all_params(self):
        """Test initialization with all parameters."""
        request_id = "test_response_456"
        error_code = 404
        error_message = "Not found"
        result = {"data": "some_result"}
        finished = False
        response = ControlResponse(
            request_id=request_id, error_code=error_code, error_message=error_message, result=result, finished=finished
        )
        self.assertEqual(response.request_id, request_id)
        self.assertEqual(response.error_code, error_code)
        self.assertEqual(response.error_message, error_message)
        self.assertEqual(response.result, result)
        self.assertEqual(response.finished, finished)
    def test_initialization_error_cases(self):
        """Test initialization with various error codes."""
        test_cases = [
            (200, None, True),  # Success case
            (400, "Bad Request", False),  # Client error
            (500, "Internal Error", True),  # Server error
        ]
        for error_code, error_message, finished in test_cases:
            with self.subTest(error_code=error_code):
                response = ControlResponse(
                    request_id="test", error_code=error_code, error_message=error_message, finished=finished
                )
                self.assertEqual(response.error_code, error_code)
                self.assertEqual(response.error_message, error_message)
                self.assertEqual(response.finished, finished)
    def test_from_dict_basic(self):
        """Test creating ControlResponse from dictionary (basic case)."""
        data = {"request_id": "test_from_dict"}
        response = ControlResponse.from_dict(data)
        self.assertEqual(response.request_id, data["request_id"])
        self.assertEqual(response.error_code, 200)
        self.assertIsNone(response.error_message)
        self.assertIsNone(response.result)
        self.assertTrue(response.finished)
    def test_from_dict_with_all_fields(self):
        """Test creating ControlResponse from dictionary with all fields."""
        data = {
            "request_id": "test_from_dict_full",
            "error_code": 500,
            "error_message": "Test error",
            "result": {"key": "value"},
            "finished": False,
        }
        response = ControlResponse.from_dict(data)
        self.assertEqual(response.request_id, data["request_id"])
        self.assertEqual(response.error_code, data["error_code"])
        self.assertEqual(response.error_message, data["error_message"])
        self.assertEqual(response.result, data["result"])
        self.assertEqual(response.finished, data["finished"])
    def test_to_dict_basic(self):
        """Test converting ControlResponse to dictionary (basic case)."""
        response = ControlResponse(request_id="test_to_dict")
        result = response.to_dict()
        expected = {
            "request_id": "test_to_dict",
            "finished": True,
            "error_code": 200,
            "error_message": None,
            "result": None,
        }
        self.assertEqual(result, expected)
    def test_to_dict_with_all_fields(self):
        """Test converting ControlResponse to dictionary with all fields."""
        response = ControlResponse(
            request_id="test_to_dict_full",
            error_code=400,
            error_message="Validation failed",
            result={"valid": False, "reason": "missing_field"},
            finished=False,
        )
        result = response.to_dict()
        expected = {
            "request_id": "test_to_dict_full",
            "finished": False,
            "error_code": 400,
            "error_message": "Validation failed",
            "result": {"valid": False, "reason": "missing_field"},
        }
        self.assertEqual(result, expected)
    def test_to_api_json_response_success(self):
        """Test converting to JSONResponse for successful response."""
        result_data = {"metrics": {"cpu_usage": 0.5, "memory_used": 1024}}
        response = ControlResponse(request_id="test_json_success", result=result_data)
        json_response = response.to_api_json_response()
        self.assertIsInstance(json_response, JSONResponse)
        self.assertEqual(json_response.status_code, 200)
        content = json_response.body.decode("utf-8")
        self.assertIn("success", content)
        self.assertIn("test_json_success", content)
        self.assertIn("cpu_usage", content)
    def test_to_api_json_response_error(self):
        """Test converting to JSONResponse for error response."""
        response = ControlResponse(request_id="test_json_error", error_code=503, error_message="Service unavailable")
        json_response = response.to_api_json_response()
        self.assertIsInstance(json_response, JSONResponse)
        self.assertEqual(json_response.status_code, 503)
        content = json_response.body.decode("utf-8")
        self.assertIn("error", content)
        self.assertIn("test_json_error", content)
        self.assertIn("Service unavailable", content)
    def test_repr_method(self):
        """Test __repr__ method."""
        response = ControlResponse(
            request_id="test_repr", error_code=200, error_message=None, result={"data": "test"}, finished=True
        )
        repr_str = repr(response)
        # Check that all important fields are represented
        self.assertIn("ControlResponse", repr_str)
        self.assertIn("test_repr", repr_str)
        self.assertIn("200", repr_str)
        self.assertIn("test", repr_str)  # from result
        self.assertIn("True", repr_str)  # finished flag
 if __name__ == "__main__":
    unittest.main()
@@ -0,0 +1,104 @@
 # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import unittest
 from unittest.mock import Mock
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.request import Request, RequestStatus
 from fastdeploy.engine.sched.resource_manager_v1 import ResourceManagerV1
 MODEL_NAME = os.getenv("MODEL_PATH", "/path/to/models") + "/ERNIE-4.5-0.3B-Paddle"
 class TestResourceManagerV1(unittest.TestCase):
    """Test cases for ResourceManagerV1."""
    def setUp(self):
        """Set up test fixtures."""
        engine_args = EngineArgs(
            model=MODEL_NAME,
            max_model_len=8192,
            tensor_parallel_size=1,
            engine_worker_queue_port=int(os.getenv("FD_ENGINE_QUEUE_PORT", "6778")),
            cache_queue_port=int(os.getenv("FD_CACHE_QUEUE_PORT", "6779")),
        )
        # Create and start the engine service
        mock_config = engine_args.create_engine_config()
        self.manager = ResourceManagerV1(
            max_num_seqs=4,
            config=mock_config,
            tensor_parallel_size=1,
            splitwise_role="mixed",
            local_data_parallel_id=0,
        )
        # Mock cache manager
        self.manager.cache_manager = Mock()
        self.manager.cache_manager.free_blocks = Mock()
    def tearDown(self) -> None:
        self.manager.need_block_num_signal.clear()
    def test_preempted_all_with_no_running_requests(self):
        """Test preempted_all with no running requests."""
        self.assertEqual(len(self.manager.running), 0)
        preempted_reqs = self.manager.preempted_all()
        self.assertEqual(len(preempted_reqs), 0)
    def test_preempted_all_with_normal_requests(self):
        """Test preempted_all with normal running requests."""
        # Add mock running requests
        req1 = Mock(spec=Request)
        req1.request_id = "req1"
        req1.use_extend_tables = False
        req1.status = RequestStatus.RUNNING
        req1.block_tables = [1, 2, 3]
        req1.num_cached_blocks = 0
        req1.idx = 0
        req2 = Mock(spec=Request)
        req2.request_id = "req2"
        req2.use_extend_tables = False
        req2.status = RequestStatus.RUNNING
        req2.block_tables = [4, 5]
        req2.num_cached_blocks = 0
        req2.idx = 1
        self.manager.running = [req1, req2]
        preempted_reqs = self.manager.preempted_all()
        # Verify
        self.assertEqual(len(preempted_reqs), 2)
        self.assertEqual(preempted_reqs[0].request_id, "req2")
        self.assertEqual(preempted_reqs[1].request_id, "req1")
        # Verify request status changed
        self.assertEqual(req1.status, RequestStatus.PREEMPTED)
        self.assertEqual(req2.status, RequestStatus.PREEMPTED)
        # Verify added to to_be_rescheduled_request_id_set
        self.assertIn("req1", self.manager.to_be_rescheduled_request_id_set)
        self.assertIn("req2", self.manager.to_be_rescheduled_request_id_set)
        self.assertEqual(len(self.manager.running), 0)
        self.assertEqual(len(self.manager.waiting), 0)
        self.assertEqual(len(self.manager.to_be_rescheduled_request_id_set), 2)
 if __name__ == "__main__":
    unittest.main()
@@ -246,6 +246,20 @@ class TestLocalScheduler(unittest.TestCase):
        # Verify only one request exists in scheduler
        self.assertEqual(len(self.scheduler.requests), 1)
    def test_get_inflight_requests(self):
        """Test getting inflight requests."""
        # Add some requests
        requests = [self.mock_request_1, self.mock_request_2]
        self.scheduler.put_requests(requests)
        # Get inflight requests
        inflight_requests = self.scheduler.get_inflight_requests()
        # Verify correct requests are returned
        self.assertEqual(len(inflight_requests), len(requests))
        for req in inflight_requests:
            self.assertIn(req, requests)
    def test_put_requests_max_size_limit(self):
        """Test that max size limit is enforced."""
        # Create scheduler with small max size