diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index a67ac75e4a..139a809c64 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -105,9 +105,9 @@ class GCUModelRunner(ModelRunnerBase): self.local_rank + int(self.parallel_config.engine_worker_queue_port) ) - def prefill_finished(self): + def exist_prefill(self): """ - check whether prefill stage finished + check whether prefill stage exists """ if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: return 1 diff --git a/fastdeploy/worker/gcu_worker.py b/fastdeploy/worker/gcu_worker.py index 1b98e3b0c8..004e0e801a 100644 --- a/fastdeploy/worker/gcu_worker.py +++ b/fastdeploy/worker/gcu_worker.py @@ -69,11 +69,11 @@ class GcuWorker(WorkerBase): local_rank=self.local_rank, ) - def prefill_finished(self): + def exist_prefill(self): """ - check whether prefill stage finished + check whether prefill stage exists """ - return self.model_runner.prefill_finished() + return self.model_runner.exist_prefill() def determine_available_memory(self) -> int: """ diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 45dd69d597..ecdc4bf37e 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -148,9 +148,9 @@ class GPUModelRunner(ModelRunnerBase): self.local_rank + int(self.parallel_config.engine_worker_queue_port) ) - def prefill_finished(self): + def exist_prefill(self): """ - Check whether prefill stage finished + check whether prefill stage exists """ if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: return 1 diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py index 50ac4d231a..13270757b7 100644 --- a/fastdeploy/worker/gpu_worker.py +++ b/fastdeploy/worker/gpu_worker.py @@ -78,11 +78,11 @@ class GpuWorker(WorkerBase): local_rank=self.local_rank, ) - def prefill_finished(self): + def exist_prefill(self): """ - 
Check whether prefill stage finished + check whether prefill stage exists """ - return self.model_runner.prefill_finished() + return self.model_runner.exist_prefill() def determine_available_memory(self) -> int: """ diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index f110ee64df..f3ef3823c0 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -105,9 +105,9 @@ class IluvatarModelRunner(ModelRunnerBase): self.local_rank + int(self.parallel_config.engine_worker_queue_port) ) - def prefill_finished(self): + def exist_prefill(self): """ - check whether prefill stage finished + check whether prefill stage exists """ if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: return 1 diff --git a/fastdeploy/worker/iluvatar_worker.py b/fastdeploy/worker/iluvatar_worker.py index 76fcb558bc..8ed74c6fee 100644 --- a/fastdeploy/worker/iluvatar_worker.py +++ b/fastdeploy/worker/iluvatar_worker.py @@ -69,11 +69,11 @@ class IluvatarWorker(WorkerBase): local_rank=self.local_rank, ) - def prefill_finished(self): + def exist_prefill(self): """ - check whether prefill stage finished + check whether prefill stage exists """ - return self.model_runner.prefill_finished() + return self.model_runner.exist_prefill() def determine_available_memory(self) -> int: """ diff --git a/fastdeploy/worker/worker_base.py b/fastdeploy/worker/worker_base.py index 281776bb0e..8ff5a659fd 100644 --- a/fastdeploy/worker/worker_base.py +++ b/fastdeploy/worker/worker_base.py @@ -96,6 +96,6 @@ class WorkerBase(ABC): """Basic health check (override for device-specific checks).""" return NotImplementedError - def prefill_finished(self): - """check whether prefill stage finished.""" + def exist_prefill(self): + """check whether prefill stage exists.""" return True diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 8a12988c4f..72cd2b8ed6 100644 --- 
a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -286,7 +286,7 @@ class PaddleDisWorkerProc: if self.local_rank % mp_num_per_node == 0: if self.task_queue.num_tasks() > 0: # VL only support 1 batch to prefill - if not self.fd_config.model_config.enable_mm or not self.worker.prefill_finished(): + if not self.fd_config.model_config.enable_mm or not self.worker.exist_prefill(): if self.nnode > 1: self.task_queue.read_finish_flag.set(1) else: @@ -346,7 +346,7 @@ class PaddleDisWorkerProc: # Execute model to generate token. The generated token will be written to the buffer. # These generated tokens can be obtained through get_output op. self.worker.execute_model(req_dicts) - self.exist_prefill_task_signal.value[0] = self.worker.prefill_finished() + self.exist_prefill_task_signal.value[0] = self.worker.exist_prefill() def initialize_kv_cache(self) -> None: """Profiles the peak memory usage of the model to determine how many diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 3240b217e1..0c39cf3e0a 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -584,9 +584,9 @@ class XPUModelRunner(ModelRunnerBase): logger.warn("XPU not support cuda graph currently") pass - def prefill_finished(self): + def exist_prefill(self): """ - check whether prefill stage finished + check whether prefill stage exists """ if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: return 1 diff --git a/fastdeploy/worker/xpu_worker.py b/fastdeploy/worker/xpu_worker.py index 49b4a74ac0..5e1dc0c531 100644 --- a/fastdeploy/worker/xpu_worker.py +++ b/fastdeploy/worker/xpu_worker.py @@ -143,11 +143,11 @@ class XpuWorker(WorkerBase): output = self.model_runner.execute_model(model_forward_batch) return output - def prefill_finished(self): + def exist_prefill(self): """ - check whether prefill stage finished + check whether prefill stage exists """ - return 
self.model_runner.prefill_finished() + return self.model_runner.exist_prefill() def preprocess_new_task(self, req_dicts: List[Request]) -> None: """Process new requests and then start the decode loop