This reverts commit 90db0bdd0d.
@@ -283,16 +283,6 @@ class PaddleDisWorkerProc:
             create=False,
         )
 
-        # init engine forward signal
-        engine_forward_signal_data = np.zeros([1], dtype=np.int32)
-        self.engine_forward_signal = IPCSignal(
-            name="engine_forward_signal",
-            array=engine_forward_signal_data,
-            dtype=np.int32,
-            suffix=self.parallel_config.local_engine_worker_queue_port,
-            create=False,
-        )
-
     def update_weights_from_tensor(self, mmap_infos):
         """
         update_weights_from_tensor
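For orientation: `engine_forward_signal` followed FastDeploy's `IPCSignal` convention, a one-element numpy array backed by shared memory that the engine process allocates (`create=True`) and each worker process attaches to (`create=False`, as in the hunk above). Below is a minimal sketch of that handshake using only the constructor arguments visible in this diff; the import path and the port value are assumptions.

import numpy as np

from fastdeploy.inter_communicator import IPCSignal  # import path is an assumption

PORT = 8002  # hypothetical stand-in for parallel_config.local_engine_worker_queue_port

# Engine side: allocate the shared one-element flag.
flag = np.zeros([1], dtype=np.int32)
forward_signal = IPCSignal(
    name="engine_forward_signal",
    array=flag,
    dtype=np.int32,
    suffix=PORT,
    create=True,
)

# Worker side: attach to the same segment, mirroring the removed code above.
worker_flag = np.zeros([1], dtype=np.int32)
worker_signal = IPCSignal(
    name="engine_forward_signal",
    array=worker_flag,
    dtype=np.int32,
    suffix=PORT,
    create=False,
)

# The reverted commit raised the flag while a forward pass was in flight
# and lowered it when the worker went idle:
worker_signal.value[0] = 1  # busy
worker_signal.value[0] = 0  # idle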
@@ -450,6 +440,9 @@ class PaddleDisWorkerProc:
         # TODO: Unify status variables model_weights_status (shared memory) and model_weights_signal (numpy array) to one
         self.model_weights_signal = np.zeros([1], dtype=np.int32)
         while True:
+            # run eplb
+            self._run_eplb(tp_rank)
+
             if self.fd_config.load_config.dynamic_load_weight:
                 self.model_weights_signal[0] = int(self.model_weights_status.value[0])
                 if self.ranks > 1:
@@ -523,7 +516,7 @@ class PaddleDisWorkerProc:
 
             if self.exist_task_signal.value[0] == ExistTaskStatus.EXIST or self.task_queue.read_finish_flag.get() == 1:
                 logger.info(f"Rank: {self.local_rank} Detected new requests.")
-                self.engine_forward_signal.value[0] = 1
+
                 tasks, read_finish = self.task_queue.get_tasks()
                 # Only one of all tp_size client will get read_finish == True.
                 if read_finish:
@@ -532,48 +525,35 @@ class PaddleDisWorkerProc:
                     self.task_queue.read_finish_flag.set(0)
                 else:
                     self.exist_task_signal.value[0] = ExistTaskStatus.EMPTY
                 # In EP parallel (corresponding to dp attention), we need to barrier for prefill to prevent data imbalance due to inconsistent data arrival.
                 # Only EP + DP prefill should barrier for data arrival.
                 # In mixed mode and decoder in D, we should not barrier to influence decoding.
                 if self.parallel_config.use_ep and self.scheduler_config.splitwise_role == "prefill":
                     paddle.distributed.barrier(self.parallel_config.ep_group)
 
                 req_dicts, control_reqs = [], []
                 # In EP + DP prefill, empty task ([]) is delivered in worker to barrier. For empty task, just skip and continue.
-                if tasks[0][0]:
-                    for req_dict, bsz in tasks:
-                        if len(req_dict) > 0 and isinstance(req_dict[0], ControlRequest):
-                            control_reqs.append(req_dict[0])
-                        else:
-                            max_occupied_batch_index = int(bsz)
-                            req_dicts.extend(req_dict)
+                for req_dict, bsz in tasks:
+                    if len(req_dict) > 0 and isinstance(req_dict[0], ControlRequest):
+                        control_reqs.append(req_dict[0])
+                    else:
+                        max_occupied_batch_index = int(bsz)
+                        req_dicts.extend(req_dict)
 
-                    # todo: run control request async
-                    if len(control_reqs) > 0:
-                        logger.info(f"Rank: {self.local_rank} received {len(control_reqs)} control request.")
-                        for control_req in control_reqs:
-                            self.run_control_method(control_req)
-                        self._tp_barrier_wait() if tp_size > 1 else None
+                # todo: run control request async
+                if len(control_reqs) > 0:
+                    logger.info(f"Rank: {self.local_rank} received {len(control_reqs)} control request.")
+                    for control_req in control_reqs:
+                        self.run_control_method(control_req)
+                    self._tp_barrier_wait() if tp_size > 1 else None
 
-                    # Count prefill requests in current batch
-                    num_prefill_requests = sum(1 for req in req_dicts if req.task_type == RequestType.PREFILL)
-                    num_scheduled_requests = len(req_dicts)
-                    scheduled_request_ids = [req.request_id for req in req_dicts]
-                    logger.info(
-                        f"Rank: {self.local_rank}, num_prefill_requests: {num_prefill_requests}, "
-                        f"max_occupied_batch_index: {max_occupied_batch_index}, "
-                        f"num_scheduled_requests: {num_scheduled_requests}, "
-                        f"scheduled_request_ids: {scheduled_request_ids}"
-                    )
+                # Count prefill requests in current batch
+                num_prefill_requests = sum(1 for req in req_dicts if req.task_type == RequestType.PREFILL)
+                num_scheduled_requests = len(req_dicts)
+                scheduled_request_ids = [req.request_id for req in req_dicts]
+                logger.info(
+                    f"Rank: {self.local_rank}, num_prefill_requests: {num_prefill_requests}, "
+                    f"max_occupied_batch_index: {max_occupied_batch_index}, "
+                    f"num_scheduled_requests: {num_scheduled_requests}, "
+                    f"scheduled_request_ids: {scheduled_request_ids}"
+                )
 
-                    # Process prefill inputs
-                    self.worker.preprocess_new_task(req_dicts, max_occupied_batch_index)
-                else:
-                    if self.scheduler_config.splitwise_role == "prefill":
-                        if tp_size > 1:
-                            # Synchronize the signal for other workers
-                            self._tp_barrier_wait()
-                        continue
+                # Process prefill inputs
+                self.worker.preprocess_new_task(req_dicts, max_occupied_batch_index)
 
             if (
                 (not self.parallel_config.use_ep)
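The de-indented loop in this hunk splits each `(req_dict, bsz)` pair from the task queue into control requests and ordinary inference requests, taking `max_occupied_batch_index` from the most recent ordinary task. Below is a standalone sketch of that splitting logic; the `ControlRequest` and `Request` stand-ins and the sample data are fabricated for illustration.

from dataclasses import dataclass

@dataclass
class ControlRequest:   # stand-in for FastDeploy's ControlRequest
    action: str

@dataclass
class Request:          # stand-in for an ordinary inference request
    request_id: str

def split_tasks(tasks):
    """Mirror of the loop in this hunk: separate control requests from
    inference requests and record the batch index of the last ordinary task."""
    req_dicts, control_reqs = [], []
    max_occupied_batch_index = 0
    for req_dict, bsz in tasks:
        if len(req_dict) > 0 and isinstance(req_dict[0], ControlRequest):
            control_reqs.append(req_dict[0])
        else:
            max_occupied_batch_index = int(bsz)
            req_dicts.extend(req_dict)
    return req_dicts, control_reqs, max_occupied_batch_index

tasks = [([Request("req-0"), Request("req-1")], 2), ([ControlRequest("pause")], 0)]
print(split_tasks(tasks))  # 2 inference requests, 1 control request, index 2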
@@ -581,7 +561,7 @@ class PaddleDisWorkerProc:
                 and (not self.enable_overlap_schedule)
             ):
                 self._tp_barrier_wait() if tp_size > 1 else None
-                self.engine_forward_signal.value[0] = 0
+
                 time.sleep(0.001)
                 continue
 
@@ -593,9 +573,6 @@ class PaddleDisWorkerProc:
             if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
                 self.exist_prefill_task_signal.value[0] = self.worker.exist_prefill()
             logger.debug(f"execute model cost: {time.time()-start_execute_time:.5f} s")
-            # run eplb
-            self._run_eplb(tp_rank)
-            self.engine_forward_signal.value[0] = 0
 
     def initialize_kv_cache(self) -> None:
         """Profiles the peak memory usage of the model to determine how many
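One more note on the EP + DP prefill barrier that the fourth hunk keeps: `paddle.distributed.barrier(self.parallel_config.ep_group)` blocks until every rank in the group arrives, so a rank whose queue happens to be empty must still be driven through the same scheduling step, which is why the comments speak of empty tasks being delivered just to reach the barrier. Below is a toy illustration of that invariant with `threading.Barrier` standing in for the collective; the rank count, thread layout, and task data are all fabricated.

import threading

NUM_RANKS = 4
barrier = threading.Barrier(NUM_RANKS)  # stands in for paddle.distributed.barrier

def worker(rank, tasks):
    # Every rank must reach the barrier once per step, even with no work;
    # otherwise the ranks that did receive data would block forever.
    barrier.wait()
    if tasks:
        print(f"rank {rank}: prefill {len(tasks)} request(s)")
    else:
        print(f"rank {rank}: empty task, nothing to do this step")

threads = [
    threading.Thread(target=worker, args=(r, ["req"] if r % 2 == 0 else []))
    for r in range(NUM_RANKS)
]
for t in threads:
    t.start()
for t in threads:
    t.join()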