mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Feature] support eplb in api_server (#4782)
* support eplb in api_server * update code * add eplb test case * update eplb * support tp+dp eplb * update test cese * update code * update code * fix bug * update copilot review * update test case name
This commit is contained in:
@@ -1,3 +1,15 @@
|
||||
""" "
|
||||
Expert Parallelism Load Balancer (EPLB)
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
@@ -1,4 +1,18 @@
|
||||
"""AsyncExpertLoader async load the model weights of the MoE experts."""
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import ctypes
|
||||
import os
|
||||
@@ -8,8 +22,9 @@ from typing import List, Tuple
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
from cuda import cudart
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.config import EPLBConfig
|
||||
|
||||
REARRANGE_EXPERT_MAGIC_NUM = 147183647
|
||||
REARRANGE_ORIGINATOR_EP_RANK = 0
|
||||
@@ -17,7 +32,6 @@ CHECK_TIME_INTERNAL = 3
|
||||
HTTP_RETRY_NUM = 5
|
||||
CHECK_TIMEOUT = 120
|
||||
|
||||
|
||||
libc = ctypes.CDLL(None)
|
||||
|
||||
libc.mmap.argtypes = [
|
||||
@@ -45,22 +59,19 @@ MAIN_MODEL_REDUNDANT_SHM_SIZE = 5
|
||||
MODEL_MAIN_NAME = "eplb_main"
|
||||
|
||||
|
||||
def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, logger=None):
|
||||
def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, eplb_config: EPLBConfig, logger=None):
|
||||
"""create_mmap"""
|
||||
flags = MAP_SHARED
|
||||
prot = PROT_READ | PROT_WRITE
|
||||
|
||||
main_size = 0
|
||||
if envs.FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB == 0:
|
||||
if eplb_config.redundant_expert_async_load_model_shmem_size_gb == 0:
|
||||
main_size = TOTAL_MODEL_SIZE // ep_size
|
||||
else:
|
||||
main_size = envs.FD_REDUNDANT_EXPERT_ASYNC_LOAD_MODEL_SHMEM_SIZE_GB
|
||||
main_size = eplb_config.redundant_expert_async_load_model_shmem_size_gb
|
||||
main_size = main_size * G
|
||||
|
||||
mmap_infos = {}
|
||||
|
||||
from cuda import cudart
|
||||
|
||||
for name in model_name:
|
||||
expert_weight_file = f"/dev/shm/{name}_rank_{ep_rank}_expert_weight_{shm_uuid}"
|
||||
shm_size = main_size
|
||||
@@ -70,10 +81,7 @@ def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, log
|
||||
shm_fd = os.open(expert_weight_file, os.O_RDWR)
|
||||
os.ftruncate(shm_fd, shm_size)
|
||||
if logger is not None:
|
||||
logger.info(
|
||||
f"redundant_expert: create_mmap file {expert_weight_file}, \
|
||||
fd {shm_fd}, size {shm_size}"
|
||||
)
|
||||
logger.info(f"redundant_expert: create_mmap file {expert_weight_file}, fd {shm_fd}, size {shm_size}")
|
||||
|
||||
shm_ptr = libc.mmap(0, ctypes.c_size_t(shm_size), prot, flags, shm_fd, 0)
|
||||
if shm_ptr == MAP_FAILED:
|
||||
@@ -86,8 +94,8 @@ def create_mmap(model_name: List, ep_rank: int, ep_size: int, shm_uuid: str, log
|
||||
(ret,) = cudart.cudaHostRegister(addr, shm_size, 0)
|
||||
if ret != cudart.cudaError_t.cudaSuccess:
|
||||
raise RuntimeError(
|
||||
f"cudaHostRegister failed: {cudart.cudaGetErrorString(ret)},"
|
||||
+ f" address {hex(addr)} size {shm_size}, ret: {ret}"
|
||||
f"cudaHostRegister failed: {cudart.cudaGetErrorString(ret)}, "
|
||||
f" address {hex(addr)} size {shm_size}, ret: {ret}"
|
||||
)
|
||||
|
||||
mmap_infos[name] = shm_ptr
|
||||
@@ -173,6 +181,7 @@ class AsyncEPLoader(object):
|
||||
def __init__(
|
||||
self,
|
||||
model_dir,
|
||||
eplb_config,
|
||||
rank=8,
|
||||
expert_per_rank=8,
|
||||
moe_layer_start_index=3,
|
||||
@@ -183,6 +192,7 @@ class AsyncEPLoader(object):
|
||||
__init__
|
||||
"""
|
||||
self.model_path = model_dir
|
||||
self.eplb_config = eplb_config
|
||||
|
||||
self.expert_per_rank = expert_per_rank
|
||||
self.moe_layer_start_index = moe_layer_start_index
|
||||
@@ -239,7 +249,7 @@ class AsyncEPLoader(object):
|
||||
succ = True
|
||||
message = ""
|
||||
if len(need_to_reload) > 0:
|
||||
if envs.FD_MODEL_USE_SAFETENSORS:
|
||||
if self.eplb_config.model_use_safetensors:
|
||||
succ, message = self.load_safetensor_fp8_from_disk(need_to_reload)
|
||||
else:
|
||||
succ, message = self.load_weight_bf16_from_disk(need_to_reload)
|
||||
@@ -278,7 +288,7 @@ class AsyncEPLoader(object):
|
||||
# self.logger.info(f"redundant_expert: {file_name} not exist.")
|
||||
continue
|
||||
# self.logger.info(f"redundant_expert: Loading expert weights: {file_name}.")
|
||||
self.state_dicts[file_name] = paddle.load(self.model_path + "/merged_tp1_state_split/" + file_name)
|
||||
# self.state_dicts[file_name] = paddle.load(self.model_path + "/merged_tp1_state_split/" + file_name)
|
||||
|
||||
paddle.set_device(last_device)
|
||||
self.logger.info("redundant_expert: Loading expert weights end.")
|
||||
@@ -343,7 +353,15 @@ def load_ep_checkpoint(model_path):
|
||||
|
||||
|
||||
def load_model_weights_process(
|
||||
rank: int, expert_per_rank: int, moe_layer_start_index: int, moe_quant_type: str, data_conn, mg_conn, shm_uuid
|
||||
rank: int,
|
||||
model_dir: str,
|
||||
expert_per_rank: int,
|
||||
moe_layer_start_index: int,
|
||||
moe_quant_type: str,
|
||||
shm_uuid: str,
|
||||
eplb_config: EPLBConfig,
|
||||
data_conn,
|
||||
mg_conn,
|
||||
):
|
||||
"""
|
||||
load_model_weights_process
|
||||
@@ -354,18 +372,20 @@ def load_model_weights_process(
|
||||
|
||||
setproctitle(f"eplb::async_load_model_{rank}")
|
||||
faulthandler.enable()
|
||||
from server.utils import get_logger
|
||||
from fastdeploy.utils import get_logger
|
||||
|
||||
logger = get_logger("eplb_async_loader", "eplb_{0}.log".format(rank))
|
||||
logger.info("redundant_expert: load_model_weights_process start")
|
||||
|
||||
paddle.set_device("cpu")
|
||||
ep_loader = AsyncEPLoader(
|
||||
model_dir=model_dir,
|
||||
rank=rank,
|
||||
expert_per_rank=expert_per_rank,
|
||||
moe_layer_start_index=moe_layer_start_index,
|
||||
moe_quant_type=moe_quant_type,
|
||||
logger=logger,
|
||||
eplb_config=eplb_config,
|
||||
)
|
||||
|
||||
while True:
|
||||
|
||||
+15
-12
@@ -1,4 +1,18 @@
|
||||
"""Expert Parallelism Load Balancer (EPLB)"""
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import Tuple
|
||||
|
||||
@@ -9,11 +23,9 @@ def balanced_packing(weight: np.ndarray, num_packs: int) -> Tuple[np.ndarray, np
|
||||
"""
|
||||
Pack n weighted objects to m packs, such that each bin contains exactly n/m objects and the weights of all packs
|
||||
are as balanced as possible.
|
||||
|
||||
Parameters:
|
||||
weight: [X, n], the weight of each item
|
||||
num_packs: number of packs
|
||||
|
||||
Returns:
|
||||
pack_index: [X, n], the pack index of each item
|
||||
rank_in_pack: [X, n], the rank of the item in the pack
|
||||
@@ -49,11 +61,9 @@ def balanced_packing(weight: np.ndarray, num_packs: int) -> Tuple[np.ndarray, np
|
||||
def replicate_experts(weight: np.ndarray, num_phy: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
|
||||
"""
|
||||
Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized.
|
||||
|
||||
Parameters:
|
||||
weight: [X, num_log]
|
||||
num_phy: total number of experts after replication
|
||||
|
||||
Returns:
|
||||
phy2log: [X, num_phy], logical expert id of each physical expert
|
||||
rank: [X, num_phy], the replica rank
|
||||
@@ -88,7 +98,6 @@ def rebalance_experts_intra_node(
|
||||
num_groups: number of expert groups
|
||||
num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
|
||||
num_gpus: number of GPUs, must be a multiple of `num_nodes`
|
||||
|
||||
Returns:
|
||||
physical_to_logical_map: [num_moe_layers, num_physical_experts]
|
||||
logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
|
||||
@@ -155,7 +164,6 @@ def rebalance_experts_hierarchical(
|
||||
num_groups: number of expert groups
|
||||
num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
|
||||
num_gpus: number of GPUs, must be a multiple of `num_nodes`
|
||||
|
||||
Returns:
|
||||
physical_to_logical_map: [num_moe_layers, num_physical_experts]
|
||||
logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
|
||||
@@ -215,14 +223,12 @@ def rebalance_experts(
|
||||
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
|
||||
"""
|
||||
Entry point for expert-parallelism load balancer.
|
||||
|
||||
Parameters:
|
||||
weight: [layers, num_logical_experts], the load statistics for all logical experts
|
||||
num_replicas: number of physical experts, must be a multiple of `num_gpus`
|
||||
num_groups: number of expert groups
|
||||
num_nodes: number of server nodes, where the intra-node network (e.g, NVLink) is faster
|
||||
num_gpus: number of GPUs, must be a multiple of `num_nodes`
|
||||
|
||||
Returns:
|
||||
physical_to_logical_map: [layers, num_replicas], the expert index of each replica
|
||||
logical_to_physical_map: [layers, num_logical_experts, X], the replica indices for each expert
|
||||
@@ -267,9 +273,6 @@ def main():
|
||||
num_nodes = 4
|
||||
num_gpus = 4 * 8
|
||||
|
||||
# model_tokens_per_expert_stats_list = np.ones(
|
||||
# (num_hidden_layers, num_expert), dtype=int)
|
||||
|
||||
model_tokens_per_expert_stats_list = np.random.randint(low=1, high=10, size=(num_hidden_layers, num_expert))
|
||||
|
||||
phy2log, phyrank, logcnt = rebalance_experts(
|
||||
|
||||
+121
-218
@@ -1,19 +1,33 @@
|
||||
"""
|
||||
redundant expert manger
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import threading
|
||||
import time
|
||||
from http import HTTPStatus
|
||||
from multiprocessing import Pipe, Process, shared_memory
|
||||
from multiprocessing import Pipe, Process
|
||||
|
||||
import numpy as np
|
||||
import requests
|
||||
|
||||
from fastdeploy.config import FDConfig
|
||||
from fastdeploy.eplb.async_expert_loader import load_model_weights_process
|
||||
from fastdeploy.eplb.eplb import rebalance_experts
|
||||
from fastdeploy.eplb.utils import RearrangeExpertState, RedundantExpertWorkload
|
||||
from fastdeploy.utils import envs, get_logger
|
||||
from fastdeploy.eplb.utils import RedundantExpertWorkload
|
||||
from fastdeploy.inter_communicator import IPCSignal, RearrangeExpertStatus
|
||||
from fastdeploy.utils import get_logger
|
||||
|
||||
|
||||
class RedundantExpertManager:
|
||||
@@ -21,7 +35,13 @@ class RedundantExpertManager:
|
||||
RedundantExpertManger
|
||||
"""
|
||||
|
||||
def __init__(self, rank=0, ep_size=64, fd_config=None):
|
||||
def __init__(
|
||||
self,
|
||||
rank: int = 0,
|
||||
ep_size: int = 32,
|
||||
fd_config: FDConfig = None,
|
||||
ipc_signal_suffix: int = 0,
|
||||
):
|
||||
self.logger = get_logger("eplb_expert_manager", "eplb_{0}.log".format(rank))
|
||||
|
||||
self.rank = rank
|
||||
@@ -30,9 +50,11 @@ class RedundantExpertManager:
|
||||
self.eplb_config = fd_config.eplb_config
|
||||
self.api_user = self.eplb_config.redundant_expert_api_user
|
||||
self.api_passwd = self.eplb_config.redundant_expert_api_password
|
||||
self.num_hidden_layers = self.eplb_config.model_config.num_layers
|
||||
self.num_logical_experts = self.eplb_config.model_config.moe_num_experts
|
||||
self.num_redundant_experts = self.eplb_config.redundant_experts_num
|
||||
self.num_hidden_layers = self.fd_config.model_config.num_hidden_layers
|
||||
self.num_logical_experts = self.fd_config.model_config.moe_num_experts
|
||||
self.ipc_signal_suffix = ipc_signal_suffix
|
||||
self.local_rank = self.rank % self.fd_config.parallel_config.tensor_parallel_size
|
||||
|
||||
self.num_replicas = self.num_logical_experts + self.num_redundant_experts
|
||||
self.num_groups = self.num_logical_experts
|
||||
@@ -112,9 +134,12 @@ class RedundantExpertManager:
|
||||
name=f"eplb::async_load_model_{rank}",
|
||||
args=(
|
||||
self.rank,
|
||||
self.fd_config.model_config.model,
|
||||
self.expert_per_rank,
|
||||
self.fd_config.model_config.moe_layer_start_index,
|
||||
self.eplb_config.moe_quant_type,
|
||||
self.ipc_signal_suffix,
|
||||
self.eplb_config,
|
||||
child_data_conn,
|
||||
child_mg_conn,
|
||||
),
|
||||
@@ -130,9 +155,6 @@ class RedundantExpertManager:
|
||||
strategy {self.eplb_config.redundant_expert_eplb_strategy}"
|
||||
)
|
||||
|
||||
def get_unique_name(self, name):
|
||||
return f"{envs.get_unique_name(name + '_dprank_' + str(self.rank))}"
|
||||
|
||||
def get_ep_rank_to_expert_id_list(self):
|
||||
"""
|
||||
get_ep_rank_to_expert_id_list
|
||||
@@ -147,66 +169,84 @@ class RedundantExpertManager:
|
||||
"""
|
||||
listen_rearrange_expert_signal
|
||||
"""
|
||||
if self.rank == 0:
|
||||
rearrange_experts_ips_size = np.zeros([1], dtype=np.int32)
|
||||
shm_rearrange_experts_ips_size = shared_memory.SharedMemory(
|
||||
dp_ipc_signal_suffix = f"{self.ipc_signal_suffix}_dp{self.fd_config.parallel_config.local_data_parallel_id}"
|
||||
if self.local_rank == 0:
|
||||
rearrange_experts_ips_size_array = np.zeros([1], dtype=np.int32)
|
||||
rearrange_experts_ips_size_signal = IPCSignal(
|
||||
name="rearrange_experts_ips_size",
|
||||
array=rearrange_experts_ips_size_array,
|
||||
dtype=np.int32,
|
||||
suffix=dp_ipc_signal_suffix,
|
||||
create=False,
|
||||
size=rearrange_experts_ips_size.nbytes,
|
||||
name=self.get_unique_name("rearrange_experts_ips_size"),
|
||||
)
|
||||
rearrange_experts_ips_size_array = np.ndarray(
|
||||
rearrange_experts_ips_size.shape,
|
||||
dtype=rearrange_experts_ips_size.dtype,
|
||||
buffer=shm_rearrange_experts_ips_size.buf,
|
||||
)
|
||||
shm_rearrange_experts_ips_list = shared_memory.SharedMemory(
|
||||
|
||||
shm_rearrange_experts_ips_list = IPCSignal(
|
||||
name="rearrange_experts_ips_list",
|
||||
shm_size=self.eplb_config.redundant_expert_ip_shm_size,
|
||||
suffix=dp_ipc_signal_suffix,
|
||||
create=False,
|
||||
size=1024,
|
||||
name=self.get_unique_name("rearrange_experts_ips_list"),
|
||||
)
|
||||
|
||||
rearrange_experts_status = np.zeros([1], dtype=np.int32)
|
||||
shm_rearrange_experts_status = shared_memory.SharedMemory(
|
||||
rearrange_experts_signal = IPCSignal(
|
||||
name="rearrange_experts_status",
|
||||
array=rearrange_experts_status,
|
||||
dtype=np.int32,
|
||||
suffix=dp_ipc_signal_suffix,
|
||||
create=False,
|
||||
size=rearrange_experts_status.nbytes,
|
||||
name=self.get_unique_name("rearrange_experts_status"),
|
||||
)
|
||||
rearrange_experts_status_array = np.ndarray(
|
||||
rearrange_experts_status.shape,
|
||||
dtype=rearrange_experts_status.dtype,
|
||||
buffer=shm_rearrange_experts_status.buf,
|
||||
)
|
||||
|
||||
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
|
||||
self.signal_update_weight_from_tensor_array = IPCSignal(
|
||||
name="signal_update_weight_from_tensor",
|
||||
array=signal_update_weight_from_tensor,
|
||||
dtype=np.int32,
|
||||
suffix=dp_ipc_signal_suffix,
|
||||
create=False,
|
||||
)
|
||||
|
||||
tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp{self.local_rank}"
|
||||
signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
|
||||
shm_signal_update_weight_from_disk = shared_memory.SharedMemory(
|
||||
signal_update_weight_from_disk_array = IPCSignal(
|
||||
name="signal_update_weight_from_disk",
|
||||
array=signal_update_weight_from_disk,
|
||||
dtype=np.int32,
|
||||
suffix=tp_ipc_signal_suffix,
|
||||
create=False,
|
||||
size=signal_update_weight_from_disk.nbytes,
|
||||
name=self.get_unique_name("signal_update_weight_from_disk"),
|
||||
)
|
||||
signal_update_weight_from_disk_array = np.ndarray(
|
||||
signal_update_weight_from_disk.shape,
|
||||
dtype=signal_update_weight_from_disk.dtype,
|
||||
buffer=shm_signal_update_weight_from_disk.buf,
|
||||
)
|
||||
|
||||
experts_token_stats = np.zeros((self.num_hidden_layers, 64), dtype=np.int32)
|
||||
shm_all_experts_token_stats = shared_memory.SharedMemory(
|
||||
experts_token_stats = np.zeros(
|
||||
(self.fd_config.model_config.num_hidden_layers, self.fd_config.model_config.moe_num_experts),
|
||||
dtype=np.int32,
|
||||
)
|
||||
shm_all_experts_token_stats = IPCSignal(
|
||||
name="all_experts_token_stats",
|
||||
array=experts_token_stats,
|
||||
dtype=np.int32,
|
||||
suffix=tp_ipc_signal_suffix,
|
||||
create=False,
|
||||
)
|
||||
|
||||
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
|
||||
self.update_weight_from_disk_result = IPCSignal(
|
||||
name="result_update_weight_from_disk",
|
||||
array=result_update_weight_from_disk,
|
||||
dtype=np.int32,
|
||||
suffix=tp_ipc_signal_suffix,
|
||||
create=False,
|
||||
size=experts_token_stats.nbytes,
|
||||
name=self.get_unique_name("all_experts_token_stats"),
|
||||
)
|
||||
|
||||
while True:
|
||||
if self.rank == 0:
|
||||
if self.local_rank == 0:
|
||||
now = int(time.time())
|
||||
if rearrange_experts_ips_size_array[0] > 0:
|
||||
if rearrange_experts_ips_size_signal.value[0] > 0:
|
||||
# step 1. all reduce experts token stats
|
||||
address = bytes(shm_rearrange_experts_ips_list.buf[: rearrange_experts_ips_size_array[0]]).decode(
|
||||
"utf-8"
|
||||
)
|
||||
address = bytes(
|
||||
shm_rearrange_experts_ips_list.shm.buf[: rearrange_experts_ips_size_signal.value[0]]
|
||||
).decode("utf-8")
|
||||
self.logger.info(f"redundant_expert: all rank ips {address}")
|
||||
rearrange_experts_ips_size_array[0] = 0
|
||||
rearrange_experts_status_array[0] = RearrangeExpertState.doing.value
|
||||
rearrange_experts_ips_size_signal.value[0] = 0
|
||||
rearrange_experts_signal.value[0] = RearrangeExpertStatus.DOING.value
|
||||
|
||||
self.dp_rank_address = address.strip().split(";")
|
||||
if self.allreduce_experts_stat():
|
||||
@@ -214,30 +254,25 @@ class RedundantExpertManager:
|
||||
self.load_weight_begin_ts = now
|
||||
self.logger.info("redundant_expert: all-reduce experts stats success")
|
||||
else:
|
||||
rearrange_experts_status_array[0] = RearrangeExpertState.free.value
|
||||
rearrange_experts_signal.value[0] = RearrangeExpertStatus.FREE.value
|
||||
self.logger.warning("redundant_expert: all-reduce experts stats fail")
|
||||
elif self.need_allgather_load_weight_result and self.allreduce_load_weight_result():
|
||||
# step 3. all reduce the result of load weight from disk
|
||||
self.need_allgather_load_weight_result = False
|
||||
rearrange_experts_status_array[0] = RearrangeExpertState.load_succ.value
|
||||
rearrange_experts_signal.value[0] = RearrangeExpertStatus.LOAD_SUCC.value
|
||||
self.rearrange_end_ts = now
|
||||
if rearrange_experts_status_array[0] > 1 and (
|
||||
if rearrange_experts_signal.value[0] > 1 and (
|
||||
now - self.rearrange_end_ts > self.rearrange_reset_interval
|
||||
):
|
||||
# reset rearrange status
|
||||
rearrange_experts_status_array[0] = RearrangeExpertState.free.value
|
||||
rearrange_experts_signal.value[0] = RearrangeExpertStatus.FREE.value
|
||||
|
||||
if signal_update_weight_from_disk_array[0] == 1:
|
||||
if signal_update_weight_from_disk_array.value[0] == 1:
|
||||
# step 2. async load weight: disk -> memory
|
||||
expert_token_stats = np.ndarray(
|
||||
experts_token_stats.shape,
|
||||
dtype=experts_token_stats.dtype,
|
||||
buffer=shm_all_experts_token_stats.buf,
|
||||
)
|
||||
self.model_tokens_per_expert_stats_list[:] = expert_token_stats[:]
|
||||
self.model_tokens_per_expert_stats_list[:] = shm_all_experts_token_stats.value[:]
|
||||
self.caculate_expert_rank_table()
|
||||
self.update_weight_from_disk()
|
||||
signal_update_weight_from_disk_array[0] = 0
|
||||
signal_update_weight_from_disk_array.value[0] = 0
|
||||
time.sleep(0.5)
|
||||
|
||||
def caculate_expert_rank_table(self, is_init=False):
|
||||
@@ -274,7 +309,7 @@ class RedundantExpertManager:
|
||||
self.model_expert_id_to_ep_rank_array[..., : logical_to_physical_map.shape[-1]] = logical_to_physical_map[:]
|
||||
self.model_expert_in_rank_num_list[:] = expert_count[:]
|
||||
|
||||
if self.rank == 0:
|
||||
if self.local_rank == 0:
|
||||
workload = RedundantExpertWorkload()
|
||||
workload.tokens_per_expert_stats_list = self.model_tokens_per_expert_stats_list.tolist()
|
||||
workload.ep_rank_to_expert_id_list = rank_expert_list.tolist()
|
||||
@@ -287,18 +322,7 @@ class RedundantExpertManager:
|
||||
update_weight_from_disk
|
||||
"""
|
||||
begin_time = time.time()
|
||||
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
|
||||
shm_result_update_weight_from_disk = shared_memory.SharedMemory(
|
||||
create=False,
|
||||
size=result_update_weight_from_disk.nbytes,
|
||||
name=self.get_unique_name("result_update_weight_from_disk"),
|
||||
)
|
||||
result_update_weight_from_disk_array = np.ndarray(
|
||||
result_update_weight_from_disk.shape,
|
||||
dtype=result_update_weight_from_disk.dtype,
|
||||
buffer=shm_result_update_weight_from_disk.buf,
|
||||
)
|
||||
result_update_weight_from_disk_array[0] = 0
|
||||
self.update_weight_from_disk_result.value[0] = 0
|
||||
|
||||
self.logger.info(f"redundant_expert: update_weight_from_disk send to async process, rank {self.rank}")
|
||||
self.parent_mg_conn.send(
|
||||
@@ -312,7 +336,7 @@ class RedundantExpertManager:
|
||||
self.tensor_infos = response["weights"]
|
||||
|
||||
# 更新权重加载结果
|
||||
result_update_weight_from_disk_array[0] = 1 if response["result"] else -1
|
||||
self.update_weight_from_disk_result.value[0] = 1 if response["result"] else -1
|
||||
self.logger.info(
|
||||
"redundant_expert: update_weight_from_disk end, rank"
|
||||
+ f" {self.rank} {response['result']}, cost {int(time.time() - begin_time)}s"
|
||||
@@ -330,8 +354,8 @@ class RedundantExpertManager:
|
||||
"""
|
||||
allgather_expert_token_stats
|
||||
"""
|
||||
success_count = 0
|
||||
expert_token_stats = np.zeros((self.num_hidden_layers, self.num_logical_experts), dtype=np.int32)
|
||||
success_count = 0
|
||||
for addr in self.dp_rank_address:
|
||||
try:
|
||||
# TODO: 请求失败重试
|
||||
@@ -347,8 +371,10 @@ class RedundantExpertManager:
|
||||
+ f"addr {addr}, res {res.status_code} {res.json()}"
|
||||
)
|
||||
break
|
||||
|
||||
for meta_data in res.json()["data"]:
|
||||
expert_token_stats += np.array(meta_data, dtype=np.int32)
|
||||
success_count += 1
|
||||
expert_token_stats += np.array(res.json()["data"], dtype=np.int32)
|
||||
except Exception as e:
|
||||
self.logger.error(f"redundant_expert: allgather_expert_token_stats fail. addr {addr}, error {e}")
|
||||
if success_count == len(self.dp_rank_address):
|
||||
@@ -426,18 +452,7 @@ class RedundantExpertManager:
|
||||
or not self.eplb_config.redundant_expert_enable_schedule_cordon
|
||||
):
|
||||
self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")
|
||||
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
|
||||
shm_signal_update_weight_from_tensor = shared_memory.SharedMemory(
|
||||
create=False,
|
||||
size=signal_update_weight_from_tensor.nbytes,
|
||||
name=self.get_unique_name("signal_update_weight_from_tensor"),
|
||||
)
|
||||
signal_update_weight_from_tensor_array = np.ndarray(
|
||||
signal_update_weight_from_tensor.shape,
|
||||
dtype=signal_update_weight_from_tensor.dtype,
|
||||
buffer=shm_signal_update_weight_from_tensor.buf,
|
||||
)
|
||||
signal_update_weight_from_tensor_array[0] = 1
|
||||
self.signal_update_weight_from_tensor_array.value[0] = 1
|
||||
return True
|
||||
|
||||
def allgather_load_weight_result(self):
|
||||
@@ -465,140 +480,28 @@ class RedundantExpertManager:
|
||||
+ f"addr {addr}, res {res.status_code} {res.json()}"
|
||||
)
|
||||
break
|
||||
result = res.json()["data"]
|
||||
result_list = res.json()["data"]
|
||||
self.logger.info(
|
||||
f"redundant_expert: allgather_load_weight_result success. addr {addr}, result {result}"
|
||||
f"redundant_expert: allgather_load_weight_result success. addr {addr}, result_list {result_list}"
|
||||
)
|
||||
if result == 1:
|
||||
success_count += 1
|
||||
elif result == -1:
|
||||
fail_count += 1
|
||||
self.logger.error(
|
||||
f"redundant_expert: allgather_load_weight_result fail. addr {addr}, result {result}"
|
||||
)
|
||||
exist_fail = True
|
||||
for result in result_list:
|
||||
if result == 1:
|
||||
success_count += 1
|
||||
elif result == -1:
|
||||
fail_count += 1
|
||||
self.logger.error(
|
||||
f"redundant_expert: allgather_load_weight_result fail. addr {addr}, result {result}"
|
||||
)
|
||||
exist_fail = True
|
||||
except Exception as e:
|
||||
self.logger.error(f"redundant_expert: allgather_load_weight_result error. addr {addr}, error {e}")
|
||||
if success_count == len(self.dp_rank_address):
|
||||
self.logger.info("redundant_expert: allgather_load_weight_result all success")
|
||||
all_success = True
|
||||
else:
|
||||
|
||||
if fail_count > 0:
|
||||
self.logger.info(
|
||||
"redundant_expert: allgather_load_weight_result not all ready, "
|
||||
+ f"succ {success_count} fail {fail_count} total {len(self.dp_rank_address)}"
|
||||
)
|
||||
else:
|
||||
self.logger.info("redundant_expert: allgather_load_weight_result all success")
|
||||
all_success = True
|
||||
return all_success, exist_fail
|
||||
|
||||
|
||||
def init_shared_memory_for_eplb_rank0(rank):
|
||||
rearrange_experts_ips_size = np.zeros([1], dtype=np.int32)
|
||||
shm_rearrange_experts_ips_size = shared_memory.SharedMemory(
|
||||
create=True,
|
||||
size=rearrange_experts_ips_size.nbytes,
|
||||
name=f"{envs.get_unique_name('rearrange_experts_ips_size_dprank' + rank)}",
|
||||
)
|
||||
rearrange_experts_ips_size_array = np.ndarray(
|
||||
rearrange_experts_ips_size.shape,
|
||||
dtype=rearrange_experts_ips_size.dtype,
|
||||
buffer=shm_rearrange_experts_ips_size.buf,
|
||||
)
|
||||
shm_rearrange_experts_ips_list = shared_memory.SharedMemory(
|
||||
create=True,
|
||||
size=envs.FD_REDUNDANT_EXPERT_IP_SHM_SIZE,
|
||||
name=f"{envs.get_unique_name('rearrange_experts_ips_list_dprank' + rank)}",
|
||||
)
|
||||
# 记录专家重排状态
|
||||
rearrange_experts_status = np.zeros([1], dtype=np.int32)
|
||||
shm_rearrange_experts_status = shared_memory.SharedMemory(
|
||||
create=True,
|
||||
size=rearrange_experts_status.nbytes,
|
||||
name=f"{envs.get_unique_name('rearrange_experts_status_dprank' + rank)}",
|
||||
)
|
||||
rearrange_experts_status_array = np.ndarray(
|
||||
rearrange_experts_status.shape, dtype=rearrange_experts_status.dtype, buffer=shm_rearrange_experts_status.buf
|
||||
)
|
||||
# 接收更新权重的信号
|
||||
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
|
||||
shm_signal_update_weight_from_tensor = shared_memory.SharedMemory(
|
||||
create=True,
|
||||
size=signal_update_weight_from_tensor.nbytes,
|
||||
name=f"{envs.get_unique_name('signal_update_weight_from_tensor_dprank' + rank) }",
|
||||
)
|
||||
signal_update_weight_from_tensor_array = np.ndarray(
|
||||
signal_update_weight_from_tensor.shape,
|
||||
dtype=signal_update_weight_from_tensor.dtype,
|
||||
buffer=shm_signal_update_weight_from_tensor.buf,
|
||||
)
|
||||
return (
|
||||
rearrange_experts_ips_size_array,
|
||||
shm_rearrange_experts_ips_list,
|
||||
rearrange_experts_status_array,
|
||||
signal_update_weight_from_tensor_array,
|
||||
)
|
||||
|
||||
|
||||
def init_shared_memory_for_eplb_each_rank(fd_config, rank):
|
||||
# 记录专家负载
|
||||
num_layers = fd_config.model_config.num_hidden_layers
|
||||
num_experts = fd_config.model_config.moe_num_experts
|
||||
experts_token_stats = np.zeros((num_layers, num_experts), dtype=np.int32)
|
||||
shm_local_experts_token_stats = shared_memory.SharedMemory(
|
||||
create=True,
|
||||
size=experts_token_stats.nbytes,
|
||||
name=f"{envs.get_unique_name('local_experts_token_stats_dprank' + rank)}",
|
||||
)
|
||||
local_experts_token_stats_array = np.ndarray(
|
||||
experts_token_stats.shape, dtype=experts_token_stats.dtype, buffer=shm_local_experts_token_stats.buf
|
||||
)
|
||||
# TODO: 全局专家负载状态是一样的,节点上的所有DP可以共用一份,但需要避免多个DP同时更新
|
||||
shm_all_experts_token_stats = shared_memory.SharedMemory(
|
||||
create=True,
|
||||
size=experts_token_stats.nbytes,
|
||||
name=f"{envs.get_unique_name('all_experts_token_stats_dprank' + rank)}",
|
||||
)
|
||||
expert_tokens_stats_array = np.ndarray(
|
||||
experts_token_stats.shape, dtype=experts_token_stats.dtype, buffer=shm_all_experts_token_stats.buf
|
||||
)
|
||||
# 接收加载权重的信号
|
||||
signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
|
||||
shm_signal_update_weight_from_disk = shared_memory.SharedMemory(
|
||||
create=True,
|
||||
size=signal_update_weight_from_disk.nbytes,
|
||||
name=f"{envs.get_unique_name('signal_update_weight_from_disk_dprank' + rank)}",
|
||||
)
|
||||
signal_update_weight_from_disk_array = np.ndarray(
|
||||
signal_update_weight_from_disk.shape,
|
||||
dtype=signal_update_weight_from_disk.dtype,
|
||||
buffer=shm_signal_update_weight_from_disk.buf,
|
||||
)
|
||||
# 记录加载权重的结果
|
||||
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
|
||||
shm_result_update_weight_from_disk = shared_memory.SharedMemory(
|
||||
create=True,
|
||||
size=result_update_weight_from_disk.nbytes,
|
||||
name=f"{envs.get_unique_name('result_update_weight_from_disk_dprank' + rank)}",
|
||||
)
|
||||
result_update_weight_from_disk_array = np.ndarray(
|
||||
result_update_weight_from_disk.shape,
|
||||
dtype=result_update_weight_from_disk.dtype,
|
||||
buffer=shm_result_update_weight_from_disk.buf,
|
||||
)
|
||||
# 接收清零专家负载的信号
|
||||
signal_clear_experts_token_stats = np.zeros([1], dtype=np.int32)
|
||||
shm_signal_clear_experts_token_stats = shared_memory.SharedMemory(
|
||||
create=True,
|
||||
size=signal_clear_experts_token_stats.nbytes,
|
||||
name=f"{envs.get_unique_name('signal_clear_experts_token_stats_dprank' + rank)}",
|
||||
)
|
||||
signal_clear_experts_token_stats_array = np.ndarray(
|
||||
signal_clear_experts_token_stats.shape,
|
||||
dtype=signal_clear_experts_token_stats.dtype,
|
||||
buffer=shm_signal_clear_experts_token_stats.buf,
|
||||
)
|
||||
return (
|
||||
local_experts_token_stats_array,
|
||||
expert_tokens_stats_array,
|
||||
signal_update_weight_from_disk_array,
|
||||
result_update_weight_from_disk_array,
|
||||
signal_clear_experts_token_stats_array,
|
||||
)
|
||||
|
||||
+114
-8
@@ -1,9 +1,27 @@
|
||||
"""eplb utilities"""
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
from enum import Enum
|
||||
|
||||
import numpy as np
|
||||
|
||||
from fastdeploy.config import FDConfig
|
||||
from fastdeploy.inter_communicator import IPCSignal
|
||||
|
||||
|
||||
class RedundantExpertWorkload:
|
||||
@@ -47,13 +65,101 @@ class RedundantExpertWorkload:
|
||||
return {}, f"redundant_expert: load file {self.meta_file_name} failed, {e}"
|
||||
|
||||
|
||||
class RearrangeExpertState(Enum):
|
||||
"""RearrangeExpertState"""
|
||||
def init_eplb_signals(config: FDConfig, ipc_signal_suffix):
|
||||
"""
|
||||
Initialize shared memory to indicate eplb status
|
||||
"""
|
||||
if config.parallel_config.tensor_parallel_rank != 0:
|
||||
# only TP rank 0 need to init eplb signals, rank 0 manage all EPLB signals for all TP ranks
|
||||
return
|
||||
|
||||
free = 0
|
||||
doing = 1
|
||||
load_succ = 2 # load weight from disk success
|
||||
done = 3
|
||||
dp_ipc_signal_suffix = f"{ipc_signal_suffix}_dp{config.parallel_config.local_data_parallel_id}"
|
||||
# rearrange_experts_status Record the expert's rearrangement status
|
||||
rearrange_experts_array = np.zeros([1], dtype=np.int32)
|
||||
_ = IPCSignal(
|
||||
name="rearrange_experts_status",
|
||||
array=rearrange_experts_array,
|
||||
dtype=np.int32,
|
||||
suffix=dp_ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
# Record all DP rank IPs when receiving expert rearrangement requests
|
||||
rearrange_experts_ips_size_array = np.zeros([1], dtype=np.int32)
|
||||
_ = IPCSignal(
|
||||
name="rearrange_experts_ips_size",
|
||||
array=rearrange_experts_ips_size_array,
|
||||
dtype=np.int32,
|
||||
suffix=dp_ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
_ = IPCSignal(
|
||||
name="rearrange_experts_ips_list",
|
||||
shm_size=config.eplb_config.redundant_expert_ip_shm_size,
|
||||
suffix=dp_ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
# Receive signals for updating weights
|
||||
signal_update_weight_from_tensor = np.zeros([1], dtype=np.int32)
|
||||
_ = IPCSignal(
|
||||
name="signal_update_weight_from_tensor",
|
||||
array=signal_update_weight_from_tensor,
|
||||
dtype=np.int32,
|
||||
suffix=dp_ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
for rank_id in range(config.parallel_config.tensor_parallel_size):
|
||||
tp_ipc_signal_suffix = f"{dp_ipc_signal_suffix}_tp{rank_id}"
|
||||
# Record expert workload
|
||||
experts_token_stats = np.zeros(
|
||||
(config.model_config.num_hidden_layers, config.model_config.moe_num_experts),
|
||||
dtype=np.int32,
|
||||
)
|
||||
_ = IPCSignal(
|
||||
name="all_experts_token_stats",
|
||||
array=experts_token_stats,
|
||||
dtype=np.int32,
|
||||
suffix=tp_ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
_ = IPCSignal(
|
||||
name="local_experts_token_stats",
|
||||
array=experts_token_stats,
|
||||
dtype=np.int32,
|
||||
suffix=tp_ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
# Receive signals for loading weights
|
||||
signal_update_weight_from_disk = np.zeros([1], dtype=np.int32)
|
||||
_ = IPCSignal(
|
||||
name="signal_update_weight_from_disk",
|
||||
array=signal_update_weight_from_disk,
|
||||
dtype=np.int32,
|
||||
suffix=tp_ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
# Receive signals for clearing expert loads
|
||||
clear_experts_token_stats = np.zeros([1], dtype=np.int32)
|
||||
_ = IPCSignal(
|
||||
name="signal_clear_experts_token_stats",
|
||||
array=clear_experts_token_stats,
|
||||
dtype=np.int32,
|
||||
suffix=tp_ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
result_update_weight_from_disk = np.zeros([1], dtype=np.int32)
|
||||
_ = IPCSignal(
|
||||
name="result_update_weight_from_disk",
|
||||
array=result_update_weight_from_disk,
|
||||
dtype=np.int32,
|
||||
suffix=tp_ipc_signal_suffix,
|
||||
create=True,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user