[Fix] Fix eplb bug and support fp8 load weight (#5178)

* fix eplb part2 * fix eplb part2 * fix eplb part2
2026-04-23 00:17:25 +08:00 · 2025-11-24 15:31:37 +08:00
parent f5c1066245
commit 95f3c8c641
5 changed files with 27 additions and 8 deletions
@@ -21,7 +21,7 @@ class RedundantExpertManager:
    RedundantExpertManager
    """

-    def __init__(self, rank=0, ep_size=32, fd_config=None):
+    def __init__(self, rank=0, ep_size=64, fd_config=None):
        self.logger = get_logger("eplb_expert_manager", "eplb_{0}.log".format(rank))

        self.rank = rank
@@ -101,7 +101,7 @@ class RedundantExpertManager:
        self.http_timeout = 1
        # Reset the rearrangement state: 'done' -> 'free'
        self.rearrange_end_ts = 0
-        self.rearrange_reset_interval = 300
+        self.rearrange_reset_interval = 30

        self.tensor_infos = None

@@ -250,8 +250,8 @@ class RedundantExpertManager:
        eplb_strategy = self.eplb_config.redundant_expert_eplb_strategy
        if is_init:
            num_groups = 1
-            num_nodes = 2
-            num_gpus = 2 * 8
+            num_nodes = 8
+            num_gpus = 8 * 8
            eplb_strategy = ""
        # eplb
        rank_expert_list, logical_to_physical_map, expert_count = rebalance_experts(
@@ -420,7 +420,9 @@ class RedundantExpertManager:
        if not exist_fail and all_success:
            # prefill needs to wait for the schedule cordon
            if (
-                self.fd_config.splitwise_role == "decode"
+                self.fd_config.scheduler_config.splitwise_role == "mixed"
+                or self.fd_config.scheduler_config.splitwise_role == "decode"
+                or self.fd_config.scheduler_config.splitwise_role == "prefill"
                or not self.eplb_config.redundant_expert_enable_schedule_cordon
            ):
                self.logger.info("redundant_expert: allreduce_load_weight_result success, notify infer.py")