diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py index 3f45e9df61..b32ebb2ced 100644 --- a/fastdeploy/model_executor/models/glm4_moe.py +++ b/fastdeploy/model_executor/models/glm4_moe.py @@ -26,6 +26,7 @@ from paddleformers.transformers import PretrainedModel from paddleformers.utils.log import logger from fastdeploy.config import FDConfig +from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.graph_optimization.decorator import ( support_graph_optimization, ) @@ -160,8 +161,16 @@ class Glm4Moe(nn.Layer): default_initializer=paddle.nn.initializer.Constant(0), ) + # In pure-TP mode (tp>1, ep=1) both branches return partial sums, so we + # defer the all-reduce to after combining them — saving one collective. + # In all other modes (EP, EP+attn-TP, no parallelism) each branch handles + # its own reduction internally (reduce_results default=True), so we must + # NOT add an extra all-reduce here. + self.merge_ffn_tp = self.use_tp and not self.use_ep + self.experts = FusedMoE( fd_config, + reduce_results=not self.merge_ffn_tp, renormalize=self.norm_topk_prob, moe_intermediate_size=fd_config.model_config.moe_intermediate_size, num_experts=fd_config.model_config.n_routed_experts, @@ -182,14 +191,16 @@ class Glm4Moe(nn.Layer): intermediate_size=shared_experts_intermediate_size, layer_id=layer_id, prefix=f"{prefix}.shared_experts", + reduce_results=not self.merge_ffn_tp, ) def forward(self, x, forward_meta: ForwardMeta = None): out = self.experts(x, self.gate, forward_meta) if self.n_shared_experts > 0: - shared_experts_out = self.shared_experts(x) - out = out + shared_experts_out - + out = out + self.shared_experts(x) + if self.merge_ffn_tp: + # Both branches produced partial sums; combine first, then single all-reduce.
+ out = tensor_model_parallel_all_reduce(out, self.tp_group) return out diff --git a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py index 4024b57613..b3bd6d9b45 100644 --- a/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py +++ b/tests/e2e/4cards_cases/test_GLM_45_AIR_mtp_tp4.py @@ -185,7 +185,7 @@ def test_lm_head_fp32(api_url, headers, consistent_payload): # 校验返回内容与概率信息 assert ( resp_json["choices"][0]["message"]["content"] - == "\n这个问题是关于牛顿的三大运动定律的。牛顿的三大运动定律是经典" + == "\n我需要回答牛顿的三大运动定律是什么。牛顿的三大运动定律是经典" ), f"The response content is not as expected {resp_json['choices'][0]['message']['content']}." diff --git a/tests/e2e/utils/rollout_routing_replay_test_utils.py b/tests/e2e/utils/rollout_routing_replay_test_utils.py index 8a01b50240..4186a71649 100644 --- a/tests/e2e/utils/rollout_routing_replay_test_utils.py +++ b/tests/e2e/utils/rollout_routing_replay_test_utils.py @@ -157,10 +157,10 @@ def check_routing_replay_chat_completion(openai_client, moe_layer_num: int, mode model_path = os.getenv("MODEL_PATH") if model_path: baseline_path = os.path.join( - model_path, f"R3_BaseLine_dev_uint8_0312/routing_replay_output_baseline_{model_name}" + model_path, f"R3_BaseLine_dev_uint8_0402/routing_replay_output_baseline_{model_name}" ) else: - baseline_path = f"./R3_BaseLine_dev_uint8_0312/routing_replay_output_baseline_{model_name}" + baseline_path = f"./R3_BaseLine_dev_uint8_0402/routing_replay_output_baseline_{model_name}" stream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_stream") nonstream_baseline_path = os.path.join(baseline_path, "r3_chat_completion_nonstream")