[Feature] Add explicit hidden_size parameter support to FusedMoE (#7361)

[Feature] Add explicit hidden_size parameter support to FusedMoE
周周周
2026-04-13 20:24:58 +08:00
committed by GitHub
parent 1e08ee74e5
commit 73bd4ab318
11 changed files with 12 additions and 1 deletion
+2 -1
@@ -153,6 +153,7 @@ class FusedMoE(nn.Layer):
     def __init__(
         self,
         fd_config,
+        hidden_size: int = -1,
         reduce_results: bool = True,
         renormalize: bool = False,
         moe_intermediate_size: int = -1,
@@ -204,7 +205,7 @@ class FusedMoE(nn.Layer):
             self.tp_size == 1 and self.ep_size > 1
         ), "MoE only support parallelism on TP or EP dimension."
-        self.hidden_size = fd_config.model_config.hidden_size
+        self.hidden_size = hidden_size
         self.num_experts = num_experts
         self.num_local_experts = self.num_experts // self.ep_size
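
Taken together, the two hunks above change the constructor contract: FusedMoE no longer reads the hidden size from fd_config.model_config on its own, so every call site now passes it explicitly (the parameter defaults to -1 when omitted). Below is a minimal sketch of the new calling convention, assuming FusedMoE and a populated fd_config are already in scope; the keyword arguments mirror the Ernie4_5_MoE call site further down and are illustrative rather than exhaustive.

# Sketch: constructing a MoE layer with the now-explicit hidden_size.
# FusedMoE and fd_config are assumed to be imported/built elsewhere.
experts = FusedMoE(
    fd_config=fd_config,
    # Previously resolved inside __init__ as fd_config.model_config.hidden_size;
    # after this change the caller supplies it directly.
    hidden_size=fd_config.model_config.hidden_size,
    moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
    num_experts=fd_config.model_config.moe_num_experts,
    top_k=fd_config.model_config.moe_k,
)

Call sites that previously relied on the implicit lookup keep the same value; they simply pass it through explicitly, as the remaining hunks show.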
@@ -167,6 +167,7 @@ class DeepSeekV3MoE(nn.Layer):
         self.experts = FusedMoE(
             fd_config=fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             reduce_results=False,
             renormalize=self.norm_topk_prob,
             moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
@@ -210,6 +210,7 @@ class Ernie4_5_MoE(nn.Layer):
         self.experts = FusedMoE(
             fd_config=fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
             num_experts=fd_config.model_config.moe_num_experts,
             top_k=fd_config.model_config.moe_k,
@@ -148,6 +148,7 @@ class Ernie4_5_VLMoeBlock(nn.Layer):
         )
         self.experts = FusedMoE(
             fd_config=fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             reduce_results=False,
             moe_intermediate_size=moe_intermediate_size,
             num_experts=num_experts,
@@ -169,6 +169,7 @@ class Glm4Moe(nn.Layer):
         self.experts = FusedMoE(
             fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             reduce_results=not self.merge_ffn_tp,
             renormalize=self.norm_topk_prob,
             moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
@@ -114,6 +114,7 @@ class GptOssMoe(nn.Layer):
         self.experts = FusedMoE(
             fd_config=fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             moe_intermediate_size=fd_config.model_config.intermediate_size,
             num_experts=num_local_experts,
             top_k=fd_config.model_config.num_experts_per_tok,
@@ -63,6 +63,7 @@ class Qwen3MoeBlock(nn.Layer):
         }
         self.experts = FusedMoE(
             fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
             num_experts=fd_config.model_config.num_experts,
             top_k=fd_config.model_config.num_experts_per_tok,
+1
@@ -509,6 +509,7 @@ class FuseMoEWrapper(paddle.nn.Layer):
         self.fused_moe = FusedMoE(
             fd_config=self.fd_config,
+            hidden_size=self.fd_config.model_config.hidden_size,
             moe_intermediate_size=self.fd_config.model_config.moe_intermediate_size,
             num_experts=self.fd_config.model_config.moe_num_experts,
             top_k=self.fd_config.model_config.moe_k,
+1
@@ -516,6 +516,7 @@ class FuseMoEWrapper(paddle.nn.Layer):
         self.fused_moe = FusedMoE(
             fd_config=self.fd_config,
+            hidden_size=self.fd_config.model_config.hidden_size,
             moe_intermediate_size=self.fd_config.model_config.moe_intermediate_size,
             num_experts=self.fd_config.model_config.moe_num_experts,
             top_k=self.fd_config.model_config.moe_k,
+1
@@ -110,6 +110,7 @@ class FuseMoEWrapper(paddle.nn.Layer):
         self.fused_moe = FusedMoE(
             fd_config=self.fd_config,
+            hidden_size=self.fd_config.model_config.hidden_size,
             moe_intermediate_size=self.fd_config.model_config.moe_intermediate_size,
             num_experts=self.fd_config.model_config.moe_num_experts,
             top_k=self.fd_config.model_config.moe_k,
+1
@@ -96,6 +96,7 @@ class FuseMoEWrapper(paddle.nn.Layer):
         self.fused_moe = FusedMoE(
             fd_config=self.fd_config,
+            hidden_size=self.fd_config.model_config.hidden_size,
             moe_intermediate_size=self.fd_config.model_config.moe_intermediate_size,
             num_experts=self.fd_config.model_config.moe_num_experts,
             top_k=self.fd_config.model_config.moe_k,