[Feature] Add explicit hidden_size parameter support to FusedMoE (#7361)

FusedMoE now takes hidden_size as an explicit constructor argument instead of reading it from fd_config.model_config internally, and every in-tree call site is updated to pass it.
@@ -153,6 +153,7 @@ class FusedMoE(nn.Layer):
     def __init__(
         self,
         fd_config,
+        hidden_size: int = -1,
         reduce_results: bool = True,
         renormalize: bool = False,
         moe_intermediate_size: int = -1,
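The new parameter defaults to -1, a sentinel rather than a usable width: the next hunk stores the argument directly, so a caller that omits it would end up with an invalid size, which is why every call site below is updated in the same commit. A hypothetical guard, not part of this commit, illustrates the contract:

    # Hypothetical validation sketch -- NOT in this commit. It only
    # illustrates why each call site below must pass a real hidden_size:
    # the -1 default is a sentinel, not a usable width.
    if hidden_size <= 0:
        raise ValueError(
            f"FusedMoE requires an explicit hidden_size, got {hidden_size}; "
            "pass fd_config.model_config.hidden_size."
        )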
@@ -204,7 +205,7 @@ class FusedMoE(nn.Layer):
             self.tp_size == 1 and self.ep_size > 1
         ), "MoE only support parallelism on TP or EP dimension."

-        self.hidden_size = fd_config.model_config.hidden_size
+        self.hidden_size = hidden_size
         self.num_experts = num_experts

         self.num_local_experts = self.num_experts // self.ep_size
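Taken together, the two hunks above change the construction contract. A minimal usage sketch, using only names that appear in this diff (the surrounding wiring is assumed):

    # Minimal usage sketch: hidden_size is now passed explicitly; FusedMoE
    # no longer reads it from fd_config.model_config on its own. fd_config
    # is assumed to be an already-built FastDeploy config object.
    experts = FusedMoE(
        fd_config=fd_config,
        hidden_size=fd_config.model_config.hidden_size,
        moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
        num_experts=fd_config.model_config.moe_num_experts,
        top_k=fd_config.model_config.moe_k,
    )

The remaining hunks apply exactly this pattern to each model's MoE block and to the FuseMoEWrapper helpers.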
@@ -167,6 +167,7 @@ class DeepSeekV3MoE(nn.Layer):

         self.experts = FusedMoE(
             fd_config=fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             reduce_results=False,
             renormalize=self.norm_topk_prob,
             moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
@@ -210,6 +210,7 @@ class Ernie4_5_MoE(nn.Layer):

         self.experts = FusedMoE(
             fd_config=fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
             num_experts=fd_config.model_config.moe_num_experts,
             top_k=fd_config.model_config.moe_k,
@@ -148,6 +148,7 @@ class Ernie4_5_VLMoeBlock(nn.Layer):
         )
         self.experts = FusedMoE(
             fd_config=fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             reduce_results=False,
             moe_intermediate_size=moe_intermediate_size,
             num_experts=num_experts,
@@ -169,6 +169,7 @@ class Glm4Moe(nn.Layer):

         self.experts = FusedMoE(
             fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             reduce_results=not self.merge_ffn_tp,
             renormalize=self.norm_topk_prob,
             moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
@@ -114,6 +114,7 @@ class GptOssMoe(nn.Layer):

         self.experts = FusedMoE(
             fd_config=fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             moe_intermediate_size=fd_config.model_config.intermediate_size,
             num_experts=num_local_experts,
             top_k=fd_config.model_config.num_experts_per_tok,
@@ -63,6 +63,7 @@ class Qwen3MoeBlock(nn.Layer):
         }
         self.experts = FusedMoE(
             fd_config,
+            hidden_size=fd_config.model_config.hidden_size,
             moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
             num_experts=fd_config.model_config.num_experts,
             top_k=fd_config.model_config.num_experts_per_tok,
@@ -509,6 +509,7 @@ class FuseMoEWrapper(paddle.nn.Layer):

         self.fused_moe = FusedMoE(
             fd_config=self.fd_config,
+            hidden_size=self.fd_config.model_config.hidden_size,
             moe_intermediate_size=self.fd_config.model_config.moe_intermediate_size,
             num_experts=self.fd_config.model_config.moe_num_experts,
             top_k=self.fd_config.model_config.moe_k,
@@ -516,6 +516,7 @@ class FuseMoEWrapper(paddle.nn.Layer):

         self.fused_moe = FusedMoE(
             fd_config=self.fd_config,
+            hidden_size=self.fd_config.model_config.hidden_size,
             moe_intermediate_size=self.fd_config.model_config.moe_intermediate_size,
             num_experts=self.fd_config.model_config.moe_num_experts,
             top_k=self.fd_config.model_config.moe_k,
@@ -110,6 +110,7 @@ class FuseMoEWrapper(paddle.nn.Layer):

         self.fused_moe = FusedMoE(
             fd_config=self.fd_config,
+            hidden_size=self.fd_config.model_config.hidden_size,
             moe_intermediate_size=self.fd_config.model_config.moe_intermediate_size,
             num_experts=self.fd_config.model_config.moe_num_experts,
             top_k=self.fd_config.model_config.moe_k,
@@ -96,6 +96,7 @@ class FuseMoEWrapper(paddle.nn.Layer):

         self.fused_moe = FusedMoE(
             fd_config=self.fd_config,
+            hidden_size=self.fd_config.model_config.hidden_size,
             moe_intermediate_size=self.fd_config.model_config.moe_intermediate_size,
             num_experts=self.fd_config.model_config.moe_num_experts,
             top_k=self.fd_config.model_config.moe_k,