[Feature] support computing shared experts before combine for better overlap (#6697)

* [Feature] support computing shared experts before combine for better overlap

* fix test

* fix xpu

* fix
Longzhi Wang
2026-03-17 15:18:51 +08:00
committed by GitHub
parent 12eb001d0c
commit daaf498213
15 changed files with 104 additions and 27 deletions
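
Editor's note on the design (a minimal sketch, not code from this commit): in MoE layers with shared experts, the routed experts' outputs must be gathered back through an all-to-all "combine" before the layer's result can be formed. Issuing the dense shared-expert computation before waiting on that combine lets the shared-expert GEMM overlap the communication. The helpers run_routed_experts and combine_async below are hypothetical stand-ins for the real dispatch/combine kernels:

    import paddle
    from paddle import nn

    # Hypothetical helpers standing in for the real dispatch/expert/combine kernels.
    def run_routed_experts(x):
        # placeholder for: dispatch tokens, run per-expert FFNs
        return x

    def combine_async(expert_out):
        # placeholder for a non-blocking combine all-to-all; returns a wait handle
        return lambda: expert_out

    def moe_forward(x: paddle.Tensor, shared_experts: nn.Layer = None) -> paddle.Tensor:
        expert_out = run_routed_experts(x)
        handle = combine_async(expert_out)      # communication starts here
        # The change: run the dense shared-expert GEMM while the combine
        # all-to-all is still in flight, instead of after it finishes.
        shared_out = shared_experts(x) if shared_experts is not None else None
        routed_out = handle()                   # wait for the combine result
        return routed_out if shared_out is None else routed_out + shared_out
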
@@ -450,7 +450,12 @@ class MXFP4MoeMethod(MoEMethodBase):
         )
 
     def apply(
-        self, layer: nn.Layer, x: paddle.Tensor, router: nn.Layer, topk_ids_hookfunc: Callable = None
+        self,
+        layer: nn.Layer,
+        x: paddle.Tensor,
+        router: nn.Layer,
+        topk_ids_hookfunc: Callable = None,
+        shared_experts: nn.Layer = None,
     ) -> paddle.Tensor:
         router_out = router(x.cast("float32"))
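
A hedged call-site sketch for the extended signature above (mxfp4_method and layer.shared_experts are assumed names, not taken from this diff): the MoE layer's shared-experts sub-layer is threaded into the quant method so the overlap can happen inside apply:

    out = mxfp4_method.apply(
        layer,
        x,
        router,
        topk_ids_hookfunc=None,
        shared_experts=getattr(layer, "shared_experts", None),  # assumed attribute name
    )
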
@@ -14,9 +14,10 @@
 # limitations under the License.
 """
-from typing import Optional
+from typing import Callable, Optional
 
 import paddle
+from paddle import nn
 from paddleformers.utils.log import logger
 
 import fastdeploy
@@ -553,7 +554,8 @@ class ModelOptNvFp4FusedMoE(QuantMethodBase):
         layer,
         x,
         gate,
-        topk_ids_hookfunc=None,
+        topk_ids_hookfunc: Callable = None,
+        shared_experts: nn.Layer = None,
     ):
         """
         flashinfer nvfp4 fusedmoe for Model Optimizer
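
Because shared_experts defaults to None in both signatures, existing call sites keep the old behavior; only callers that pass the sub-layer opt into the overlap. A sketch with assumed names:

    out = moe_method.apply(layer, x, gate)  # old behavior, shared experts run elsewhere
    out = moe_method.apply(layer, x, gate, shared_experts=layer.shared_experts)  # opt-in overlap
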