Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2026-04-23 00:17:25 +08:00)
[Feature] support compute shared experts before combine for better overlap (#6697)
* [Feature] support compute shared experts before combine for better overlap
* fix test
* fix xpu
* fix
This commit is contained in:
@@ -450,7 +450,12 @@ class MXFP4MoeMethod(MoEMethodBase):
|
||||
)
|
||||
|
||||
def apply(
|
||||
self, layer: nn.Layer, x: paddle.Tensor, router: nn.Layer, topk_ids_hookfunc: Callable = None
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
router: nn.Layer,
|
||||
topk_ids_hookfunc: Callable = None,
|
||||
shared_experts: nn.Layer = None,
|
||||
) -> paddle.Tensor:
|
||||
router_out = router(x.cast("float32"))
|
||||
|
||||
|
||||
@@ -14,9 +14,10 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import Optional
|
||||
from typing import Callable, Optional
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from paddleformers.utils.log import logger
|
||||
|
||||
import fastdeploy
|
||||
@@ -553,7 +554,8 @@ class ModelOptNvFp4FusedMoE(QuantMethodBase):
|
||||
layer,
|
||||
x,
|
||||
gate,
|
||||
topk_ids_hookfunc=None,
|
||||
topk_ids_hookfunc: Callable = None,
|
||||
shared_experts: nn.Layer = None,
|
||||
):
|
||||
"""
|
||||
flashinfer nvfp4 fusedmoe for Model Optimizer
|
||||
|
||||
Reference in New Issue
Block a user