mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-24 09:44:10 +08:00
[Feature] support compute shared experts before combine for better overlap (#6697)
* [Feature] support compute shared experts before combine for better overlap
* fix test
* fix xpu
* fix
This commit is contained in:
@@ -14,9 +14,10 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
-from typing import Optional
|
||||
+from typing import Callable, Optional
|
||||
|
||||
import paddle
|
||||
+from paddle import nn
|
||||
from paddleformers.utils.log import logger
|
||||
|
||||
import fastdeploy
|
||||
@@ -553,7 +554,8 @@ class ModelOptNvFp4FusedMoE(QuantMethodBase):
|
||||
layer,
|
||||
x,
|
||||
gate,
|
||||
-topk_ids_hookfunc=None,
|
||||
+topk_ids_hookfunc: Callable = None,
|
||||
+shared_experts: nn.Layer = None,
|
||||
):
|
||||
"""
|
||||
flashinfer nvfp4 fusedmoe for Model Optimizer
|
||||
|
||||
Reference in New Issue
Block a user