Sync v2.0 version of code to github repo

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions
@@ -11,3 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fused_moe_cutlass_backend import (CutlassW4A8MoEMethod,
CutlassWeightOnlyMoEMethod)
from .fused_moe_triton_backend import TritonWeightOnlyMoEMethod
from .moe import FusedMoE
# Public API of this package. Entries must be *strings*: with bare class
# objects here, `from <package> import *` raises TypeError when the import
# machinery iterates __all__.
__all__ = [
    "CutlassWeightOnlyMoEMethod",
    "CutlassW4A8MoEMethod",
    "FusedMoE",
    "TritonWeightOnlyMoEMethod",
]
@@ -1,222 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from paddle.distributed import fleet
from paddle.framework import in_dynamic_or_pir_mode
from paddle.nn.quant import weight_quantize
from fastdeploy.model_executor.ops.gpu import (moe_expert_dispatch,
moe_expert_ffn,
moe_expert_reduce)
from .fused_moe_method_base import FusedMoEMethodBase
class CutlassFusedMoeMethod(FusedMoEMethodBase):
    """
    Use Cutlass Group Gemm to compute Fused MoE.

    This method is the oldest way to compute MoE in Paddle. It supports
    plain (non-quantized) experts as well as weight-only int4/int8 and
    w4a8 quantization.
    """

    def create_weights(
            self,
            layer: nn.Layer,
            moe_compute_params,
            ffn1_tensor,
            ffn2_tensor,
            ffn1_bias=None,
            ffn2_bias=None,
            # The scales below are only used in w4a8 mode.
            moe_ffn1_weight_scale=None,
            moe_ffn2_weight_scale=None,
            moe_ffn1_in_scale=None,
            moe_ffn2_in_scale=None):
        """
        Quantize (if needed) and register per-expert FFN weights on ``layer``.

        Args:
            layer: Layer that receives the created parameters.
            moe_compute_params: Provides num_local_experts, hidden_size,
                moe_intermediate_size and moe_quant_type.
            ffn1_tensor / ffn2_tensor: One weight tensor per local expert.
            ffn1_bias / ffn2_bias: Unused here; kept for interface parity.
            moe_ffn*_weight_scale / moe_ffn*_in_scale: Per-expert scale
                tensors, required only when moe_quant_type == "w4a8".
        """
        num_local_experts = moe_compute_params.num_local_experts
        moe_quant_type = moe_compute_params.moe_quant_type

        # Exactly one weight per local expert, with the expected shapes.
        assert len(ffn1_tensor) == num_local_experts
        assert len(ffn2_tensor) == num_local_experts
        assert ffn1_tensor[0].shape == [
            moe_compute_params.hidden_size,
            moe_compute_params.moe_intermediate_size * 2
        ]
        assert ffn2_tensor[0].shape == [
            moe_compute_params.moe_intermediate_size,
            moe_compute_params.hidden_size
        ]

        added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        added_scale_attrs = ["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]

        if moe_quant_type == "w4a8":
            # Activation scales are stored inverted (1 / scale).
            moe_ffn1_in_scale = paddle.concat(moe_ffn1_in_scale)
            moe_ffn2_in_scale = paddle.concat(moe_ffn2_in_scale)
            moe_ffn1_in_scale = 1 / moe_ffn1_in_scale
            moe_ffn2_in_scale = 1 / moe_ffn2_in_scale
            moe_ffn1_weight_scale = paddle.stack(moe_ffn1_weight_scale, axis=0)
            moe_ffn2_weight_scale = paddle.stack(moe_ffn2_weight_scale, axis=0)
            # NOTE(review): 127 * 112 looks like the int8 max value combined
            # with a kernel-specific constant -- confirm against the w4a8
            # cutlass kernel before changing.
            moe_ffn1_weight_scale = moe_ffn1_weight_scale / (127 * 112)
            moe_ffn2_weight_scale = moe_ffn2_weight_scale / (127 * 112)
            moe_ffn1_weight_scale = moe_ffn1_weight_scale / moe_ffn1_in_scale[:,
                                                                              None]
            moe_ffn2_weight_scale = moe_ffn2_weight_scale / moe_ffn2_in_scale[:,
                                                                              None]
            moe_ffn1_weight_scale = moe_ffn1_weight_scale.cast(
                paddle.get_default_dtype())
            moe_ffn2_weight_scale = moe_ffn2_weight_scale.cast(
                paddle.get_default_dtype())

        if moe_quant_type in ["weight_only_int4", "weight_only_int8", "w4a8"]:
            for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
                weight_name = added_weight_attrs[idx]
                scale_name = added_scale_attrs[idx]
                weight_list = []
                weight_scale_list = []
                for i in range(num_local_experts):
                    quant_weight, scale = weight_quantize(weight_tensor[i],
                                                          algo=moe_quant_type,
                                                          arch=80)
                    weight_list.append(quant_weight)
                    if moe_quant_type != "w4a8":
                        # scale holds no memory in w4a8, don't touch it!
                        weight_scale_list.append(scale)
                quanted_weight = paddle.stack(weight_list, axis=0)
                setattr(
                    layer, weight_name,
                    layer.create_parameter(
                        shape=quanted_weight.shape,
                        dtype=quanted_weight.dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ))
                getattr(layer, weight_name).set_value(quanted_weight)

                # This scale is only used for weight-only int8/int4.
                if moe_quant_type != "w4a8":
                    quanted_weight_scale = paddle.stack(weight_scale_list,
                                                        axis=0)
                    setattr(
                        layer, scale_name,
                        layer.create_parameter(
                            shape=quanted_weight_scale.shape,
                            dtype=quanted_weight_scale.dtype,
                        ))
                    getattr(layer, scale_name).set_value(quanted_weight_scale)

        if moe_quant_type == "w4a8":
            # w4a8 additionally registers the precomputed scale parameters.
            assert moe_ffn1_weight_scale is not None
            assert moe_ffn2_weight_scale is not None
            assert moe_ffn1_in_scale is not None
            assert moe_ffn2_in_scale is not None
            added_w4a8_attrs = [
                "moe_ffn1_weight_scale", "moe_ffn2_weight_scale",
                "moe_ffn1_in_scale", "moe_ffn2_in_scale"
            ]
            for idx, weight_tensor in enumerate([
                    moe_ffn1_weight_scale, moe_ffn2_weight_scale,
                    moe_ffn1_in_scale, moe_ffn2_in_scale
            ]):
                name = added_w4a8_attrs[idx]
                setattr(
                    layer, name,
                    layer.create_parameter(
                        shape=weight_tensor.shape,
                        dtype=weight_tensor.dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ))
                getattr(layer, name).set_value(weight_tensor)

    def apply(
        self,
        layer: nn.Layer,
        moe_compute_params,
        x: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Compute the fused MoE forward pass with cutlass group GEMM:
        gate -> dispatch -> expert FFN -> weighted reduce.
        """
        # Routing logits are computed in fp32 for numerical stability.
        gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
        (
            permute_input,
            token_nums_per_expert,
            permute_indices_per_token,
            topk_weights,
            topk_idx,
            expert_idx_per_token,
        ) = moe_expert_dispatch(
            x,
            gate_out,
            layer.gate_correction_bias,
            (layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
             else None),  # if set, permute_input will be int8_t
            moe_compute_params.top_k,
            False,
            topk_only_mode=False,
        )
        if moe_compute_params.moe_quant_type != "w4a8":
            # Only w4a8 needs expert_idx_per_token; other modes pass None.
            expert_idx_per_token = None
        else:
            expert_idx_per_token = expert_idx_per_token.cast("int64")
        ffn_out = moe_expert_ffn(
            permute_input,
            token_nums_per_expert,
            layer.moe_ffn1_weight,
            layer.moe_ffn2_weight,
            None,
            (layer.moe_ffn1_weight_scale
             if hasattr(layer, "moe_ffn1_weight_scale") else None),
            (layer.moe_ffn2_weight_scale
             if hasattr(layer, "moe_ffn2_weight_scale") else None),
            (layer.moe_ffn2_in_scale
             if hasattr(layer, "moe_ffn2_in_scale") else None),
            expert_idx_per_token,
            moe_compute_params.moe_quant_type,
            False,  # used_in_ep_low_latency
        )
        # NOTE: a dead `if False:` tensor-parallel all-reduce branch was
        # removed here; its else-path read `mp_group` before assignment and
        # the branch could never execute.
        # moe_expert_reduce normalizes the top-k weights and applies the
        # routed_scaling_factor.
        fused_moe_out = moe_expert_reduce(
            ffn_out,
            topk_weights,
            permute_indices_per_token,
            topk_idx,
            None,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )
        return fused_moe_out
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,135 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import abstractmethod
import paddle
from paddle import nn
from fastdeploy.config import MoEPhase
from ..quantization.quant_base import QuantMethodBase
class MoEMethodBase(QuantMethodBase):
    """
    Common base for fused-MoE compute backends.

    Concrete subclasses implement weight creation plus the tensor-parallel
    (TP) and expert-parallel (EP) forward paths; :meth:`apply` dispatches to
    the right one based on the layer's parallel configuration.
    """

    def __init__(self, quant_config):
        super().__init__()
        # Always keep a reference to the quantization config. Previously the
        # attribute was only set when quant_config was not None, so any later
        # access of `self.quant_config` with a None config raised
        # AttributeError instead of seeing None.
        self.quant_config = quant_config
        if quant_config is None:
            # No quantization: plain 16-bit weights and activations.
            self.moe_quant_type = "w16a16"
        self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        self.added_scale_attrs = [
            "moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
        ]
        # Number of logical values packed per stored element
        # (e.g. 2 for int4 weights stored two-per-byte).
        self.pack_num = 1

    def init_ep(self, layer: nn.Layer) -> None:
        """
        Initialize the expert-parallel (EP) runner for this layer.

        Creates a decoder or prefill runner depending on the layer's MoE
        phase; a no-op when EP is not enabled (ep_size <= 1).
        """
        if layer.ep_size > 1:
            if layer.fd_config.parallel_config.moe_phase == MoEPhase.DECODER:
                from .ep import EPDecoderRunner
                self.ep_decoder_runner = EPDecoderRunner(
                    layer.top_k, layer.hidden_size, layer.num_experts,
                    layer.moe_config.num_max_dispatch_tokens_per_rank,
                    layer.ep_size, layer.ep_rank)
            else:
                from .ep import EPPrefillRunner
                self.ep_prefill_runner = EPPrefillRunner(
                    layer.top_k, layer.hidden_size, layer.num_experts,
                    layer.ep_size, layer.ep_rank)

    def process_loaded_weights(self, layer, weights) -> None:
        """
        Optional hook to post-process weights after loading. Default: no-op.
        """
        pass

    def check(self, layer: nn.Layer, ffn1_weights, ffn2_weights):
        """
        Validate that the first expert's weight shapes match the layer's
        dimensions. ``pack_num`` accounts for sub-byte weight packing.
        """
        assert ffn1_weights[0].shape == [
            layer.hidden_size // self.pack_num, layer.moe_intermediate_size * 2
        ]
        assert ffn2_weights[0].shape == [
            layer.moe_intermediate_size // self.pack_num, layer.hidden_size
        ]

    @abstractmethod
    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Create and register weight parameters on ``layer`` from ``state_dict``.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP prefill method.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP decoder method.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the tensor-parallel fused MoE method.
        """
        raise NotImplementedError

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Dispatch to the EP prefill/decode or TP implementation based on the
        layer's parallel configuration.
        """
        if layer.ep_size > 1:
            if layer.fd_config.parallel_config.moe_phase == MoEPhase.PREFILL:
                return self.apply_ep_prefill(layer, x, gate_out)
            else:
                return self.apply_ep_decode(layer, x, gate_out)
        else:
            return self.apply_tp(layer, x, gate_out)
@@ -0,0 +1,431 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from paddle.nn.quant import weight_quantize
from paddleformers.utils.log import logger
import fastdeploy
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from ..utils import get_tensor, create_and_set_parameter
from .fused_moe_backend_base import MoEMethodBase
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
class CutlassMoEMethod(MoEMethodBase):
    """
    Use Cutlass Group Gemm to compute Fused MoE.
    This method is the oldest way to compute MoE in Paddle.
    """

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Register non-quantized (bf16) expert weights on ``layer``:
        stack the per-expert tensors and store them as parameters.
        """
        # bf16
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        stacked_ffn1_weights = paddle.stack(ffn1_weights, axis=0)
        stacked_ffn2_weights = paddle.stack(ffn2_weights, axis=0)
        for idx, weight_tensor in enumerate(
                [stacked_ffn1_weights, stacked_ffn2_weights]):
            weight_name = self.added_weight_attrs[idx]
            setattr(
                layer, weight_name,
                layer.create_parameter(
                    shape=weight_tensor.shape,
                    dtype=weight_tensor.dtype,
                    default_initializer=paddle.nn.initializer.Constant(0),
                ))
            getattr(layer, weight_name).set_value(weight_tensor)

    def compute_ffn(
        self,
        layer: nn.Layer,
        permute_input: paddle.Tensor,
        token_nums_per_expert: paddle.Tensor,
        expert_idx_per_token: paddle.Tensor,
        used_in_ep_low_latency: bool = False,
    ):
        """
        Run the grouped expert FFN (cutlass group GEMM) on already-dispatched
        tokens. Scale tensors are passed only if present on ``layer``
        (i.e. only for quantized modes).
        """
        return fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
            permute_input,
            token_nums_per_expert,
            layer.moe_ffn1_weight,
            layer.moe_ffn2_weight,
            None,
            (layer.moe_ffn1_weight_scale
             if hasattr(layer, "moe_ffn1_weight_scale") else None),
            (layer.moe_ffn2_weight_scale
             if hasattr(layer, "moe_ffn2_weight_scale") else None),
            (layer.moe_ffn2_in_scale
             if hasattr(layer, "moe_ffn2_in_scale") else None),
            expert_idx_per_token,
            self.moe_quant_type,
            used_in_ep_low_latency,
        )

    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP prefill method:
        select -> all-to-all dispatch -> local expert FFN -> combine.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_prefill_runner.moe_select(
            layer, gate_out)
        # 2. EP Dispatch
        (
            recv_x,
            recv_topk_idx,
            recv_topk_weights,
            recv_num_tokens_per_expert_list,
            handle,
            _,
        ) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights)
        token_all_num = sum(recv_num_tokens_per_expert_list)

        # 3. Compute ffn (skipped when this rank received no tokens)
        if token_all_num > 0:
            logger.info(f"token_all_num {token_all_num}")
            (
                permute_input,
                permute_indices_per_token,
                recv_num_tokens_per_expert_list_cumsum,
                dst_weights,
                dst_indices,
                cumsum_idx_gpu,
                expert_idx_per_token,
            ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch(
                recv_x,
                recv_topk_idx,
                recv_topk_weights,
                # NOTE(review): elsewhere (apply_tp) the in-scale lives on
                # `layer`, not `self` -- this hasattr(self, ...) check may
                # never find it; confirm where the attribute is created.
                (self.moe_ffn1_in_scale
                 if hasattr(self, "moe_ffn1_in_scale") else None),
                recv_num_tokens_per_expert_list,
                token_all_num,
                self.moe_quant_type,
            )

            if self.moe_quant_type != "w4a8":
                # only w4a8 need expert_idx_per_token
                # Other need not this tensor, so we make it None.
                expert_idx_per_token = None
            else:
                expert_idx_per_token = expert_idx_per_token.cast("int64")

            ffn_out = self.compute_ffn(layer, permute_input,
                                       recv_num_tokens_per_expert_list_cumsum,
                                       expert_idx_per_token)

            # Permute expert outputs back to per-rank token order.
            tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
                ffn_out,
                dst_weights,
                permute_indices_per_token,
                dst_indices,
                None,  # moe_ffn2_bias,
                False,  # norm_topk_prob
                1.0,
            )[0]
        else:
            tmp_ffn_out = recv_x

        # 4. EP combine
        return self.ep_prefill_runner.combine(tmp_ffn_out, handle,
                                              recv_topk_weights)

    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP decoder (low-latency) method. Unsupported quantization
        modes raise NotImplementedError.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_decoder_runner.moe_select(
            layer, gate_out)
        # 2. EP Dispatch
        permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
            x, topk_idx, topk_weights)

        # 3. Compute ffn
        if self.moe_quant_type == "w4a8":
            # Dispatch output here is dense per-expert; every slot in expert
            # e gets expert index e.
            num_local_experts, max_num, _ = permute_input.shape
            expert_idx_per_token = paddle.arange(
                num_local_experts)[:, None].tile([1, max_num])
        elif self.moe_quant_type in ["weight_only_int8", "weight_only_int4"]:
            expert_idx_per_token = None
        else:
            raise NotImplementedError

        ffn_out = self.compute_ffn(layer, permute_input,
                                   token_nums_per_expert.cast("int64"),
                                   expert_idx_per_token, True)

        # 4. EP combine
        return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights,
                                              handle)

    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Tensor-parallel fused MoE:
        dispatch -> grouped FFN -> weighted reduce -> (optional) all-reduce.
        """
        (
            permute_input,
            token_nums_per_expert,
            permute_indices_per_token,
            topk_weights,
            topk_idx,
            expert_idx_per_token,
        ) = moe_expert_dispatch(
            x,
            gate_out,
            layer.gate_correction_bias,
            (layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
             else None),  # if set, permute_input will be int8_t
            layer.top_k,
            False,
            topk_only_mode=False,
        )

        if self.moe_quant_type != "w4a8":
            # only w4a8 need expert_idx_per_token
            # Other need not this tensor, so we make it None.
            expert_idx_per_token = None
        else:
            expert_idx_per_token = expert_idx_per_token.cast("int64")

        ffn_out = self.compute_ffn(layer, permute_input, token_nums_per_expert,
                                   expert_idx_per_token)

        # moe_expert_reduce normalizes the top-k weights and applies the
        # routed_scaling_factor.
        fused_moe_out = moe_expert_reduce(
            ffn_out,
            topk_weights,
            permute_indices_per_token,
            topk_idx,
            None,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )

        if layer.tp_size > 1:
            tensor_model_parallel_all_reduce(fused_moe_out)

        return fused_moe_out
class CutlassW4A8MoEMethod(CutlassMoEMethod):
    """
    w4a8 MoE Method: int4 weights with int8 activations, computed with
    the cutlass group GEMM backend.
    """

    def __init__(self, quant_config):
        super().__init__(quant_config)
        self.quant_config = quant_config
        self.moe_quant_type = "w4a8"
        # Two int4 values are packed per stored byte.
        self.pack_num = 2

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Quantize each expert's weights to int4 (cutlass layout, arch=80)
        and register them on ``layer``, then load/process the w4a8 scales.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        self.check(layer, ffn1_weights, ffn2_weights)
        for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
            weight_name = self.added_weight_attrs[idx]
            weight_list = []
            for i in range(layer.num_local_experts):
                # NOTE: the returned scale is intentionally discarded; w4a8
                # uses the scales loaded from the state dict instead.
                quant_weight, scale = weight_quantize(weight_tensor[i],
                                                      algo=self.moe_quant_type,
                                                      arch=80)
                weight_list.append(quant_weight)
            quanted_weight = paddle.stack(weight_list, axis=0)
            create_and_set_parameter(layer, weight_name, quanted_weight)

        self.create_w4a8_scale_weights(layer, layer.weight_key_map, state_dict)

    def create_w4a8_scale_weights(self, layer: nn.Layer, weight_key_map: dict,
                                  state_dict: dict):
        """
        Get w4a8 weights from state dict and process them.
        Args:
            layer (nn.Layer): The layer to add parameters to.
            weight_key_map (dict): The weight key map.
            state_dict (dict): The state dict.
        """

        def _extract_scale_tensor(state_dict, key_template, expert_idx):
            # Pop so the consumed entries are removed from the state dict.
            return get_tensor(state_dict.pop(key_template.format(expert_idx)))

        def _process_in_scale(name: str, in_scales: list[paddle.Tensor]):
            # Activation scales are stored inverted (1 / scale).
            processed_in_scale = 1 / paddle.concat(in_scales)
            create_and_set_parameter(layer, name, processed_in_scale)
            return processed_in_scale

        def _process_weight_scale(name: str,
                                  weight_scales: list[paddle.Tensor],
                                  processed_in_scale: paddle.Tensor):
            # NOTE(review): 127 * 112 looks like the int8 max value combined
            # with a kernel-specific constant -- confirm against the w4a8
            # cutlass kernel.
            processed_weight_scale = (paddle.stack(weight_scales, axis=0) /
                                      (127 * 112) /
                                      processed_in_scale[:, None]).cast(
                                          paddle.get_default_dtype())
            create_and_set_parameter(layer, name, processed_weight_scale)

        # 1. Init scale containers and maps
        moe_ffn1_weight_scales = []
        moe_ffn2_weight_scales = []
        moe_ffn1_in_scales = []
        moe_ffn2_in_scales = []

        scale_weight_map = {
            "moe_ffn1_weight_scale": moe_ffn1_weight_scales,
            "moe_ffn2_weight_scale": moe_ffn2_weight_scales,
            "moe_ffn1_in_scale": moe_ffn1_in_scales,
            "moe_ffn2_in_scale": moe_ffn2_in_scales,
        }
        scale_key_map = {
            "moe_ffn1_weight_scale":
            weight_key_map.get("ffn1_expert_weight_scale_key", None),
            "moe_ffn2_weight_scale":
            weight_key_map.get("ffn2_expert_weight_scale_key", None),
            "moe_ffn1_in_scale":
            weight_key_map.get("ffn1_expert_in_scale_key", None),
            "moe_ffn2_in_scale":
            weight_key_map.get("ffn2_expert_in_scale_key", None),
        }
        for name, value in scale_key_map.items():
            if value is None:
                raise ValueError(
                    f"scale {name} should not be none in w4a8 mode.")

        # 2. Extract scale tensor from state dict
        for local_expert_idx in range(layer.num_local_experts):
            # NOTE(review): this treats expert_id_offset as a rank index
            # (offset * num_local_experts + local). Other backends in this
            # file compute `expert_id_offset + i` -- one of the two is
            # likely wrong; confirm the semantics of expert_id_offset.
            expert_idx = local_expert_idx + layer.expert_id_offset * layer.num_local_experts
            for name, scale_key_template in scale_key_map.items():
                scale_tensor = _extract_scale_tensor(state_dict,
                                                     scale_key_template,
                                                     expert_idx)
                scale_weight_map[name].append(scale_tensor)

        # 3. Process scale tensor and set to layer
        in_scales = []
        for in_scale_name in ["moe_ffn1_in_scale", "moe_ffn2_in_scale"]:
            in_scales.append(
                _process_in_scale(in_scale_name,
                                  scale_weight_map[in_scale_name]))

        for i, weight_scale_name in enumerate(
            ["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]):
            _process_weight_scale(weight_scale_name,
                                  scale_weight_map[weight_scale_name],
                                  in_scales[i])
class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
    """
    Weight-only (int4/int8) quantized MoE computed with the cutlass
    group GEMM backend.
    """

    def __init__(self, quant_config):
        super().__init__(quant_config)
        self.quant_config = quant_config
        # The weight-only algorithm (e.g. weight_only_int8) comes from config.
        self.moe_quant_type = self.quant_config.algo
        self.pack_num = 1

    def process_prequanted_weights(self, layer: nn.Layer, state_dict):
        """
        Load expert weights and scales that were quantized offline and
        register them on ``layer``.
        """
        key_map = layer.weight_key_map
        ffn1_expert_weight_key = key_map.get("ffn1_expert_weight_key", None)
        ffn2_expert_weight_key = key_map.get("ffn2_expert_weight_key", None)
        ffn1_expert_weight_scale_key = key_map.get(
            "ffn1_expert_weight_scale_key", None)
        ffn2_expert_weight_scale_key = key_map.get(
            "ffn2_expert_weight_scale_key", None)

        ffn1_weights, ffn2_weights = layer.load_experts_weight(
            state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
        # Shape check intentionally skipped for prequantized layouts.

        ffn1_scales = []
        ffn2_scales = []
        for local_idx in range(layer.num_local_experts):
            global_idx = layer.expert_id_offset + local_idx
            ffn1_scales.append(
                get_tensor(
                    state_dict.pop(
                        ffn1_expert_weight_scale_key.format(global_idx))))
            ffn2_scales.append(
                get_tensor(
                    state_dict.pop(
                        ffn2_expert_weight_scale_key.format(global_idx))))

        name_tensor_map = {
            "moe_ffn1_weight": paddle.stack(ffn1_weights, axis=0),
            "moe_ffn2_weight": paddle.stack(ffn2_weights, axis=0),
            "moe_ffn1_weight_scale": paddle.stack(ffn1_scales, axis=0),
            "moe_ffn2_weight_scale": paddle.stack(ffn2_scales, axis=0)
        }
        for param_name, tensor in name_tensor_map.items():
            create_and_set_parameter(layer, param_name, tensor)

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Quantize bf16 expert weights online to the weight-only format and
        register the packed weights and their scales on ``layer``.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        self.check(layer, ffn1_weights, ffn2_weights)

        for expert_weights, weight_name, scale_name in zip(
            (ffn1_weights, ffn2_weights), self.added_weight_attrs,
                self.added_scale_attrs):
            packed = []
            scales = []
            for i in range(layer.num_local_experts):
                quant_weight, scale = weight_quantize(
                    expert_weights[i], algo=self.moe_quant_type)
                packed.append(quant_weight)
                scales.append(scale)

            create_and_set_parameter(layer, weight_name,
                                     paddle.stack(packed, axis=0))
            create_and_set_parameter(layer, scale_name,
                                     paddle.stack(scales, axis=0))
@@ -0,0 +1,380 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import numpy as np
import paddle
from paddle import nn
from paddleformers.utils.log import logger
import fastdeploy
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
from fastdeploy.model_executor.layers.utils import get_tensor
from ..utils import create_and_set_parameter
from .fused_moe_backend_base import MoEMethodBase
class DeepGemmFusedMoeMethod(MoEMethodBase):
    """
    DeepGemmFusedMoeMethod is a class that implements the MoEMethodBase
    interface for the DeepGemm (block-wise fp8) backend.
    """

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Quantize bf16 expert weights to block-wise fp8 and register the
        packed weights and their scales on ``layer``.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        self.check(layer, ffn1_weights, ffn2_weights)
        for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
            weight_name = self.added_weight_attrs[idx]
            scale_name = self.added_scale_attrs[idx]

            weight_list = []
            weight_scale_list = []
            for i in range(layer.num_local_experts):
                from fastdeploy.model_executor.layers.utils import \
                    per_block_cast_to_fp8
                # Block size for quantization comes from the quant config.
                quant_weight, scale = per_block_cast_to_fp8(
                    weight_tensor[i], self.quant_config.weight_block_size)

                weight_list.append(quant_weight)
                weight_scale_list.append(scale)
            quanted_weight = paddle.stack(weight_list, axis=0)
            # Transposed to the [experts, n, k] layout used by the
            # deep_gemm "nt" grouped GEMM kernels.
            quanted_weight = quanted_weight.transpose([0, 2, 1]).contiguous()
            create_and_set_parameter(layer, weight_name, quanted_weight)

            quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
            quanted_weight_scale = quanted_weight_scale.transpose(
                [0, 2, 1]).contiguous()
            create_and_set_parameter(layer, scale_name, quanted_weight_scale)

    def process_prequanted_weights(self, layer: nn.Layer, state_dict):
        """
        Load expert weights and scales that were fp8-quantized offline,
        reinterpret the weights as float8_e4m3fn, and register them.
        """
        ffn1_expert_weight_key = layer.weight_key_map.get(
            "ffn1_expert_weight_key", None)
        ffn2_expert_weight_key = layer.weight_key_map.get(
            "ffn2_expert_weight_key", None)
        ffn1_expert_weight_scale_key = layer.weight_key_map.get(
            "ffn1_expert_weight_scale_key", None)
        ffn2_expert_weight_scale_key = layer.weight_key_map.get(
            "ffn2_expert_weight_scale_key", None)

        ffn1_weights, ffn2_weights = layer.load_experts_weight(
            state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
        # self.check(layer, ffn1_weights, ffn2_weights)
        ffn1_weight_scale = []
        ffn2_weight_scale = []
        for i in range(layer.num_local_experts):
            expert_idx = layer.expert_id_offset + i
            ffn1_weight_scale.append(
                get_tensor(
                    state_dict.pop(
                        ffn1_expert_weight_scale_key.format(expert_idx))))
            ffn2_weight_scale.append(
                get_tensor(
                    state_dict.pop(
                        ffn2_expert_weight_scale_key.format(expert_idx))))

        # `view` reinterprets the raw bytes as fp8 without copying.
        ffn1_weight = paddle.stack(ffn1_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn")
        ffn2_weight = paddle.stack(ffn2_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn")
        ffn1_weight_scale = paddle.stack(ffn1_weight_scale, axis=0).transpose([0, 2, 1]).contiguous()
        ffn2_weight_scale = paddle.stack(ffn2_weight_scale, axis=0).transpose([0, 2, 1]).contiguous()

        name_tensor_map = {
            "moe_ffn1_weight": ffn1_weight,
            "moe_ffn2_weight": ffn2_weight,
            "moe_ffn1_weight_scale": ffn1_weight_scale,
            "moe_ffn2_weight_scale": ffn2_weight_scale
        }
        for name, tensor in name_tensor_map.items():
            create_and_set_parameter(layer, name, tensor)

    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP prefill method:
        select -> per-token fp8 quant -> dispatch -> grouped GEMMs -> combine.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_prefill_runner.moe_select(
            layer, gate_out)
        # 2. Dynamic compute blockwise quantization scales
        x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
            x, self.quant_config.weight_block_size[0])
        # 3. EP Dispatch
        (
            recv_x,
            recv_topk_idx,
            recv_topk_weights,
            recv_num_tokens_per_expert_list,
            handle,
            _,
        ) = self.ep_prefill_runner.dispatch(x,
                                            topk_idx,
                                            topk_weights,
                                            x_scale_tensor=x_scale_tensor)
        token_all_num = sum(recv_num_tokens_per_expert_list)

        # 4. Compute ffn (skipped when this rank received no tokens)
        if token_all_num > 0:
            logger.info(f"token_all_num {token_all_num}")
            (recv_x, recv_x_scale) = recv_x

            # tmp[0]/tmp[1]: per-expert token counts used by the dispatcher.
            tmp = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
            (
                permute_input,
                permute_scale,
                permute_indices_per_token,
                recv_num_tokens_per_expert_list_cumsum,
                recv_num_tokens_per_expert_list_padded_cumsum,
                dst_weights,
                dst_indices,
                cumsum_idx_gpu,
                m_indices,
            ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
                recv_x,
                recv_x_scale,
                recv_topk_idx,
                recv_topk_weights,
                tmp[0],
                tmp[1]
            )

            # NOTE(review): transpose -> contiguous -> transpose-back keeps
            # the logical shape but leaves the data column-major; presumably
            # the layout deep_gemm expects for scales -- confirm.
            permute_scale = permute_scale.transpose([1, 0]).contiguous()
            permute_scale = permute_scale.transpose([1, 0])

            # ffn1
            ffn_out = paddle.empty(
                (permute_input.shape[0], layer.moe_ffn1_weight.shape[1]),
                dtype=paddle.bfloat16,
            )
            deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
                (permute_input, permute_scale),
                (layer.moe_ffn1_weight, layer.moe_ffn1_weight_scale),
                ffn_out,
                m_indices,
            )

            # swiglu
            ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None)

            # ffn2: re-quantize the activation before the second grouped GEMM.
            ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
                ffn_out, self.quant_config.weight_block_size[0])
            ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose(
                [1, 0]).contiguous()
            ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
            ffn_out = paddle.empty(
                (ffn_out.shape[0], layer.moe_ffn2_weight.shape[1]),
                dtype=paddle.bfloat16)
            deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
                (ffn_in_x, ffn_in_x_scale_tensor),
                (layer.moe_ffn2_weight, layer.moe_ffn2_weight_scale),
                ffn_out,
                m_indices,
            )

            # prmt back per rank
            tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
                ffn_out,
                dst_weights,
                permute_indices_per_token,
                dst_indices,
                None,  # moe_ffn2_bias
                False,  # norm_topk_prob
                1.0,
            )[0]
        else:
            # No tokens: pass the (fp8) payload through as bf16.
            tmp_ffn_out = paddle.cast(recv_x[0], paddle.bfloat16)

        # 5. EP combine
        return self.ep_prefill_runner.combine(tmp_ffn_out, handle,
                                              recv_topk_weights)

    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP decoder (low-latency, masked grouped GEMM) method.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_decoder_runner.moe_select(
            layer, gate_out)
        # 2. EP Dispatch (fp8 payload: (data, scale) tuple)
        permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
            x, topk_idx, topk_weights, use_fp8=True)

        # 3. Compute ffn
        assert isinstance(permute_input, tuple)
        ffn1_out = paddle.empty(
            [
                layer.num_local_experts,
                layer.ep_size *
                layer.moe_config.num_max_dispatch_tokens_per_rank,
                layer.moe_intermediate_size * 2,
            ],
            dtype=paddle.bfloat16,
        )
        ffn_out = paddle.empty(
            [
                layer.num_local_experts,
                layer.ep_size *
                layer.moe_config.num_max_dispatch_tokens_per_rank,
                layer.hidden_size,
            ],
            dtype=paddle.bfloat16,
        )
        # NOTE(review): expected_m is a per-expert token-count hint for the
        # masked grouped GEMM scheduler; 128 appears to be a tuned default --
        # confirm against the deep_gemm API.
        expected_m = 128
        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
            permute_input,
            (
                layer.moe_ffn1_weight,
                layer.moe_ffn1_weight_scale,
            ),
            ffn1_out,
            token_nums_per_expert,
            expected_m,
        )
        act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked(
            ffn1_out, token_nums_per_expert)

        act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
            act_out, token_nums_per_expert,
            self.quant_config.weight_block_size[0])

        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
            (act_out_fp8, scale),
            (
                layer.moe_ffn2_weight,
                layer.moe_ffn2_weight_scale,
            ),
            ffn_out,
            token_nums_per_expert,
            expected_m,
        )
        # 4. EP combine
        return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights,
                                              handle)

    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Paddle Use DeepGemm compute Fused MoE.
        below is TP compute method.
        """
        topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
            gate_out,
            layer.gate_correction_bias,
            layer.top_k,
            True,  # apply_norm_weight
            False,
        )

        tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts)

        # Per-token fp8 quantization with block size 128.
        recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
            x, 128)
        (
            permute_input,
            permute_scale,
            permute_indices_per_token,
            recv_num_tokens_per_expert_list_cumsum,
            recv_num_tokens_per_expert_list_padded_cumsum,
            dst_weights,
            dst_indices,
            cumsum_idx_gpu,
            m_indices,
        ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
            recv_x,
            recv_x_scale,
            topk_ids,
            topk_weights,
            tmp[0],
            tmp[1],
        )

        # Same column-major scale-layout trick as in apply_ep_prefill.
        permute_scale = permute_scale.transpose([1, 0]).contiguous()
        permute_scale = permute_scale.transpose([1, 0])

        # ffn1
        ffn_out = paddle.empty(
            (permute_input.shape[0], layer.moe_ffn1_weight.shape[1]),
            dtype=paddle.bfloat16,
        )
        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
            (permute_input, permute_scale),
            (layer.moe_ffn1_weight, layer.moe_ffn1_weight_scale),
            ffn_out,
            m_indices,
        )

        # swiglu
        ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out)

        # ffn2
        ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
            ffn_out, self.quant_config.weight_block_size[0])
        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose(
            [1, 0]).contiguous()
        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
        ffn_out = paddle.empty(
            (ffn_out.shape[0], layer.moe_ffn2_weight.shape[1]),
            dtype=paddle.bfloat16)
        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
            (ffn_in_x, ffn_in_x_scale_tensor),
            (layer.moe_ffn2_weight, layer.moe_ffn2_weight_scale),
            ffn_out,
            m_indices,
        )

        # prmt back per rank
        tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
            ffn_out,
            dst_weights,
            permute_indices_per_token,
            dst_indices,
            None,
            False,  # norm_topk_prob
            1.0,
        )[0]

        if layer.tp_size > 1:
            tensor_model_parallel_all_reduce(tmp_ffn_out)

        return tmp_ffn_out
@@ -0,0 +1,285 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
import fastdeploy
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.ops.gpu import (MoeWna16MarlinGemmApi,
tritonmoe_preprocess_func)
from ..quantization.quant_base import QuantMethodBase
def gptq_marlin_moe_repack(b_q_weight: paddle.Tensor, perm: paddle.Tensor,
                           size_k: int, size_n: int,
                           num_bits: int) -> paddle.Tensor:
    """
    Repack each expert's GPTQ-quantized weight into the Marlin layout.

    Args:
        b_q_weight: stacked quantized weights; first dim is num_experts.
        perm: per-expert permutation indices passed to the repack op.
        size_k: reduction dimension; must be a multiple of 16.
        size_n: output dimension.
        num_bits: quantization bit width.

    Returns:
        Tensor of shape [num_experts, size_k // 16, size_n * (num_bits // 2)]
        holding one repacked slice per expert.
    """
    from fastdeploy.model_executor.ops.gpu import gptq_marlin_repack

    assert size_k % 16 == 0
    expert_count = b_q_weight.shape[0]
    repacked = paddle.empty(
        [expert_count, size_k // 16, size_n * (num_bits // 2)],
        dtype=b_q_weight.dtype)
    # The repack op works on one expert slice at a time.
    for expert_id in range(expert_count):
        repacked[expert_id] = gptq_marlin_repack(b_q_weight[expert_id],
                                                 perm[expert_id], size_k,
                                                 size_n, num_bits)
    return repacked
def get_scale_perms():
    """
    Build the two Marlin scale permutation tables.

    Returns:
        tuple: (scale_perm, scale_perm_single); scale_perm has 64 entries and
        is used for grouped scales, scale_perm_single has 32 entries and is
        used for per-channel (single-row) scales.
    """
    scale_perm: list[int] = [i + 8 * j for i in range(8) for j in range(8)]
    scale_perm_single: list[int] = [
        2 * i + j for i in range(4) for j in (0, 1, 8, 9, 16, 17, 24, 25)
    ]
    return scale_perm, scale_perm_single
def marlin_permute_scales(s: paddle.Tensor, size_k: int, size_n: int,
                          group_size: int) -> paddle.Tensor:
    """
    Permute one expert's scale tensor into the Marlin scale layout.

    Grouped quantization (0 < group_size < size_k) uses the 64-entry
    permutation; per-channel quantization (group_size == -1 or
    group_size >= size_k) uses the 32-entry single-row permutation.
    """
    scale_perm, scale_perm_single = get_scale_perms()
    grouped = group_size != -1 and group_size < size_k
    perm = scale_perm if grouped else scale_perm_single
    s = s.reshape([-1, len(perm)])[:, perm]
    return s.reshape((-1, size_n)).contiguous()
def marlin_moe_permute_scales(
    s: paddle.Tensor,
    size_k: int,
    size_n: int,
    group_size: int,
):
    """
    Apply marlin_permute_scales to every expert slice of a stacked scale
    tensor and return the stacked result with the same shape as *s*.
    """
    expert_count = s.shape[0]
    permuted = paddle.empty(
        [expert_count, s.shape[1], s.shape[2]],
        dtype=s.dtype,
    )
    for expert_id in range(expert_count):
        permuted[expert_id] = marlin_permute_scales(s[expert_id], size_k,
                                                    size_n, group_size)
    return permuted
class MarlinWeightOnlyMoEMethod(QuantMethodBase):
    """
    Use Marlin Group Gemm to compute Fused MoE.
    """

    def __init__(self, quant_method=None):
        """
        Marlin Group Gemm to compute Fused MoE.

        Args:
            quant_method: optional quantization method descriptor; kept for
                interface parity with the other MoE backends.
        """
        self.quant_method = quant_method
        # Parameter names registered on the layer by create_weights().
        self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        self.added_scale_attrs = [
            "moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
        ]
        self.added_zeros_attrs = ["zeros0", "zeros1"]

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Marlin MoE create weight process.

        Quantizes the stacked FFN weights to int4 (symmetric, per-channel),
        packs 8 nibbles per int32 along K, repacks weights and scales into
        the Marlin kernel layout, and registers them as layer parameters.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        assert len(ffn1_weights) == layer.num_local_experts
        assert len(ffn2_weights) == layer.num_local_experts
        assert ffn1_weights[0].shape == [
            layer.hidden_size, layer.moe_intermediate_size * 2
        ]
        assert ffn2_weights[0].shape == [
            layer.moe_intermediate_size, layer.hidden_size
        ]

        ffn1_tensor = paddle.stack(ffn1_weights, axis=0)
        ffn2_tensor = paddle.stack(ffn2_weights, axis=0)

        max_bound = 7  # int4 symmetric range is [-7, 7]
        for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
            weight_name = self.added_weight_attrs[idx]
            scale_name = self.added_scale_attrs[idx]

            # Per-(expert, output-channel) absmax scale over the K axis.
            weight_scale = weight_tensor.abs().max(axis=1)
            quanted_weight = weight_tensor / weight_scale[:,
                                                          None, :] * max_bound
            quanted_weight = paddle.round(quanted_weight).astype("int32")
            quanted_weight[quanted_weight > 7] = 7
            quanted_weight[quanted_weight < -7] = -7
            # Shift into the unsigned nibble range expected by uint4b8.
            quanted_weight += 8

            # Pack 8 consecutive int4 values along K into one int32.
            E, K, N = quanted_weight.shape
            quanted_weight = quanted_weight.reshape([0, K // 8, 8, N])
            res = paddle.zeros([E, K // 8, N], dtype='int32')
            for j in range(8):
                tmp = quanted_weight[:, :, j, :]
                res = res | (tmp << (j * 4))
            quanted_weight = paddle.assign(res)

            weight_scale = weight_scale / max_bound
            weight_scale = weight_scale[:, None, :]

            group_size = -1  # means per_channel

            g_idx_sort_indices = paddle.empty([E, 0], dtype="int32")
            quanted_weight = gptq_marlin_moe_repack(
                quanted_weight,
                g_idx_sort_indices,
                K,
                N,
                4,
            )
            weight_scale = marlin_moe_permute_scales(
                weight_scale,
                size_k=layer.moe_intermediate_size,  # unused for per-channel
                size_n=N,
                group_size=group_size)

            for (name, tensor) in [(weight_name, quanted_weight),
                                   (scale_name, weight_scale)]:
                setattr(
                    layer, name,
                    layer.create_parameter(
                        shape=tensor.shape,
                        dtype=tensor.dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ))
                getattr(layer, name).set_value(tensor)

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Marlin compute Fused MoE.

        Runs top-k routing, then two Marlin grouped GEMMs
        (ffn1 -> swiglu -> ffn2) and sums the top_k expert outputs per token.
        """
        token_num = x.shape[0]
        top_k = layer.top_k
        moe_intermediate_size = layer.moe_intermediate_size
        hidden_size = layer.hidden_size
        num_experts = layer.num_experts

        # NOTE(review): the incoming gate_out argument is recomputed here from
        # layer.gate_weight; confirm whether callers rely on passing their own.
        gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)

        topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
            gate_out,
            layer.gate_correction_bias,
            top_k,
            True,  # apply_norm_weight,
            False,
        )

        # Pick the smallest block size that keeps expert tiles reasonably full.
        block_size_m = 64
        for m in [8, 16, 32, 48, 64]:
            if token_num * top_k / num_experts / m < 0.9:
                block_size_m = m
                break

        topk = top_k
        # for H100 132 sms
        workspace = paddle.empty([528], dtype="int32")

        sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func(
            topk_ids, num_experts, block_size_m)

        ffn_out = MoeWna16MarlinGemmApi(
            x,
            c_or_none=None,
            b_q_weight=layer.moe_ffn1_weight,
            b_scales=layer.moe_ffn1_weight_scale,
            global_scale_or_none=None,
            b_zeros_or_none=None,
            g_idx_or_none=None,
            perm_or_none=None,
            workspace=workspace,
            sorted_token_ids=sorted_token_ids,
            expert_ids=expert_ids,
            num_tokens_post_padded=num_tokens_post_padded,
            topk_weights=topk_weights,
            moe_block_size=block_size_m,
            top_k=topk,
            mul_topk_weights=False,
            is_ep=False,
            b_q_type_str="uint4b8",
            size_m=token_num,
            size_n=moe_intermediate_size * 2,
            size_k=hidden_size,
            is_k_full=True,
            use_atomic_add=True,
            use_fp32_reduce=True,
            is_zp_float=False)[0]

        swiglu_out = paddle.incubate.nn.functional.swiglu(ffn_out)

        ffn_out = MoeWna16MarlinGemmApi(
            swiglu_out,
            c_or_none=None,
            b_q_weight=layer.moe_ffn2_weight,
            b_scales=layer.moe_ffn2_weight_scale,
            global_scale_or_none=None,
            b_zeros_or_none=None,
            g_idx_or_none=None,
            perm_or_none=None,
            workspace=workspace,
            sorted_token_ids=sorted_token_ids,
            expert_ids=expert_ids,
            num_tokens_post_padded=num_tokens_post_padded,
            topk_weights=topk_weights,
            moe_block_size=block_size_m,
            top_k=1,  # rows are already expanded top_k-fold
            mul_topk_weights=True,
            is_ep=False,
            b_q_type_str="uint4b8",
            size_m=token_num * topk,
            size_n=hidden_size,
            size_k=moe_intermediate_size,
            is_k_full=True,
            use_atomic_add=True,
            use_fp32_reduce=True,
            is_zp_float=False)[0]

        # Sum the top_k expert outputs back to one row per token.
        ffn_out.reshape_([token_num, -1, hidden_size])
        ffn_out = ffn_out.sum(axis=1)

        if layer.tp_size > 1:
            tensor_model_parallel_all_reduce(ffn_out)

        return ffn_out
@@ -1,57 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import abstractmethod
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantMethodBase
class FusedMoEMethodBase(QuantMethodBase):
    """
    Abstract base class for fused-MoE compute methods.

    Every MoE backend must inherit from this class and implement both
    create_weights() and apply().
    """

    @abstractmethod
    def create_weights(self,
                       layer: nn.Layer,
                       moe_compute_params,
                       ffn1_tensor,
                       ffn2_tensor,
                       ffn1_bias=None,
                       ffn2_bias=None):
        """
        Create and register this method's weight parameters on *layer*.

        Args:
            layer: the MoE layer to attach parameters to.
            moe_compute_params: backend-specific compute parameters.
            ffn1_tensor: first-FFN weight tensor(s).
            ffn2_tensor: second-FFN weight tensor(s).
            ffn1_bias: optional first-FFN bias.
            ffn2_bias: optional second-FFN bias.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    @abstractmethod
    def apply(
        self,
        layer: nn.Layer,
        moe_compute_params,
        x: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Run the fused MoE computation on input *x* and return the result.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError
@@ -0,0 +1,479 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.layers.utils import (create_hadamard_matrix_map,
get_tensor)
from fastdeploy.utils import ceil_div
from ..quantization.quant_base import QuantMethodBase
class TritonWeightOnlyMoEMethod(QuantMethodBase):
    """
    Use Triton Group Gemm to compute Fused MoE.
    """

    def __init__(self, quant_method=None):
        """
        Triton Group Gemm to compute Fused MoE.

        Args:
            quant_method: optional quantization method descriptor; kept for
                interface parity with the other MoE backends.
        """
        self.quant_method = quant_method
        # Parameter names registered on the layer by create_weights().
        self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        self.added_scale_attrs = [
            "moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
        ]

    def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
        """process_prequanted_weights (no-op for this backend)"""
        pass

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Triton MoE create weight process.

        Applies symmetric per-(expert, output-channel) absmax quantization to
        the stacked FFN weights and registers the int8 weights and their
        float scales as layer parameters.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        assert len(ffn1_weights) == layer.num_local_experts
        assert len(ffn2_weights) == layer.num_local_experts
        assert layer.quant_method.quant_config.name() == "wint8"
        assert ffn1_weights[0].shape == [
            layer.hidden_size, layer.moe_intermediate_size * 2
        ]
        assert ffn2_weights[0].shape == [
            layer.moe_intermediate_size, layer.hidden_size
        ]

        ffn1_tensor = paddle.stack(ffn1_weights, axis=0)
        ffn2_tensor = paddle.stack(ffn2_weights, axis=0)

        # NOTE(review): self.quant_config is never assigned in __init__; this
        # relies on it being attached externally — confirm against callers.
        if self.quant_config.name() == "wint8":
            max_bound = 127
        elif self.quant_config.name() == "wint4":
            max_bound = 7
        else:
            # Previously an unknown name fell through and left max_bound
            # unbound (NameError); fail explicitly instead.
            raise ValueError(
                f"Unsupported quant type: {self.quant_config.name()}")

        for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
            weight_name = self.added_weight_attrs[idx]
            scale_name = self.added_scale_attrs[idx]

            # Per-(expert, output-channel) absmax scale over the K axis.
            quanted_weight_scale = weight_tensor.abs().max(axis=1)
            quanted_weight = weight_tensor / quanted_weight_scale[:,
                                                                  None, :] * max_bound
            quanted_weight = paddle.round(quanted_weight).astype("int8")
            quanted_weight_scale = quanted_weight_scale / max_bound

            setattr(
                layer, weight_name,
                layer.create_parameter(
                    shape=quanted_weight.shape,
                    dtype=quanted_weight.dtype,
                    default_initializer=paddle.nn.initializer.Constant(0),
                ))
            getattr(layer, weight_name).set_value(quanted_weight)

            setattr(
                layer, scale_name,
                layer.create_parameter(
                    shape=quanted_weight_scale.shape,
                    dtype=quanted_weight_scale.dtype,
                ))
            getattr(layer, scale_name).set_value(quanted_weight_scale)

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Triton compute Fused MoE.

        Softmax routing, then two Triton grouped GEMMs
        (ffn1 -> swiglu -> ffn2); the routed weights are applied inside the
        second GEMM and the top_k expert outputs are summed per token.
        """
        token_num = x.shape[0]
        top_k = layer.top_k
        num_local_experts = layer.num_local_experts
        moe_intermediate_size = layer.moe_intermediate_size
        hidden_size = layer.hidden_size

        # NOTE(review): the incoming gate_out argument is recomputed here from
        # layer.gate_weight; confirm whether callers rely on passing their own.
        gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
        scores = paddle.nn.functional.softmax(gate_out, axis=-1)
        topk_weights, topk_ids = paddle.topk(scores,
                                             k=top_k,
                                             axis=-1,
                                             sorted=False)
        topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)

        # Output buffers for the two GEMMs; the swiglu result between them is
        # produced directly by the swiglu op, so no buffer is preallocated.
        intermediate_cache1 = paddle.empty(
            [token_num * top_k, moe_intermediate_size * 2],
            dtype=x.dtype,
        )
        intermediate_cache3 = paddle.empty(
            (token_num * top_k, hidden_size),
            dtype=x.dtype,
        )

        config = {
            "BLOCK_SIZE_M": 32,
            "BLOCK_SIZE_N": 128,
            "BLOCK_SIZE_K": 128,
            "GROUP_SIZE_M": 1,
        }

        from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess

        from .triton_moe_kernels import fused_moe_kernel_paddle

        sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
            topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
        max_num_tokens_padded = sorted_token_ids.shape[0]

        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
                ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )

        fused_moe_kernel_paddle[grid](
            x,
            layer.moe_ffn1_weight,
            intermediate_cache1,
            None,
            layer.moe_ffn1_weight_scale,
            None,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            moe_intermediate_size * 2,
            hidden_size,
            max_num_tokens_padded,
            token_num * top_k,
            stride_am=x.strides[0],
            stride_ak=x.strides[1],
            stride_be=layer.moe_ffn1_weight.strides[0],
            stride_bk=layer.moe_ffn1_weight.strides[1],
            stride_bn=layer.moe_ffn1_weight.strides[2],
            stride_cm=intermediate_cache1.strides[0],
            stride_cn=intermediate_cache1.strides[1],
            # -1 marks strides that are unused for per-channel int8 scales.
            stride_asm=-1,
            stride_ask=-1,
            stride_bse=layer.moe_ffn1_weight_scale.strides[0],
            stride_bsk=-1,
            stride_bsn=layer.moe_ffn1_weight_scale.strides[1],
            group_n=-1,
            group_k=-1,
            # Meta-parameters
            BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
            BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
            BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
            GROUP_SIZE_M=config["GROUP_SIZE_M"],
            MUL_ROUTED_WEIGHT=False,
            top_k=top_k,
            compute_type_enum=1,
            use_fp8_w8a8=False,
            use_int8_w8a16=True,
            even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
        )

        intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
            intermediate_cache1)

        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
                ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )

        fused_moe_kernel_paddle[grid](
            intermediate_cache2,
            layer.moe_ffn2_weight,
            intermediate_cache3,
            None,
            layer.moe_ffn2_weight_scale,
            topk_weights,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            hidden_size,
            moe_intermediate_size,
            max_num_tokens_padded,
            token_num * top_k,
            stride_am=intermediate_cache2.strides[0],
            stride_ak=intermediate_cache2.strides[1],
            stride_be=layer.moe_ffn2_weight.strides[0],
            stride_bk=layer.moe_ffn2_weight.strides[1],
            stride_bn=layer.moe_ffn2_weight.strides[2],
            stride_cm=intermediate_cache3.strides[0],
            stride_cn=intermediate_cache3.strides[1],
            stride_asm=-1,
            stride_ask=-1,
            stride_bse=layer.moe_ffn2_weight_scale.strides[0],
            stride_bsk=-1,
            stride_bsn=layer.moe_ffn2_weight_scale.strides[1],
            group_n=-1,
            group_k=-1,
            # Meta-parameters
            BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
            BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
            BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
            GROUP_SIZE_M=config["GROUP_SIZE_M"],
            MUL_ROUTED_WEIGHT=True,  # apply routing weights on the way out
            top_k=1,  # rows are already expanded top_k-fold
            compute_type_enum=1,
            use_fp8_w8a8=False,
            use_int8_w8a16=True,
            even_Ks=moe_intermediate_size % config["BLOCK_SIZE_K"] == 0,
        )

        # Sum the top_k expert outputs back to one row per token.
        intermediate_cache3.reshape_([token_num, top_k, hidden_size])
        out = intermediate_cache3.sum(axis=1)
        # NOTE(review): unlike the other backends, no tensor-parallel
        # all-reduce is performed here — confirm this is intentional.
        return out
class TensorWiseFP8MoEMethod(QuantMethodBase):
    """
    Tensor-wise FP8 (w8a8) Fused MoE computed with the Triton grouped GEMM
    kernel; activations are rotated by a Hadamard matrix before quantization.
    """

    def __init__(self, quant_method=None):
        """
        Triton Group Gemm to compute Fused MoE.

        Args:
            quant_method: optional quantization method descriptor; kept for
                interface parity with the other MoE backends.
        """
        self.quant_method = quant_method

    def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
        """
        Load prequantized FP8 expert weights plus their weight and activation
        scales from *state_dict* and register them as layer parameters.
        """
        ffn1_tensor, ffn2_tensor = layer.extract_moe_ffn_weights(state_dict)
        assert ffn1_tensor[0].shape == [
            layer.hidden_size, layer.moe_intermediate_size * 2
        ]
        assert ffn2_tensor[0].shape == [
            layer.moe_intermediate_size, layer.hidden_size
        ]
        ffn1_tensor = paddle.stack(ffn1_tensor, axis=0)
        ffn2_tensor = paddle.stack(ffn2_tensor, axis=0)

        added_wfp8afp8_attrs = [
            "moe_ffn1_weight", "moe_ffn2_weight", "moe_ffn1_weight_scale",
            "moe_ffn2_weight_scale", "moe_ffn1_in_scale", "moe_ffn2_in_scale"
        ]

        def _extract_scale_tensor(key_template):
            # Collect one scalar scale per expert into a float32 tensor.
            result = []
            for i in range(layer.num_experts):
                result.append(
                    get_tensor(state_dict.pop(key_template.format(i))))
            return paddle.concat(result).cast("float32")

        weight_key_map = layer.weight_key_map
        moe_ffn1_weight_scale = _extract_scale_tensor(
            weight_key_map["ffn1_expert_weight_scale_key"])
        moe_ffn2_weight_scale = _extract_scale_tensor(
            weight_key_map["ffn2_expert_weight_scale_key"])
        moe_ffn1_in_scale = _extract_scale_tensor(
            weight_key_map["ffn1_expert_in_scale_key"])
        moe_ffn2_in_scale = _extract_scale_tensor(
            weight_key_map["ffn2_expert_in_scale_key"])

        for idx, weight_tensor in enumerate([
                ffn1_tensor, ffn2_tensor, moe_ffn1_weight_scale,
                moe_ffn2_weight_scale, moe_ffn1_in_scale, moe_ffn2_in_scale
        ]):
            name = added_wfp8afp8_attrs[idx]
            setattr(
                layer, name,
                layer.create_parameter(
                    shape=weight_tensor.shape,
                    dtype=weight_tensor.dtype,
                    default_initializer=paddle.nn.initializer.Constant(0),
                ))
            getattr(layer, name).set_value(weight_tensor)

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Triton MoE create weight process (no-op: weights are prequantized and
        handled by process_prequanted_weights).
        """
        pass

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Triton compute Fused MoE.

        Softmax routing, Hadamard rotation + FP8 quantization of activations,
        two Triton grouped GEMMs (ffn1 -> swiglu -> ffn2), then per-token
        reduction and (when tp_size > 1) a tensor-parallel all-reduce.
        """
        token_num = x.shape[0]
        top_k = layer.top_k
        num_local_experts = layer.num_local_experts
        moe_intermediate_size = layer.moe_intermediate_size
        hidden_size = layer.hidden_size

        # NOTE(review): the incoming gate_out argument is recomputed here from
        # layer.gate_weight; confirm whether callers rely on passing their own.
        gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
        scores = paddle.nn.functional.softmax(gate_out, axis=-1)
        topk_weights, topk_ids = paddle.topk(scores,
                                             k=top_k,
                                             axis=-1,
                                             sorted=False)
        topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)

        # Output buffers for the two GEMMs; the swiglu result between them is
        # produced directly by the swiglu op, so no buffer is preallocated.
        intermediate_cache1 = paddle.empty(
            [token_num * top_k, moe_intermediate_size * 2],
            dtype=x.dtype,
        )
        intermediate_cache3 = paddle.empty(
            (token_num * top_k, hidden_size),
            dtype=x.dtype,
        )

        config = {
            "BLOCK_SIZE_M": 32,
            "BLOCK_SIZE_N": 128,
            "BLOCK_SIZE_K": 128,
            "GROUP_SIZE_M": 1,
        }

        from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess

        sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
            topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
        max_num_tokens_padded = sorted_token_ids.shape[0]

        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
                ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )

        # Rotate activations by the Hadamard matrix, expand each token
        # top_k-fold, then quantize with the per-expert input scale.
        hadamard_matrix = create_hadamard_matrix_map[hidden_size]
        x = paddle.matmul(x.cast("float32"), hadamard_matrix)

        permute_x = x[:, None, :].tile([1, top_k, 1])
        permute_x = permute_x.reshape([-1, hidden_size])
        quant_activation_scale = layer.moe_ffn1_in_scale[topk_ids].reshape(
            [-1, 1])
        permute_x = permute_x / quant_activation_scale
        permute_x = permute_x.astype("float8_e4m3fn")

        from .triton_moe_kernels import fused_moe_kernel_paddle

        fused_moe_kernel_paddle[grid](
            permute_x,
            layer.moe_ffn1_weight.view(paddle.float8_e4m3fn),
            intermediate_cache1,
            layer.moe_ffn1_in_scale,
            layer.moe_ffn1_weight_scale,
            None,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            moe_intermediate_size * 2,
            hidden_size,
            max_num_tokens_padded,
            token_num * top_k,
            # fix: strides must come from the actual A operand (permute_x),
            # not from x, which has a different dtype/row count.
            stride_am=permute_x.strides[0],
            stride_ak=permute_x.strides[1],
            stride_be=layer.moe_ffn1_weight.strides[0],
            stride_bk=layer.moe_ffn1_weight.strides[1],
            stride_bn=layer.moe_ffn1_weight.strides[2],
            stride_cm=intermediate_cache1.strides[0],
            stride_cn=intermediate_cache1.strides[1],
            #
            stride_asm=-1,  # only used in blockwise fp8
            stride_ask=-1,  # only used in blockwise fp8
            stride_bse=-1,
            stride_bsk=-1,
            stride_bsn=-1,
            group_n=-1,
            group_k=-1,
            # Meta-parameters
            BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
            BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
            BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
            GROUP_SIZE_M=config["GROUP_SIZE_M"],
            MUL_ROUTED_WEIGHT=False,
            top_k=1,  # input rows are already expanded top_k-fold
            compute_type_enum=1,
            use_fp8_w8a8=True,
            use_int8_w8a16=False,
            even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
        )

        intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
            intermediate_cache1)

        # Rotate + quantize again before the second GEMM.
        hadamard_matrix = create_hadamard_matrix_map[moe_intermediate_size]
        intermediate_cache2 = paddle.matmul(
            intermediate_cache2.cast("float32"), hadamard_matrix)

        quant_activation_scale = layer.moe_ffn2_in_scale[topk_ids].reshape(
            [-1, 1])
        intermediate_cache2 = intermediate_cache2 / quant_activation_scale
        intermediate_cache2 = intermediate_cache2.astype("float8_e4m3fn")

        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
                ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )

        fused_moe_kernel_paddle[grid](
            intermediate_cache2,
            layer.moe_ffn2_weight.view(paddle.float8_e4m3fn),
            intermediate_cache3,
            layer.moe_ffn2_in_scale,
            layer.moe_ffn2_weight_scale,
            topk_weights,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            hidden_size,
            moe_intermediate_size,
            max_num_tokens_padded,
            token_num * top_k,
            stride_am=intermediate_cache2.strides[0],
            stride_ak=intermediate_cache2.strides[1],
            stride_be=layer.moe_ffn2_weight.strides[0],
            stride_bk=layer.moe_ffn2_weight.strides[1],
            stride_bn=layer.moe_ffn2_weight.strides[2],
            stride_cm=intermediate_cache3.strides[0],
            stride_cn=intermediate_cache3.strides[1],
            stride_asm=-1,
            stride_ask=-1,
            stride_bse=-1,
            stride_bsk=-1,
            stride_bsn=-1,
            group_n=-1,
            group_k=-1,
            # Meta-parameters
            BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
            BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
            BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
            GROUP_SIZE_M=config["GROUP_SIZE_M"],
            MUL_ROUTED_WEIGHT=True,  # apply routing weights on the way out
            top_k=1,
            compute_type_enum=1,
            use_fp8_w8a8=True,
            use_int8_w8a16=False,
            even_Ks=moe_intermediate_size % config["BLOCK_SIZE_K"] == 0,
        )

        # Sum the top_k expert outputs back to one row per token.
        intermediate_cache3.reshape_([token_num, top_k, hidden_size])
        out = intermediate_cache3.sum(axis=1)

        if layer.tp_size > 1:
            tensor_model_parallel_all_reduce(out)

        return out
@@ -0,0 +1,236 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
import fastdeploy
from ..quantization.quant_base import QuantMethodBase
from ..utils import create_and_set_parameter, get_tensor
class Wint2MoeMethod(QuantMethodBase):
    """
    Base compute method for wint2-quantized Fused MoE.
    """

    def __init__(self, quant_config):
        """Store the wint2 quant type from *quant_config*."""
        super().__init__()
        self.moe_quant_type = quant_config.moe_quant_type

    def process_loaded_weights(self, layer, weights) -> None:
        """
        process_loaded_weights (no-op in the base class)
        """
        pass

    def check(self, layer: nn.Layer, ffn1_weights, ffn2_weights):
        """
        Validate that exactly one weight tensor was provided per local expert.
        """
        expected = layer.num_local_experts
        assert len(
            ffn1_weights
        ) == expected, "ffn1_weights length should be equal to num_local_experts."
        assert len(
            ffn2_weights
        ) == expected, "ffn2_weights length should be equal to num_local_experts."

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Paddle cutlass create weight process (no-op in the base class).
        """
        pass
class TritonWint2FusedMoeMethod(Wint2MoeMethod):
    """
    Wint2 Fused MoE backed by the moe_expert_ffn_wint2 op.
    """

    def __init__(self, quant_config):
        """Forward *quant_config* to the base class."""
        # Wint2MoeMethod.__init__ already stores moe_quant_type; the previous
        # duplicate assignment here was redundant.
        super().__init__(quant_config)

    def process_loaded_weights(self, layer, weights) -> None:
        """
        process_loaded_weights (no-op: weights are prequantized)
        """
        pass

    def process_prequanted_weights(self, layer: nn.Layer, state_dict):
        """
        Paddle cutlass process prequanted weights.

        Loads the per-expert wint2 weights plus their weight/super/code
        scales and code zero-points from *state_dict* and registers each
        stacked tensor as a layer parameter.
        """
        weight_key_map = layer.weight_key_map

        ffn1_expert_weight_key = weight_key_map.get(
            "ffn1_expert_weight_key", None)
        ffn2_expert_weight_key = weight_key_map.get(
            "ffn2_expert_weight_key", None)

        ffn1_weights, ffn2_weights = layer.load_experts_weight(
            state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
        # self.check(layer, ffn1_weights, ffn2_weights)

        def _load_per_expert(key_template):
            # Pop one tensor per local expert; expert ids start at
            # layer.expert_id_offset.
            return [
                get_tensor(
                    state_dict.pop(
                        key_template.format(layer.expert_id_offset + i)))
                for i in range(layer.num_experts)
            ]

        name_tensor_map = {
            "moe_ffn1_weight": paddle.stack(ffn1_weights, axis=0),
            "moe_ffn2_weight": paddle.stack(ffn2_weights, axis=0),
        }
        # Parameter name -> weight_key_map entry for the auxiliary scale /
        # zero-point tensors; all are loaded and stacked the same way.
        scale_key_names = {
            "moe_ffn1_weight_scale": "ffn1_expert_weight_scale_key",
            "moe_ffn2_weight_scale": "ffn2_expert_weight_scale_key",
            "moe_ffn1_super_scales": "ffn1_expert_super_scales_key",
            "moe_ffn2_super_scales": "ffn2_expert_super_scales_key",
            "moe_ffn1_code_scale": "ffn1_expert_code_scale_key",
            "moe_ffn2_code_scale": "ffn2_expert_code_scale_key",
            "moe_ffn1_code_zp": "ffn1_expert_code_zp_key",
            "moe_ffn2_code_zp": "ffn2_expert_code_zp_key",
        }
        for param_name, key_name in scale_key_names.items():
            tensors = _load_per_expert(weight_key_map.get(key_name, None))
            name_tensor_map[param_name] = paddle.stack(tensors, axis=0)

        for name, tensor in name_tensor_map.items():
            create_and_set_parameter(layer, name, tensor)

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Paddle cutlass create weight process (no-op: prequantized path only).
        """
        pass

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Use Wint2 Triton Fusedmoe compute Fused MoE.

        Dispatches tokens to experts, runs the wint2 expert FFN op, then
        reduces the per-expert outputs back to one row per token.
        """
        from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch
        (
            permute_input,
            token_nums_per_expert,
            permute_indices_per_token,
            topk_weights,
            topk_idx,
            expert_idx_per_token,
        ) = moe_expert_dispatch(
            x,
            gate_out,
            layer.gate_correction_bias,
            (layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
             else None),  # if set, permute_input will be int8_t
            layer.top_k,
            False,
            topk_only_mode=False,
        )

        ffn_out = fastdeploy.model_executor.ops.gpu.moe_expert_ffn_wint2(
            permute_input,
            token_nums_per_expert,
            layer.moe_ffn1_weight,
            layer.moe_ffn2_weight,
            None,
            layer.moe_ffn1_super_scales,
            layer.moe_ffn2_super_scales,
            layer.moe_ffn1_weight_scale,
            layer.moe_ffn1_code_scale,
            layer.moe_ffn1_code_zp,
            layer.moe_ffn2_weight_scale,
            layer.moe_ffn2_code_scale,
            layer.moe_ffn2_code_zp,
            False,
        )

        from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
        fused_moe_out = moe_expert_reduce(
            ffn_out,
            topk_weights,
            permute_indices_per_token,
            topk_idx,
            None,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )

        return fused_moe_out
-273
View File
@@ -1,273 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.moe.moe import MoELayer
from fastdeploy.model_executor.layers.utils import get_tensor
class TextMoELayer(MoELayer):
    """
    MoE (Mixture of Experts) layer for text tokens; expert weights are taken
    from expert indices [0, num_experts) of the shared parameter set.
    """

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        """
        Initialize the text MoE layer.

        Args:
            args: positional arguments forwarded to MoELayer.
            kwargs: keyword arguments forwarded to MoELayer; moe_tag is
                forced to "Text".

        Returns:
            None.
        """
        kwargs["moe_tag"] = "Text"
        super().__init__(*args, **kwargs)

    def load_gate_state_dict(self, state_dict):
        """
        Pop this layer's expert weights from the state dict.

        Uses expert indices [0, num_experts) — the text experts.

        Args:
            state_dict (OrderedDict): mapping holding the expert parameters.

        Returns:
            tuple: (up_gate_proj_weight, down_proj_weight,
            up_gate_proj_weight_scale, down_proj_weight_scale); the two scale
            lists are returned empty here (no quantization scales loaded).
        """
        up_gate_proj_weight = []
        up_gate_proj_weight_scale = []
        down_proj_weight = []
        down_proj_weight_scale = []
        for j in range(0, self.num_experts):
            up_gate_proj_weight.append(
                get_tensor(state_dict.pop(self.ffn1_expert_weight_key.format(j)))
            )
            down_proj_weight.append(
                get_tensor(state_dict.pop(self.ffn2_expert_weight_key.format(j)))
            )
        return (
            up_gate_proj_weight,
            down_proj_weight,
            up_gate_proj_weight_scale,
            down_proj_weight_scale,
        )

    def load_gate_correction_bias(self, state_dict):
        """
        Load the gate correction bias, when enabled in moe_config.

        Row 0 of the stored tensor (the text slice) is set on
        self.gate_correction_bias.

        Args:
            state_dict (OrderedDict): mapping holding the model parameters.

        Returns:
            None; mutates self.gate_correction_bias in place.
        """
        if self.moe_config.moe_use_gate_correction_bias:
            gate_correction_bias_tensor = get_tensor(
                state_dict[self.gate_correction_bias_key]
            )
            self.gate_correction_bias.set_value(
                gate_correction_bias_tensor[0].unsqueeze(0)
            )
class ImageMoELayer(MoELayer):
    """
    MoE (Mixture of Experts) layer for image tokens; expert weights are taken
    from expert indices [num_experts, 2 * num_experts) of the shared set.
    """

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        """
        Initialize the image MoE layer.

        The quant type can be overridden via the ELLM_MM_IMAGE_QUANT_TYPE
        environment variable; moe_tag is forced to "Image".

        Args:
            args: positional arguments forwarded to MoELayer.
            kwargs: keyword arguments forwarded to MoELayer.

        Returns:
            None.
        """
        moe_quant_type = os.getenv("ELLM_MM_IMAGE_QUANT_TYPE", None)
        if moe_quant_type is not None:
            kwargs["moe_quant_type"] = moe_quant_type
        kwargs["moe_tag"] = "Image"
        super().__init__(*args, **kwargs)

    def load_gate_state_dict(self, state_dict):
        """
        Pop this layer's expert weights from the state dict.

        Uses expert indices [num_experts, 2 * num_experts) — the image
        experts stored after the text experts.

        Args:
            state_dict (OrderedDict): mapping holding the expert parameters.

        Returns:
            tuple: (up_gate_proj_weight, down_proj_weight,
            up_gate_proj_weight_scale, down_proj_weight_scale); the two scale
            lists are returned empty here (no quantization scales loaded).
        """
        up_gate_proj_weight = []
        up_gate_proj_weight_scale = []
        down_proj_weight = []
        down_proj_weight_scale = []
        for j in range(self.num_experts, self.num_experts + self.num_experts):
            up_gate_proj_weight.append(
                get_tensor(state_dict.pop(self.ffn1_expert_weight_key.format(j)))
            )
            down_proj_weight.append(
                get_tensor(state_dict.pop(self.ffn2_expert_weight_key.format(j)))
            )
        return (
            up_gate_proj_weight,
            down_proj_weight,
            up_gate_proj_weight_scale,
            down_proj_weight_scale,
        )

    def load_gate_correction_bias(self, state_dict):
        """
        Load the gate correction bias, when enabled in moe_config.

        Row 1 of the stored tensor (the image slice) is set on
        self.gate_correction_bias.

        Args:
            state_dict (OrderedDict): mapping holding the model parameters.

        Returns:
            None; mutates self.gate_correction_bias in place.
        """
        if self.moe_config.moe_use_gate_correction_bias:
            gate_correction_bias_tensor = get_tensor(
                state_dict[self.gate_correction_bias_key]
            )
            self.gate_correction_bias.set_value(
                gate_correction_bias_tensor[1].unsqueeze(0)
            )
class MultimodalityMoeLayer(nn.Layer):
    """
    Multimodal MoE layer.

    Routes text tokens and image tokens through two separate expert pools
    (a ``TextMoELayer`` and an ``ImageMoELayer``) that are loaded from the
    same checkpoint weight keys.
    """
    def __init__(
        self,
        inference_args,
        layer_name,
        layer_idx,
    ):
        """
        Build the paired text/image MoE sub-layers.

        Args:
            inference_args: Inference configuration; ``moe_config`` is used
                for the text sub-layer and ``moe_config_1`` for the image
                sub-layer.
            layer_name (str): Name of this MoE layer; ".text" / ".image"
                suffixes are appended for the two sub-layers.
            layer_idx (int): Index of this layer in the model, used to build
                the checkpoint weight keys.

        Returns:
            None.
        """
        super().__init__()
        # NOTE(review): both sub-layers are given the same expert-weight key
        # templates; the image sub-layer is expected to consume the second
        # half of the expert index range — confirm against
        # ImageMoELayer.load_gate_state_dict.
        self.text_moe_layer = TextMoELayer(
            inference_args=inference_args,
            moe_config=inference_args.moe_config,
            layer_name=layer_name + ".text",
            gate_weight_key=f"ernie.layers.{layer_idx}.mlp.gate.weight",
            ffn1_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
            + ".{}.up_gate_proj.weight",
            ffn2_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
            + ".{}.down_proj.weight",
            gate_correction_bias_key=f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias",
            ffn1_bias_key=None,
            ffn2_bias_key=None,
            ffn1_shared_weight_key=None,
            ffn1_shared_bias_key=None,
            ffn2_shared_weight_key=None,
            ffn2_shared_bias_key=None,
            layer_idx=layer_idx,
        )
        # The image gate uses its own weight ("gate.weight_1") but shares the
        # expert weight keys and the gate-correction-bias key with the text
        # sub-layer.
        self.image_moe_layer = ImageMoELayer(
            inference_args=inference_args,
            moe_config=inference_args.moe_config_1,
            layer_name=layer_name + ".image",
            gate_weight_key=f"ernie.layers.{layer_idx}.mlp.gate.weight_1",
            ffn1_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
            + ".{}.up_gate_proj.weight",
            ffn2_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
            + ".{}.down_proj.weight",
            gate_correction_bias_key=f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias",
            ffn1_bias_key=None,
            ffn2_bias_key=None,
            ffn1_shared_weight_key=None,
            ffn1_shared_bias_key=None,
            ffn2_shared_weight_key=None,
            ffn2_shared_bias_key=None,
            layer_idx=layer_idx,
        )
    def load_state_dict(self, state_dict):
        """
        Load parameters for both sub-layers from ``state_dict``.

        Consumed entries are popped from ``state_dict`` in place.

        Args:
            state_dict (dict): Mapping from checkpoint keys to weights.

        Returns:
            None.
        """
        self.text_moe_layer.load_state_dict(state_dict)
        self.image_moe_layer.load_state_dict(state_dict)
        # The gate-correction-bias entry is shared: each sub-layer reads it
        # without popping, so it is removed here once both have loaded it.
        state_dict.pop(self.text_moe_layer.gate_correction_bias_key)
    def forward(self, x, **kwargs):
        """
        Route each token to the text or the image MoE sub-layer.

        Args:
            x (Tensor): Input tensor of shape [token_num, hidden_size].
            **kwargs: Must contain ``token_type_ids`` (Tensor of shape
                [token_num]) in which 0 marks a text token and 1 marks an
                image token.

        Returns:
            Tensor: Output tensor with the same shape as ``x``; text and
            image positions are filled from their respective sub-layers,
            any other positions stay zero.

        Raises:
            AssertionError: If ``token_type_ids`` is not provided.
        """
        token_type_ids = kwargs.get("token_type_ids", None)
        assert token_type_ids is not None
        # x.shape is [token_num, hidden_size]
        fused_moe_out = paddle.zeros_like(x)
        text_mask = token_type_ids == 0  # [token_num]
        image_mask = token_type_ids == 1
        # Run each sub-layer only on its own tokens and scatter the results
        # back into the zero-initialized output.
        if text_mask.any():
            text_out = self.text_moe_layer(x[text_mask])
            fused_moe_out[text_mask] = text_out
        if image_mask.any():
            image_out = self.image_moe_layer(x[image_mask])
            fused_moe_out[image_mask] = image_out
        return fused_moe_out
+156 -160
View File
@@ -1,5 +1,5 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,34 +14,13 @@
# limitations under the License.
"""
from dataclasses import dataclass
import paddle
from paddle import nn
from paddlenlp.utils.log import logger
from paddleformers.utils.log import logger
from fastdeploy import envs
from fastdeploy.model_executor.layers.utils import get_tensor
from .cutlass_fused_moe import CutlassFusedMoeMethod
@dataclass
class MoEComputeParams:
"""
some params for computing MoE.
it is given to different compute methods.
"""
global_num_experts: int = -1
top_k: int = -1
hidden_size: int = -1
num_local_experts: int = -1
moe_intermediate_size: int = -1
tp_size: int = -1
ep_size: int = -1
dp_size: int = -1
moe_quant_type: str = ""
class FusedMoE(nn.Layer):
"""
@@ -50,174 +29,195 @@ class FusedMoE(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
moe_intermediate_size: int = -1,
num_experts: int = -1,
expert_id_offset: int = 0,
top_k: int = -1,
moe_use_gate_correction_bias: bool = False,
moe_quant_type: str = "weight_only_int4",
layer_idx: int = -1,
gate_weight_key=None,
gate_correction_bias_key=None,
ffn1_expert_weight_key=None,
ffn2_expert_weight_key=None,
moe_ffn1_bias_keys=None,
moe_ffn2_bias_keys=None,
moe_ffn1_weight_scale_keys=None,
moe_ffn2_weight_scale_keys=None,
moe_ffn1_in_scale_keys=None,
moe_ffn2_in_scale_keys=None,
moe_tag: str = "",
weight_key_map: dict = {},
):
"""
Initialize the Moe layer with given parameters.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
"""
super().__init__()
self.llm_config = llm_config
self.fd_config = fd_config
self.layer_idx = layer_idx
self.tp_size = llm_config.parallel_config.mp_size
self.ep_size = llm_config.parallel_config.ep_size
self.moe_use_gate_correction_bias = moe_use_gate_correction_bias
self.tp_size = fd_config.parallel_config.tensor_parallel_degree
self.ep_size = fd_config.parallel_config.expert_parallel_degree
self.ep_rank = fd_config.parallel_config.expert_parallel_rank
assert (self.tp_size >= 1 and self.ep_size == 1) or \
(self.tp_size == 1 and self.ep_size > 1), \
'MoE only support parallelism on TP or EP dimension.'
self.hidden_size = fd_config.model_config.hidden_size
self.moe_config = fd_config.moe_config
self.hidden_size = llm_config.model_config.hidden_size
self.moe_config = llm_config.moe_config
self.use_offline_quant = llm_config.tmp_config.use_offline_quant
moe_tag = self.llm_config.moe_config.moe_tag
logger.info(f"{moe_tag}MoE is running in {moe_quant_type} mode")
self.moe_quant_type = moe_quant_type
self.num_experts = num_experts
self.num_local_experts = self.num_experts // self.ep_size
logger.info(f'''MoE config is num_experts:{num_experts},
top_k:{top_k},
hidden_size:{self.hidden_size},
moe_intermediate_size:{moe_intermediate_size}''')
logger.info(
f"MoE is running on moe_quant_type: {self.moe_quant_type}, ep:{self.ep_size}, tp:{self.tp_size} mode"
)
self.moe_intermediate_size = moe_intermediate_size // self.tp_size
self.gate_weight_key = gate_weight_key
self.gate_correction_bias_key = gate_correction_bias_key
self.top_k = top_k
self.hidden_size = self.hidden_size
self.moe_intermediate_size = moe_intermediate_size // self.tp_size
self.weight_key_map = weight_key_map
self.ffn1_expert_weight_key = ffn1_expert_weight_key
self.ffn2_expert_weight_key = ffn2_expert_weight_key
self.ffn1_bias_key = moe_ffn1_bias_keys
self.ffn2_bias_key = moe_ffn2_bias_keys
self.use_method = envs.FD_MOE_BACKEND.lower()
self.gate_correction_bias = None
self.moe_tag = moe_tag
if self.moe_quant_type == "w4a8":
# below keys are only used in MoE W4A8!
self.ffn1_expert_weight_scale_key = moe_ffn1_weight_scale_keys
self.ffn2_expert_weight_scale_key = moe_ffn2_weight_scale_keys
self.ffn1_expert_in_scale_key = moe_ffn1_in_scale_keys
self.ffn2_expert_in_scale_key = moe_ffn2_in_scale_keys
if self.ep_size > 1:
expert_id_offset = expert_id_offset + self.ep_rank * self.num_local_experts
self.compute_method = CutlassFusedMoeMethod()
self.expert_id_offset = expert_id_offset
self.moe_compute_params = MoEComputeParams()
self.moe_compute_params.global_num_experts = self.num_experts
self.moe_compute_params.top_k = top_k
self.moe_compute_params.hidden_size = self.hidden_size
self.moe_compute_params.num_local_experts = self.num_local_experts
self.moe_compute_params.moe_quant_type = self.moe_quant_type
self.moe_compute_params.moe_intermediate_size = self.moe_intermediate_size
self.moe_compute_params.ep_size = self.ep_size
self.moe_compute_params.tp_size = self.tp_size
if fd_config.quant_config:
self.quant_method = fd_config.quant_config.get_quant_method(self)
else:
# now, no quant method(w_fp16 a_fp16) can't get from quant_config, we will optimize it in future
from .fused_moe_cutlass_backend import CutlassMoEMethod
self.quant_method = CutlassMoEMethod(None)
def load_gate_state_dict(self, state_dict):
if self.ep_size > 1:
self.quant_method.init_ep(self)
logger.info(
f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset+self.num_local_experts}), \
{top_k=}, hidden_size={self.hidden_size}, {moe_intermediate_size=}, \
, ep_size={self.ep_size}, \
tp_size={self.tp_size}.")
def load_experts_weight(self, state_dict: dict,
ffn1_expert_weight_key: str,
ffn2_expert_weight_key: str):
"""
load_gate_state_dict function.
Load experts weight from state_dict.
Args:
state_dict (dict): The state_dict of model.
ffn1_expert_weight_key (str): The key of ffn1 expert weight.
ffn2_expert_weight_key (str): The key of ffn2 expert weight.
"""
up_gate_proj_weight = []
up_gate_proj_weight_scale = []
down_proj_weight = []
down_proj_weight_scale = []
for j in range(self.num_experts):
up_gate_proj_weight.append(
get_tensor(
state_dict.pop(self.ffn1_expert_weight_key.format(j))))
down_proj_weight.append(
get_tensor(
state_dict.pop(self.ffn2_expert_weight_key.format(j))))
return up_gate_proj_weight, down_proj_weight
ffn1_weights = []
ffn2_weights = []
is_ffn_merged = ffn1_expert_weight_key.format(
self.expert_id_offset) in state_dict
if is_ffn_merged:
for i in range(self.num_local_experts):
expert_idx = self.expert_id_offset + i
ffn1_weights.append(
get_tensor(
state_dict.pop(
ffn1_expert_weight_key.format(expert_idx))))
ffn2_weights.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_key.format(expert_idx))))
else:
gate_expert_weight_key = ffn1_expert_weight_key.replace(
"up_gate_proj", "gate_proj")
up_expert_weight_key = ffn1_expert_weight_key.replace(
"up_gate_proj", "up_proj")
for j in range(self.num_local_experts):
expert_idx = self.expert_id_offset + j
gate = get_tensor(
state_dict.pop(gate_expert_weight_key.format(expert_idx)))
up = get_tensor(
state_dict.pop(up_expert_weight_key.format(expert_idx)))
ffn1_weights.append(paddle.concat([gate, up], axis=-1))
ffn2_weights.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_key.format(expert_idx))))
return ffn1_weights, ffn2_weights
def load_state_dict(self, state_dict, is_update: bool = False):
def extract_moe_ffn_weights(self, state_dict: dict):
"""
Extract MoE FFN weights from state dict based on weight key mapping.
Args:
state_dict (dict): Model state dictionary containing the weights.
Returns:
tuple: A tuple containing two lists:
- ffn1_weights: List of tensors for first FFN layer weights
- ffn2_weights: List of tensors for second FFN layer weights
Raises:
AssertionError: If required weight keys are missing or number of weights
doesn't match number of local experts.
"""
ffn1_expert_weight_key = self.weight_key_map.get(
"ffn1_expert_weight_key", None)
ffn2_expert_weight_key = self.weight_key_map.get(
"ffn2_expert_weight_key", None)
assert ffn1_expert_weight_key is not None, "ffn1_expert_weight_key should not be none."
assert ffn2_expert_weight_key is not None, "ffn2_expert_weight_key should not be none."
ffn1_weights, ffn2_weights = self.load_experts_weight(
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
assert len(
ffn1_weights
) == self.num_local_experts, "ffn1_weights length should be equal to num_local_experts."
assert len(
ffn2_weights
) == self.num_local_experts, "ffn2_weights length should be equal to num_local_experts."
return ffn1_weights, ffn2_weights
def extract_gate_correction_bias(self, gate_correction_bias_key,
state_dict):
"""
extract_gate_correction_bias function.
"""
gate_correction_bias_tensor = get_tensor(
state_dict.pop(gate_correction_bias_key)).astype("float32")
return gate_correction_bias_tensor
def load_state_dict(self, state_dict):
"""
load_state_dict function.
"""
# gate
if not is_update:
gate_weight_tensor = get_tensor(state_dict.pop(self.gate_weight_key))
self.gate_weight = self.create_parameter(
shape=gate_weight_tensor.shape,
dtype="float32",
)
self.gate_weight.set_value(gate_weight_tensor)
# gate_correction_bias
self.gate_correction_bias_key = self.weight_key_map.get(
"gate_correction_bias_key", None)
if self.gate_correction_bias_key is not None and self.gate_correction_bias_key in state_dict:
self.moe_use_gate_correction_bias = True
else:
self.moe_use_gate_correction_bias = False
if self.moe_use_gate_correction_bias:
gate_correction_bias_tensor = get_tensor(
state_dict.pop(self.gate_correction_bias_key))
gate_correction_bias_tensor = self.extract_gate_correction_bias(
self.gate_correction_bias_key, state_dict)
self.gate_correction_bias = self.create_parameter(
shape=gate_correction_bias_tensor.shape,
dtype="float32",
)
self.gate_correction_bias.set_value(gate_correction_bias_tensor)
gate_weight_key = self.weight_key_map.get("gate_weight_key", None)
assert gate_weight_key is not None, "gate_weight_key should not be None, please check model checkpoints"
gate_weight_tensor = get_tensor(state_dict.pop(gate_weight_key))
self.gate_weight = self.create_parameter(
shape=gate_weight_tensor.shape,
dtype="float32",
)
self.gate_weight.set_value(gate_weight_tensor.astype("float32"))
if self.fd_config.model_config.is_quantized:
self.quant_method.process_prequanted_weights(self, state_dict)
else:
self.gate_correction_bias = None
self.quant_method.create_weights(self, state_dict)
up_gate_proj_weight, down_proj_weight = self.load_gate_state_dict(
state_dict)
weight1_scale = None
weight2_scale = None
ffn1_in_scale = None
ffn2_in_scale = None
if self.moe_quant_type == "w4a8":
weight1_scale = []
weight2_scale = []
ffn1_in_scale = []
ffn2_in_scale = []
for j in range(self.num_experts):
weight1_scale.append(
get_tensor(
state_dict.pop(
self.ffn1_expert_weight_scale_key.format(
self.layer_idx, j))))
weight2_scale.append(
get_tensor(
state_dict.pop(
self.ffn2_expert_weight_scale_key.format(
self.layer_idx, j))))
ffn1_in_scale.append(
get_tensor(
state_dict.pop(
self.ffn1_expert_in_scale_key.format(
self.layer_idx, j))))
ffn2_in_scale.append(
get_tensor(
state_dict.pop(
self.ffn2_expert_in_scale_key.format(
self.layer_idx, j))))
# other weight is with compute_method
# different method may have different way to create weights
self.compute_method.create_weights(self, self.moe_compute_params,
up_gate_proj_weight,
down_proj_weight, None, None,
weight1_scale, weight2_scale,
ffn1_in_scale, ffn2_in_scale)
def forward(self, x, **kwargs):
def forward(self, x: paddle.Tensor):
"""
Defines the forward computation of the moe layer.
@@ -225,13 +225,9 @@ class FusedMoE(nn.Layer):
x (Tensor): Input tensor to the moe layer.
Returns:
Tensor: Output tensor.
Tensor: Output tensor.s
"""
out = self.compute_method.apply(self, self.moe_compute_params, x)
if self.tp_size > 1:
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
tensor_model_parallel_all_reduce(out)
gate_out = paddle.matmul(x.cast("float32"), self.gate_weight)
out = self.quant_method.apply(self, x, gate_out)
return out
-126
View File
@@ -1,126 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import paddle
import fastdeploy
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.model_executor.layers.moe.moe import MoELayer
class MoeTPDecoerDeepDeepGEMMLayer(MoELayer):
"""
MoeTPDecoerDeepDeepGEMMLayer
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, x, **kwargs):
"""
forward
"""
gate_out = paddle.matmul(x.cast("float32"), self.gate_weight)
if os.getenv("EP_DECODER_PERF_TEST", "False") == "True":
gate_out = paddle.rand(shape=gate_out.shape, dtype=gate_out.dtype)
ffn1_out = paddle.empty(
[
self.num_local_experts,
self.max_batch_size,
self.moe_intermediate_size * 2,
],
dtype=self._dtype,
)
ffn_out = paddle.empty(
[
self.num_local_experts,
self.max_batch_size,
self.embed_dim,
],
dtype=self._dtype,
)
topk_idx, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
gate_out,
(
self.gate_correction_bias
if self.moe_config.moe_use_gate_correction_bias
else None
),
self.top_k,
True, # apply_norm_weight
False,
)
permute_input, token_nums_per_expert, permute_indices_per_token = (
fastdeploy.model_executor.ops.gpu.moe_deepgemm_permute(
x, topk_idx, self.num_local_experts, self.max_batch_size
)
)
expected_m = 128
permute_input_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
permute_input, token_nums_per_expert, 128
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
(permute_input_fp8, scale),
(
self.moe_ffn1_weight,
self.moe_ffn1_weight_scale,
),
ffn1_out,
token_nums_per_expert,
expected_m,
)
act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked(
ffn1_out, token_nums_per_expert
)
act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
act_out, token_nums_per_expert, 128
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
(act_out_fp8, scale),
(
self.moe_ffn2_weight,
self.moe_ffn2_weight_scale,
),
ffn_out,
token_nums_per_expert,
expected_m,
)
fused_moe_out = fastdeploy.model_executor.ops.gpu.moe_deepgemm_depermute(
ffn_out, permute_indices_per_token, topk_idx, topk_weights
)[0]
return fused_moe_out
class MoeTPPrefillDeepDeepGEMMLayer(MoELayer):
"""
MoeTPPrefillDeepDeepGEMMLayer
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, x, **kwargs):
"""
forward
"""
raise NotImplementedError("Prefill is comming soon...")
@@ -0,0 +1,198 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import triton
import triton.language as tl
@triton.jit
def fused_moe_kernel_paddle(
    a_ptr,
    b_ptr,
    c_ptr,
    a_scale_ptr,
    b_scale_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N,
    K,
    num_tokens_post_padded,
    num_valid_tokens,
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_asm,
    stride_ask,
    stride_bse,
    stride_bsk,
    stride_bsn,
    # Block size for block-wise fp8 quantization
    group_n: tl.constexpr,
    group_k: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type_enum: tl.constexpr,
    use_fp8_w8a8: tl.constexpr,
    use_int8_w8a16: tl.constexpr,
    even_Ks: tl.constexpr,
):
    """
    Fused MoE expert GEMM kernel (one program computes one
    BLOCK_SIZE_M x BLOCK_SIZE_N output tile).

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output cache tensor with shape (M, topk, N), where M is the
        total number of tokens post padding, topk is the number of times
        each token is repeated, and N is the output feature dimension.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A.

    Quantization modes (mutually exclusive constexpr flags):
    - use_int8_w8a16: int8 weights with a per-(expert, N-column) scale.
    - use_fp8_w8a8: fp8 activations and weights; group-wise scales when
        group_k > 0 and group_n > 0, otherwise one scale per expert.

    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.
    """
    # Map the 1-D program id onto a (pid_m, pid_n) tile. Tiles are grouped
    # GROUP_SIZE_M-deep along M so consecutive programs reuse the same B
    # columns (better L2 behavior).
    pid = tl.program_id(axis=0)
    # NOTE: this uses the scalar `num_tokens_post_padded` argument (the
    # host-side upper bound); the actual runtime count is loaded from
    # `num_tokens_post_padded_ptr` below for the early-exit check.
    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    # Only bfloat16 output is supported (enum value 1).
    assert compute_type_enum == 1
    compute_type = tl.bfloat16
    # Rebind to the runtime value and skip tiles past the padded token count.
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
    if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
        return
    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
    # Padding entries have token ids >= num_valid_tokens; they are masked
    # out of every load/store below.
    token_mask = offs_token < num_valid_tokens
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # A is indexed by the original token row (`offs_token // top_k` undoes
    # the topk repetition).
    a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
                      offs_k[None, :] * stride_ak)
    # One expert id per M-tile selects the weight matrix within B.
    off_experts = tl.load(expert_ids_ptr + pid_m)
    b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
                                                offs_bn[None, :] * stride_bn)
    if use_int8_w8a16:
        # Per-(expert, output-column) weight scale, loaded once.
        b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[
            None, :] * stride_bsn
        b_scale = tl.load(b_scale_ptrs)
    if use_fp8_w8a8:
        if group_k > 0 and group_n > 0:
            # Group-wise scales: per-token A scales along K groups, and
            # per-(K-group, N-group) B scales; loaded inside the K loop.
            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
            offs_bsn = offs_bn // group_n
            b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
        else:
            # (Zkk): every expert has one activation scale and weight scale.
            a_scale = tl.load(a_scale_ptr + off_experts)
            b_scale = tl.load(b_scale_ptr + off_experts)
    # Accumulate in fp32 over K in BLOCK_SIZE_K steps.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        if even_Ks:
            # K divisible by BLOCK_SIZE_K: no K-boundary mask needed.
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None],
                other=0.0,
            )
            # Streaming-style B load: ".cv"/evict_first hint that weights
            # are not reused by this program after the tile is consumed.
            b = tl.load(b_ptrs,
                        cache_modifier=".cv",
                        eviction_policy='evict_first')
        else:
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None] &
                (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                other=0.0,
            )
            b = tl.load(b_ptrs,
                        mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
                        other=0.0)
        # We accumulate along the K dimension.
        if use_int8_w8a16:
            accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
        elif use_fp8_w8a8:
            if group_k > 0 and group_n > 0:
                # Dequantize this K-group's partial product immediately.
                k_start = k * BLOCK_SIZE_K
                offs_ks = k_start // group_k
                a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
                                  mask=token_mask,
                                  other=0.0)
                b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
                accumulator += tl.dot(a, b) * a_scale[:,
                                                      None] * b_scale[None, :]
            else:
                # Per-expert scales are applied once after the loop.
                accumulator = tl.dot(a, b, acc=accumulator)
        else:
            accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk
    if MUL_ROUTED_WEIGHT:
        # Scale each row by its routing (top-k gate) weight.
        moe_weight = tl.load(topk_weights_ptr + offs_token,
                             mask=token_mask,
                             other=0)
        accumulator = accumulator * moe_weight[:, None]
    # Final dequantization / cast to the output dtype.
    if use_int8_w8a16:
        accumulator = (accumulator * b_scale).to(compute_type)
    elif use_fp8_w8a8:
        if group_k > 0 and group_n > 0:
            # Already dequantized group-by-group inside the loop.
            accumulator = accumulator.to(compute_type)
        else:
            accumulator = (accumulator * a_scale * b_scale).to(compute_type)
    else:
        accumulator = accumulator.to(compute_type)
    # Write back the block of the output
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
        None, :]
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)