Sync v2.0 version of code to github repo

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions
@@ -11,3 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .fused_moe_cutlass_backend import (CutlassW4A8MoEMethod,
CutlassWeightOnlyMoEMethod)
from .fused_moe_triton_backend import TritonWeightOnlyMoEMethod
from .moe import FusedMoE
# Public API of this package. Entries must be *strings*: with bare class
# objects here, `from <package> import *` raises TypeError when the import
# machinery iterates __all__.
__all__ = [
    "CutlassWeightOnlyMoEMethod",
    "CutlassW4A8MoEMethod",
    "FusedMoE",
    "TritonWeightOnlyMoEMethod",
]
@@ -1,222 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from paddle.distributed import fleet
from paddle.framework import in_dynamic_or_pir_mode
from paddle.nn.quant import weight_quantize
from fastdeploy.model_executor.ops.gpu import (moe_expert_dispatch,
moe_expert_ffn,
moe_expert_reduce)
from .fused_moe_method_base import FusedMoEMethodBase
class CutlassFusedMoeMethod(FusedMoEMethodBase):
    """
    Use Cutlass Group Gemm to compute Fused MoE.

    This method is the oldest way to compute MoE in Paddle. It supports
    plain (non-quantized) experts as well as weight-only int4/int8 and
    w4a8 quantization.
    """

    def create_weights(
            self,
            layer: nn.Layer,
            moe_compute_params,
            ffn1_tensor,
            ffn2_tensor,
            ffn1_bias=None,
            ffn2_bias=None,
            # The scales below are only used in w4a8 mode.
            moe_ffn1_weight_scale=None,
            moe_ffn2_weight_scale=None,
            moe_ffn1_in_scale=None,
            moe_ffn2_in_scale=None):
        """
        Quantize (if needed) and register per-expert FFN weights on ``layer``.

        Args:
            layer: Layer that receives the created parameters.
            moe_compute_params: Provides num_local_experts, hidden_size,
                moe_intermediate_size and moe_quant_type.
            ffn1_tensor / ffn2_tensor: One weight tensor per local expert.
            ffn1_bias / ffn2_bias: Unused here; kept for interface parity.
            moe_ffn*_weight_scale / moe_ffn*_in_scale: Per-expert scale
                tensors, required only when moe_quant_type == "w4a8".
        """
        num_local_experts = moe_compute_params.num_local_experts
        moe_quant_type = moe_compute_params.moe_quant_type

        # Exactly one weight per local expert, with the expected shapes.
        assert len(ffn1_tensor) == num_local_experts
        assert len(ffn2_tensor) == num_local_experts
        assert ffn1_tensor[0].shape == [
            moe_compute_params.hidden_size,
            moe_compute_params.moe_intermediate_size * 2
        ]
        assert ffn2_tensor[0].shape == [
            moe_compute_params.moe_intermediate_size,
            moe_compute_params.hidden_size
        ]

        added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        added_scale_attrs = ["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]

        if moe_quant_type == "w4a8":
            # Activation scales are stored inverted (1 / scale).
            moe_ffn1_in_scale = paddle.concat(moe_ffn1_in_scale)
            moe_ffn2_in_scale = paddle.concat(moe_ffn2_in_scale)
            moe_ffn1_in_scale = 1 / moe_ffn1_in_scale
            moe_ffn2_in_scale = 1 / moe_ffn2_in_scale
            moe_ffn1_weight_scale = paddle.stack(moe_ffn1_weight_scale, axis=0)
            moe_ffn2_weight_scale = paddle.stack(moe_ffn2_weight_scale, axis=0)
            # NOTE(review): 127 * 112 looks like the int8 max value combined
            # with a kernel-specific constant -- confirm against the w4a8
            # cutlass kernel before changing.
            moe_ffn1_weight_scale = moe_ffn1_weight_scale / (127 * 112)
            moe_ffn2_weight_scale = moe_ffn2_weight_scale / (127 * 112)
            moe_ffn1_weight_scale = moe_ffn1_weight_scale / moe_ffn1_in_scale[:,
                                                                              None]
            moe_ffn2_weight_scale = moe_ffn2_weight_scale / moe_ffn2_in_scale[:,
                                                                              None]
            moe_ffn1_weight_scale = moe_ffn1_weight_scale.cast(
                paddle.get_default_dtype())
            moe_ffn2_weight_scale = moe_ffn2_weight_scale.cast(
                paddle.get_default_dtype())

        if moe_quant_type in ["weight_only_int4", "weight_only_int8", "w4a8"]:
            for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
                weight_name = added_weight_attrs[idx]
                scale_name = added_scale_attrs[idx]
                weight_list = []
                weight_scale_list = []
                for i in range(num_local_experts):
                    quant_weight, scale = weight_quantize(weight_tensor[i],
                                                          algo=moe_quant_type,
                                                          arch=80)
                    weight_list.append(quant_weight)
                    if moe_quant_type != "w4a8":
                        # scale holds no memory in w4a8, don't touch it!
                        weight_scale_list.append(scale)
                quanted_weight = paddle.stack(weight_list, axis=0)
                setattr(
                    layer, weight_name,
                    layer.create_parameter(
                        shape=quanted_weight.shape,
                        dtype=quanted_weight.dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ))
                getattr(layer, weight_name).set_value(quanted_weight)

                # This scale is only used for weight-only int8/int4.
                if moe_quant_type != "w4a8":
                    quanted_weight_scale = paddle.stack(weight_scale_list,
                                                        axis=0)
                    setattr(
                        layer, scale_name,
                        layer.create_parameter(
                            shape=quanted_weight_scale.shape,
                            dtype=quanted_weight_scale.dtype,
                        ))
                    getattr(layer, scale_name).set_value(quanted_weight_scale)

        if moe_quant_type == "w4a8":
            # w4a8 additionally registers the precomputed scale parameters.
            assert moe_ffn1_weight_scale is not None
            assert moe_ffn2_weight_scale is not None
            assert moe_ffn1_in_scale is not None
            assert moe_ffn2_in_scale is not None
            added_w4a8_attrs = [
                "moe_ffn1_weight_scale", "moe_ffn2_weight_scale",
                "moe_ffn1_in_scale", "moe_ffn2_in_scale"
            ]
            for idx, weight_tensor in enumerate([
                    moe_ffn1_weight_scale, moe_ffn2_weight_scale,
                    moe_ffn1_in_scale, moe_ffn2_in_scale
            ]):
                name = added_w4a8_attrs[idx]
                setattr(
                    layer, name,
                    layer.create_parameter(
                        shape=weight_tensor.shape,
                        dtype=weight_tensor.dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ))
                getattr(layer, name).set_value(weight_tensor)

    def apply(
        self,
        layer: nn.Layer,
        moe_compute_params,
        x: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Compute the fused MoE forward pass with cutlass group GEMM:
        gate -> dispatch -> expert FFN -> weighted reduce.
        """
        # Routing logits are computed in fp32 for numerical stability.
        gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
        (
            permute_input,
            token_nums_per_expert,
            permute_indices_per_token,
            topk_weights,
            topk_idx,
            expert_idx_per_token,
        ) = moe_expert_dispatch(
            x,
            gate_out,
            layer.gate_correction_bias,
            (layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
             else None),  # if set, permute_input will be int8_t
            moe_compute_params.top_k,
            False,
            topk_only_mode=False,
        )
        if moe_compute_params.moe_quant_type != "w4a8":
            # Only w4a8 needs expert_idx_per_token; other modes pass None.
            expert_idx_per_token = None
        else:
            expert_idx_per_token = expert_idx_per_token.cast("int64")
        ffn_out = moe_expert_ffn(
            permute_input,
            token_nums_per_expert,
            layer.moe_ffn1_weight,
            layer.moe_ffn2_weight,
            None,
            (layer.moe_ffn1_weight_scale
             if hasattr(layer, "moe_ffn1_weight_scale") else None),
            (layer.moe_ffn2_weight_scale
             if hasattr(layer, "moe_ffn2_weight_scale") else None),
            (layer.moe_ffn2_in_scale
             if hasattr(layer, "moe_ffn2_in_scale") else None),
            expert_idx_per_token,
            moe_compute_params.moe_quant_type,
            False,  # used_in_ep_low_latency
        )
        # NOTE: a dead `if False:` tensor-parallel all-reduce branch was
        # removed here; its else-path read `mp_group` before assignment and
        # the branch could never execute.
        # moe_expert_reduce normalizes the top-k weights and applies the
        # routed_scaling_factor.
        fused_moe_out = moe_expert_reduce(
            ffn_out,
            topk_weights,
            permute_indices_per_token,
            topk_idx,
            None,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )
        return fused_moe_out
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,135 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import abstractmethod
import paddle
from paddle import nn
from fastdeploy.config import MoEPhase
from ..quantization.quant_base import QuantMethodBase
class MoEMethodBase(QuantMethodBase):
    """
    Common base for fused-MoE compute backends.

    Concrete subclasses implement weight creation plus the tensor-parallel
    (TP) and expert-parallel (EP) forward paths; :meth:`apply` dispatches to
    the right one based on the layer's parallel configuration.
    """

    def __init__(self, quant_config):
        super().__init__()
        # Always keep a reference to the quantization config. Previously the
        # attribute was only set when quant_config was not None, so any later
        # access of `self.quant_config` with a None config raised
        # AttributeError instead of seeing None.
        self.quant_config = quant_config
        if quant_config is None:
            # No quantization: plain 16-bit weights and activations.
            self.moe_quant_type = "w16a16"
        self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        self.added_scale_attrs = [
            "moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
        ]
        # Number of logical values packed per stored element
        # (e.g. 2 for int4 weights stored two-per-byte).
        self.pack_num = 1

    def init_ep(self, layer: nn.Layer) -> None:
        """
        Initialize the expert-parallel (EP) runner for this layer.

        Creates a decoder or prefill runner depending on the layer's MoE
        phase; a no-op when EP is not enabled (ep_size <= 1).
        """
        if layer.ep_size > 1:
            if layer.fd_config.parallel_config.moe_phase == MoEPhase.DECODER:
                from .ep import EPDecoderRunner
                self.ep_decoder_runner = EPDecoderRunner(
                    layer.top_k, layer.hidden_size, layer.num_experts,
                    layer.moe_config.num_max_dispatch_tokens_per_rank,
                    layer.ep_size, layer.ep_rank)
            else:
                from .ep import EPPrefillRunner
                self.ep_prefill_runner = EPPrefillRunner(
                    layer.top_k, layer.hidden_size, layer.num_experts,
                    layer.ep_size, layer.ep_rank)

    def process_loaded_weights(self, layer, weights) -> None:
        """
        Optional hook to post-process weights after loading. Default: no-op.
        """
        pass

    def check(self, layer: nn.Layer, ffn1_weights, ffn2_weights):
        """
        Validate that the first expert's weight shapes match the layer's
        dimensions. ``pack_num`` accounts for sub-byte weight packing.
        """
        assert ffn1_weights[0].shape == [
            layer.hidden_size // self.pack_num, layer.moe_intermediate_size * 2
        ]
        assert ffn2_weights[0].shape == [
            layer.moe_intermediate_size // self.pack_num, layer.hidden_size
        ]

    @abstractmethod
    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Create and register weight parameters on ``layer`` from ``state_dict``.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP prefill method.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP decoder method.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the tensor-parallel fused MoE method.
        """
        raise NotImplementedError

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Dispatch to the EP prefill/decode or TP implementation based on the
        layer's parallel configuration.
        """
        if layer.ep_size > 1:
            if layer.fd_config.parallel_config.moe_phase == MoEPhase.PREFILL:
                return self.apply_ep_prefill(layer, x, gate_out)
            else:
                return self.apply_ep_decode(layer, x, gate_out)
        else:
            return self.apply_tp(layer, x, gate_out)
@@ -0,0 +1,431 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from paddle.nn.quant import weight_quantize
from paddleformers.utils.log import logger
import fastdeploy
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from ..utils import get_tensor, create_and_set_parameter
from .fused_moe_backend_base import MoEMethodBase
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
class CutlassMoEMethod(MoEMethodBase):
    """
    Use Cutlass Group Gemm to compute Fused MoE.
    This method is the oldest way to compute MoE in Paddle.
    """

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Register non-quantized (bf16) expert weights on ``layer``:
        stack the per-expert tensors and store them as parameters.
        """
        # bf16
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        stacked_ffn1_weights = paddle.stack(ffn1_weights, axis=0)
        stacked_ffn2_weights = paddle.stack(ffn2_weights, axis=0)
        for idx, weight_tensor in enumerate(
                [stacked_ffn1_weights, stacked_ffn2_weights]):
            weight_name = self.added_weight_attrs[idx]
            setattr(
                layer, weight_name,
                layer.create_parameter(
                    shape=weight_tensor.shape,
                    dtype=weight_tensor.dtype,
                    default_initializer=paddle.nn.initializer.Constant(0),
                ))
            getattr(layer, weight_name).set_value(weight_tensor)

    def compute_ffn(
        self,
        layer: nn.Layer,
        permute_input: paddle.Tensor,
        token_nums_per_expert: paddle.Tensor,
        expert_idx_per_token: paddle.Tensor,
        used_in_ep_low_latency: bool = False,
    ):
        """
        Run the grouped expert FFN (cutlass group GEMM) on already-dispatched
        tokens. Scale tensors are passed only if present on ``layer``
        (i.e. only for quantized modes).
        """
        return fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
            permute_input,
            token_nums_per_expert,
            layer.moe_ffn1_weight,
            layer.moe_ffn2_weight,
            None,
            (layer.moe_ffn1_weight_scale
             if hasattr(layer, "moe_ffn1_weight_scale") else None),
            (layer.moe_ffn2_weight_scale
             if hasattr(layer, "moe_ffn2_weight_scale") else None),
            (layer.moe_ffn2_in_scale
             if hasattr(layer, "moe_ffn2_in_scale") else None),
            expert_idx_per_token,
            self.moe_quant_type,
            used_in_ep_low_latency,
        )

    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP prefill method:
        select -> all-to-all dispatch -> local expert FFN -> combine.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_prefill_runner.moe_select(
            layer, gate_out)
        # 2. EP Dispatch
        (
            recv_x,
            recv_topk_idx,
            recv_topk_weights,
            recv_num_tokens_per_expert_list,
            handle,
            _,
        ) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights)
        token_all_num = sum(recv_num_tokens_per_expert_list)

        # 3. Compute ffn (skipped when this rank received no tokens)
        if token_all_num > 0:
            logger.info(f"token_all_num {token_all_num}")
            (
                permute_input,
                permute_indices_per_token,
                recv_num_tokens_per_expert_list_cumsum,
                dst_weights,
                dst_indices,
                cumsum_idx_gpu,
                expert_idx_per_token,
            ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch(
                recv_x,
                recv_topk_idx,
                recv_topk_weights,
                # NOTE(review): elsewhere (apply_tp) the in-scale lives on
                # `layer`, not `self` -- this hasattr(self, ...) check may
                # never find it; confirm where the attribute is created.
                (self.moe_ffn1_in_scale
                 if hasattr(self, "moe_ffn1_in_scale") else None),
                recv_num_tokens_per_expert_list,
                token_all_num,
                self.moe_quant_type,
            )

            if self.moe_quant_type != "w4a8":
                # only w4a8 need expert_idx_per_token
                # Other need not this tensor, so we make it None.
                expert_idx_per_token = None
            else:
                expert_idx_per_token = expert_idx_per_token.cast("int64")

            ffn_out = self.compute_ffn(layer, permute_input,
                                       recv_num_tokens_per_expert_list_cumsum,
                                       expert_idx_per_token)

            # Permute expert outputs back to per-rank token order.
            tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
                ffn_out,
                dst_weights,
                permute_indices_per_token,
                dst_indices,
                None,  # moe_ffn2_bias,
                False,  # norm_topk_prob
                1.0,
            )[0]
        else:
            tmp_ffn_out = recv_x

        # 4. EP combine
        return self.ep_prefill_runner.combine(tmp_ffn_out, handle,
                                              recv_topk_weights)

    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP decoder (low-latency) method. Unsupported quantization
        modes raise NotImplementedError.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_decoder_runner.moe_select(
            layer, gate_out)
        # 2. EP Dispatch
        permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
            x, topk_idx, topk_weights)

        # 3. Compute ffn
        if self.moe_quant_type == "w4a8":
            # Dispatch output here is dense per-expert; every slot in expert
            # e gets expert index e.
            num_local_experts, max_num, _ = permute_input.shape
            expert_idx_per_token = paddle.arange(
                num_local_experts)[:, None].tile([1, max_num])
        elif self.moe_quant_type in ["weight_only_int8", "weight_only_int4"]:
            expert_idx_per_token = None
        else:
            raise NotImplementedError

        ffn_out = self.compute_ffn(layer, permute_input,
                                   token_nums_per_expert.cast("int64"),
                                   expert_idx_per_token, True)

        # 4. EP combine
        return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights,
                                              handle)

    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Tensor-parallel fused MoE:
        dispatch -> grouped FFN -> weighted reduce -> (optional) all-reduce.
        """
        (
            permute_input,
            token_nums_per_expert,
            permute_indices_per_token,
            topk_weights,
            topk_idx,
            expert_idx_per_token,
        ) = moe_expert_dispatch(
            x,
            gate_out,
            layer.gate_correction_bias,
            (layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
             else None),  # if set, permute_input will be int8_t
            layer.top_k,
            False,
            topk_only_mode=False,
        )

        if self.moe_quant_type != "w4a8":
            # only w4a8 need expert_idx_per_token
            # Other need not this tensor, so we make it None.
            expert_idx_per_token = None
        else:
            expert_idx_per_token = expert_idx_per_token.cast("int64")

        ffn_out = self.compute_ffn(layer, permute_input, token_nums_per_expert,
                                   expert_idx_per_token)

        # moe_expert_reduce normalizes the top-k weights and applies the
        # routed_scaling_factor.
        fused_moe_out = moe_expert_reduce(
            ffn_out,
            topk_weights,
            permute_indices_per_token,
            topk_idx,
            None,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )

        if layer.tp_size > 1:
            tensor_model_parallel_all_reduce(fused_moe_out)

        return fused_moe_out
class CutlassW4A8MoEMethod(CutlassMoEMethod):
    """
    w4a8 MoE Method: int4 weights with int8 activations, computed with
    the cutlass group GEMM backend.
    """

    def __init__(self, quant_config):
        super().__init__(quant_config)
        self.quant_config = quant_config
        self.moe_quant_type = "w4a8"
        # Two int4 values are packed per stored byte.
        self.pack_num = 2

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Quantize each expert's weights to int4 (cutlass layout, arch=80)
        and register them on ``layer``, then load/process the w4a8 scales.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        self.check(layer, ffn1_weights, ffn2_weights)
        for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
            weight_name = self.added_weight_attrs[idx]
            weight_list = []
            for i in range(layer.num_local_experts):
                # NOTE: the returned scale is intentionally discarded; w4a8
                # uses the scales loaded from the state dict instead.
                quant_weight, scale = weight_quantize(weight_tensor[i],
                                                      algo=self.moe_quant_type,
                                                      arch=80)
                weight_list.append(quant_weight)
            quanted_weight = paddle.stack(weight_list, axis=0)
            create_and_set_parameter(layer, weight_name, quanted_weight)

        self.create_w4a8_scale_weights(layer, layer.weight_key_map, state_dict)

    def create_w4a8_scale_weights(self, layer: nn.Layer, weight_key_map: dict,
                                  state_dict: dict):
        """
        Get w4a8 weights from state dict and process them.
        Args:
            layer (nn.Layer): The layer to add parameters to.
            weight_key_map (dict): The weight key map.
            state_dict (dict): The state dict.
        """

        def _extract_scale_tensor(state_dict, key_template, expert_idx):
            # Pop so the consumed entries are removed from the state dict.
            return get_tensor(state_dict.pop(key_template.format(expert_idx)))

        def _process_in_scale(name: str, in_scales: list[paddle.Tensor]):
            # Activation scales are stored inverted (1 / scale).
            processed_in_scale = 1 / paddle.concat(in_scales)
            create_and_set_parameter(layer, name, processed_in_scale)
            return processed_in_scale

        def _process_weight_scale(name: str,
                                  weight_scales: list[paddle.Tensor],
                                  processed_in_scale: paddle.Tensor):
            # NOTE(review): 127 * 112 looks like the int8 max value combined
            # with a kernel-specific constant -- confirm against the w4a8
            # cutlass kernel.
            processed_weight_scale = (paddle.stack(weight_scales, axis=0) /
                                      (127 * 112) /
                                      processed_in_scale[:, None]).cast(
                                          paddle.get_default_dtype())
            create_and_set_parameter(layer, name, processed_weight_scale)

        # 1. Init scale containers and maps
        moe_ffn1_weight_scales = []
        moe_ffn2_weight_scales = []
        moe_ffn1_in_scales = []
        moe_ffn2_in_scales = []

        scale_weight_map = {
            "moe_ffn1_weight_scale": moe_ffn1_weight_scales,
            "moe_ffn2_weight_scale": moe_ffn2_weight_scales,
            "moe_ffn1_in_scale": moe_ffn1_in_scales,
            "moe_ffn2_in_scale": moe_ffn2_in_scales,
        }
        scale_key_map = {
            "moe_ffn1_weight_scale":
            weight_key_map.get("ffn1_expert_weight_scale_key", None),
            "moe_ffn2_weight_scale":
            weight_key_map.get("ffn2_expert_weight_scale_key", None),
            "moe_ffn1_in_scale":
            weight_key_map.get("ffn1_expert_in_scale_key", None),
            "moe_ffn2_in_scale":
            weight_key_map.get("ffn2_expert_in_scale_key", None),
        }
        for name, value in scale_key_map.items():
            if value is None:
                raise ValueError(
                    f"scale {name} should not be none in w4a8 mode.")

        # 2. Extract scale tensor from state dict
        for local_expert_idx in range(layer.num_local_experts):
            # NOTE(review): this treats expert_id_offset as a rank index
            # (offset * num_local_experts + local). Other backends in this
            # file compute `expert_id_offset + i` -- one of the two is
            # likely wrong; confirm the semantics of expert_id_offset.
            expert_idx = local_expert_idx + layer.expert_id_offset * layer.num_local_experts
            for name, scale_key_template in scale_key_map.items():
                scale_tensor = _extract_scale_tensor(state_dict,
                                                     scale_key_template,
                                                     expert_idx)
                scale_weight_map[name].append(scale_tensor)

        # 3. Process scale tensor and set to layer
        in_scales = []
        for in_scale_name in ["moe_ffn1_in_scale", "moe_ffn2_in_scale"]:
            in_scales.append(
                _process_in_scale(in_scale_name,
                                  scale_weight_map[in_scale_name]))

        for i, weight_scale_name in enumerate(
            ["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]):
            _process_weight_scale(weight_scale_name,
                                  scale_weight_map[weight_scale_name],
                                  in_scales[i])
class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
    """
    Weight-only (int4/int8) quantized MoE computed with the cutlass
    group GEMM backend.
    """

    def __init__(self, quant_config):
        super().__init__(quant_config)
        self.quant_config = quant_config
        # The weight-only algorithm (e.g. weight_only_int8) comes from config.
        self.moe_quant_type = self.quant_config.algo
        self.pack_num = 1

    def process_prequanted_weights(self, layer: nn.Layer, state_dict):
        """
        Load expert weights and scales that were quantized offline and
        register them on ``layer``.
        """
        key_map = layer.weight_key_map
        ffn1_expert_weight_key = key_map.get("ffn1_expert_weight_key", None)
        ffn2_expert_weight_key = key_map.get("ffn2_expert_weight_key", None)
        ffn1_expert_weight_scale_key = key_map.get(
            "ffn1_expert_weight_scale_key", None)
        ffn2_expert_weight_scale_key = key_map.get(
            "ffn2_expert_weight_scale_key", None)

        ffn1_weights, ffn2_weights = layer.load_experts_weight(
            state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
        # Shape check intentionally skipped for prequantized layouts.

        ffn1_scales = []
        ffn2_scales = []
        for local_idx in range(layer.num_local_experts):
            global_idx = layer.expert_id_offset + local_idx
            ffn1_scales.append(
                get_tensor(
                    state_dict.pop(
                        ffn1_expert_weight_scale_key.format(global_idx))))
            ffn2_scales.append(
                get_tensor(
                    state_dict.pop(
                        ffn2_expert_weight_scale_key.format(global_idx))))

        name_tensor_map = {
            "moe_ffn1_weight": paddle.stack(ffn1_weights, axis=0),
            "moe_ffn2_weight": paddle.stack(ffn2_weights, axis=0),
            "moe_ffn1_weight_scale": paddle.stack(ffn1_scales, axis=0),
            "moe_ffn2_weight_scale": paddle.stack(ffn2_scales, axis=0)
        }
        for param_name, tensor in name_tensor_map.items():
            create_and_set_parameter(layer, param_name, tensor)

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Quantize bf16 expert weights online to the weight-only format and
        register the packed weights and their scales on ``layer``.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        self.check(layer, ffn1_weights, ffn2_weights)

        for expert_weights, weight_name, scale_name in zip(
            (ffn1_weights, ffn2_weights), self.added_weight_attrs,
                self.added_scale_attrs):
            packed = []
            scales = []
            for i in range(layer.num_local_experts):
                quant_weight, scale = weight_quantize(
                    expert_weights[i], algo=self.moe_quant_type)
                packed.append(quant_weight)
                scales.append(scale)

            create_and_set_parameter(layer, weight_name,
                                     paddle.stack(packed, axis=0))
            create_and_set_parameter(layer, scale_name,
                                     paddle.stack(scales, axis=0))
@@ -0,0 +1,380 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import numpy as np
import paddle
from paddle import nn
from paddleformers.utils.log import logger
import fastdeploy
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
from fastdeploy.model_executor.layers.utils import get_tensor
from ..utils import create_and_set_parameter
from .fused_moe_backend_base import MoEMethodBase
class DeepGemmFusedMoeMethod(MoEMethodBase):
    """
    DeepGemmFusedMoeMethod is a class that implements the MoEMethodBase
    interface for the DeepGemm (block-wise fp8) backend.
    """

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Quantize bf16 expert weights to block-wise fp8 and register the
        packed weights and their scales on ``layer``.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        self.check(layer, ffn1_weights, ffn2_weights)
        for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
            weight_name = self.added_weight_attrs[idx]
            scale_name = self.added_scale_attrs[idx]

            weight_list = []
            weight_scale_list = []
            for i in range(layer.num_local_experts):
                from fastdeploy.model_executor.layers.utils import \
                    per_block_cast_to_fp8
                # Block size for quantization comes from the quant config.
                quant_weight, scale = per_block_cast_to_fp8(
                    weight_tensor[i], self.quant_config.weight_block_size)

                weight_list.append(quant_weight)
                weight_scale_list.append(scale)
            quanted_weight = paddle.stack(weight_list, axis=0)
            # Transposed to the [experts, n, k] layout used by the
            # deep_gemm "nt" grouped GEMM kernels.
            quanted_weight = quanted_weight.transpose([0, 2, 1]).contiguous()
            create_and_set_parameter(layer, weight_name, quanted_weight)

            quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
            quanted_weight_scale = quanted_weight_scale.transpose(
                [0, 2, 1]).contiguous()
            create_and_set_parameter(layer, scale_name, quanted_weight_scale)

    def process_prequanted_weights(self, layer: nn.Layer, state_dict):
        """
        Load expert weights and scales that were fp8-quantized offline,
        reinterpret the weights as float8_e4m3fn, and register them.
        """
        ffn1_expert_weight_key = layer.weight_key_map.get(
            "ffn1_expert_weight_key", None)
        ffn2_expert_weight_key = layer.weight_key_map.get(
            "ffn2_expert_weight_key", None)
        ffn1_expert_weight_scale_key = layer.weight_key_map.get(
            "ffn1_expert_weight_scale_key", None)
        ffn2_expert_weight_scale_key = layer.weight_key_map.get(
            "ffn2_expert_weight_scale_key", None)

        ffn1_weights, ffn2_weights = layer.load_experts_weight(
            state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
        # self.check(layer, ffn1_weights, ffn2_weights)
        ffn1_weight_scale = []
        ffn2_weight_scale = []
        for i in range(layer.num_local_experts):
            expert_idx = layer.expert_id_offset + i
            ffn1_weight_scale.append(
                get_tensor(
                    state_dict.pop(
                        ffn1_expert_weight_scale_key.format(expert_idx))))
            ffn2_weight_scale.append(
                get_tensor(
                    state_dict.pop(
                        ffn2_expert_weight_scale_key.format(expert_idx))))

        # `view` reinterprets the raw bytes as fp8 without copying.
        ffn1_weight = paddle.stack(ffn1_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn")
        ffn2_weight = paddle.stack(ffn2_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn")
        ffn1_weight_scale = paddle.stack(ffn1_weight_scale, axis=0).transpose([0, 2, 1]).contiguous()
        ffn2_weight_scale = paddle.stack(ffn2_weight_scale, axis=0).transpose([0, 2, 1]).contiguous()

        name_tensor_map = {
            "moe_ffn1_weight": ffn1_weight,
            "moe_ffn2_weight": ffn2_weight,
            "moe_ffn1_weight_scale": ffn1_weight_scale,
            "moe_ffn2_weight_scale": ffn2_weight_scale
        }
        for name, tensor in name_tensor_map.items():
            create_and_set_parameter(layer, name, tensor)

    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP prefill method:
        select -> per-token fp8 quant -> dispatch -> grouped GEMMs -> combine.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_prefill_runner.moe_select(
            layer, gate_out)
        # 2. Dynamic compute blockwise quantization scales
        x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
            x, self.quant_config.weight_block_size[0])
        # 3. EP Dispatch
        (
            recv_x,
            recv_topk_idx,
            recv_topk_weights,
            recv_num_tokens_per_expert_list,
            handle,
            _,
        ) = self.ep_prefill_runner.dispatch(x,
                                            topk_idx,
                                            topk_weights,
                                            x_scale_tensor=x_scale_tensor)
        token_all_num = sum(recv_num_tokens_per_expert_list)

        # 4. Compute ffn (skipped when this rank received no tokens)
        if token_all_num > 0:
            logger.info(f"token_all_num {token_all_num}")
            (recv_x, recv_x_scale) = recv_x

            # tmp[0]/tmp[1]: per-expert token counts used by the dispatcher.
            tmp = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
            (
                permute_input,
                permute_scale,
                permute_indices_per_token,
                recv_num_tokens_per_expert_list_cumsum,
                recv_num_tokens_per_expert_list_padded_cumsum,
                dst_weights,
                dst_indices,
                cumsum_idx_gpu,
                m_indices,
            ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
                recv_x,
                recv_x_scale,
                recv_topk_idx,
                recv_topk_weights,
                tmp[0],
                tmp[1]
            )

            # NOTE(review): transpose -> contiguous -> transpose-back keeps
            # the logical shape but leaves the data column-major; presumably
            # the layout deep_gemm expects for scales -- confirm.
            permute_scale = permute_scale.transpose([1, 0]).contiguous()
            permute_scale = permute_scale.transpose([1, 0])

            # ffn1
            ffn_out = paddle.empty(
                (permute_input.shape[0], layer.moe_ffn1_weight.shape[1]),
                dtype=paddle.bfloat16,
            )
            deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
                (permute_input, permute_scale),
                (layer.moe_ffn1_weight, layer.moe_ffn1_weight_scale),
                ffn_out,
                m_indices,
            )

            # swiglu
            ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None)

            # ffn2: re-quantize the activation before the second grouped GEMM.
            ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
                ffn_out, self.quant_config.weight_block_size[0])
            ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose(
                [1, 0]).contiguous()
            ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
            ffn_out = paddle.empty(
                (ffn_out.shape[0], layer.moe_ffn2_weight.shape[1]),
                dtype=paddle.bfloat16)
            deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
                (ffn_in_x, ffn_in_x_scale_tensor),
                (layer.moe_ffn2_weight, layer.moe_ffn2_weight_scale),
                ffn_out,
                m_indices,
            )

            # prmt back per rank
            tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
                ffn_out,
                dst_weights,
                permute_indices_per_token,
                dst_indices,
                None,  # moe_ffn2_bias
                False,  # norm_topk_prob
                1.0,
            )[0]
        else:
            # No tokens: pass the (fp8) payload through as bf16.
            tmp_ffn_out = paddle.cast(recv_x[0], paddle.bfloat16)

        # 5. EP combine
        return self.ep_prefill_runner.combine(tmp_ffn_out, handle,
                                              recv_topk_weights)

    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP decoder (low-latency, masked grouped GEMM) method.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_decoder_runner.moe_select(
            layer, gate_out)
        # 2. EP Dispatch (fp8 payload: (data, scale) tuple)
        permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
            x, topk_idx, topk_weights, use_fp8=True)

        # 3. Compute ffn
        assert isinstance(permute_input, tuple)
        ffn1_out = paddle.empty(
            [
                layer.num_local_experts,
                layer.ep_size *
                layer.moe_config.num_max_dispatch_tokens_per_rank,
                layer.moe_intermediate_size * 2,
            ],
            dtype=paddle.bfloat16,
        )
        ffn_out = paddle.empty(
            [
                layer.num_local_experts,
                layer.ep_size *
                layer.moe_config.num_max_dispatch_tokens_per_rank,
                layer.hidden_size,
            ],
            dtype=paddle.bfloat16,
        )
        # NOTE(review): expected_m is a per-expert token-count hint for the
        # masked grouped GEMM scheduler; 128 appears to be a tuned default --
        # confirm against the deep_gemm API.
        expected_m = 128
        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
            permute_input,
            (
                layer.moe_ffn1_weight,
                layer.moe_ffn1_weight_scale,
            ),
            ffn1_out,
            token_nums_per_expert,
            expected_m,
        )
        act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked(
            ffn1_out, token_nums_per_expert)

        act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
            act_out, token_nums_per_expert,
            self.quant_config.weight_block_size[0])

        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
            (act_out_fp8, scale),
            (
                layer.moe_ffn2_weight,
                layer.moe_ffn2_weight_scale,
            ),
            ffn_out,
            token_nums_per_expert,
            expected_m,
        )
        # 4. EP combine
        return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights,
                                              handle)

    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Paddle Use DeepGemm compute Fused MoE.
        below is TP compute method.
        """
        topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
            gate_out,
            layer.gate_correction_bias,
            layer.top_k,
            True,  # apply_norm_weight
            False,
        )

        tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts)

        # Per-token fp8 quantization with block size 128.
        recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
            x, 128)
        (
            permute_input,
            permute_scale,
            permute_indices_per_token,
            recv_num_tokens_per_expert_list_cumsum,
            recv_num_tokens_per_expert_list_padded_cumsum,
            dst_weights,
            dst_indices,
            cumsum_idx_gpu,
            m_indices,
        ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
            recv_x,
            recv_x_scale,
            topk_ids,
            topk_weights,
            tmp[0],
            tmp[1],
        )

        # Same column-major scale-layout trick as in apply_ep_prefill.
        permute_scale = permute_scale.transpose([1, 0]).contiguous()
        permute_scale = permute_scale.transpose([1, 0])

        # ffn1
        ffn_out = paddle.empty(
            (permute_input.shape[0], layer.moe_ffn1_weight.shape[1]),
            dtype=paddle.bfloat16,
        )
        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
            (permute_input, permute_scale),
            (layer.moe_ffn1_weight, layer.moe_ffn1_weight_scale),
            ffn_out,
            m_indices,
        )

        # swiglu
        ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out)

        # ffn2
        ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
            ffn_out, self.quant_config.weight_block_size[0])
        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose(
            [1, 0]).contiguous()
        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])
        ffn_out = paddle.empty(
            (ffn_out.shape[0], layer.moe_ffn2_weight.shape[1]),
            dtype=paddle.bfloat16)
        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
            (ffn_in_x, ffn_in_x_scale_tensor),
            (layer.moe_ffn2_weight, layer.moe_ffn2_weight_scale),
            ffn_out,
            m_indices,
        )

        # prmt back per rank
        tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
            ffn_out,
            dst_weights,
            permute_indices_per_token,
            dst_indices,
            None,
            False,  # norm_topk_prob
            1.0,
        )[0]

        if layer.tp_size > 1:
            tensor_model_parallel_all_reduce(tmp_ffn_out)

        return tmp_ffn_out
@@ -0,0 +1,285 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
import fastdeploy
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.ops.gpu import (MoeWna16MarlinGemmApi,
tritonmoe_preprocess_func)
from ..quantization.quant_base import QuantMethodBase
def gptq_marlin_moe_repack(b_q_weight: paddle.Tensor, perm: paddle.Tensor,
                           size_k: int, size_n: int,
                           num_bits: int) -> paddle.Tensor:
    """
    Repack each expert's GPTQ-quantized weight into the Marlin layout.

    Args:
        b_q_weight: stacked quantized weights; first dim is num_experts.
        perm: per-expert permutation indices passed to the repack op.
        size_k: reduction dimension; must be a multiple of 16.
        size_n: output dimension.
        num_bits: quantization bit width.

    Returns:
        Tensor of shape [num_experts, size_k // 16, size_n * (num_bits // 2)]
        holding one repacked slice per expert.
    """
    from fastdeploy.model_executor.ops.gpu import gptq_marlin_repack

    assert size_k % 16 == 0
    expert_count = b_q_weight.shape[0]
    repacked = paddle.empty(
        [expert_count, size_k // 16, size_n * (num_bits // 2)],
        dtype=b_q_weight.dtype)
    # The repack op works on one expert slice at a time.
    for expert_id in range(expert_count):
        repacked[expert_id] = gptq_marlin_repack(b_q_weight[expert_id],
                                                 perm[expert_id], size_k,
                                                 size_n, num_bits)
    return repacked
def get_scale_perms():
    """
    Build the two Marlin scale permutation tables.

    Returns:
        tuple: (scale_perm, scale_perm_single); scale_perm has 64 entries and
        is used for grouped scales, scale_perm_single has 32 entries and is
        used for per-channel (single-row) scales.
    """
    scale_perm: list[int] = [i + 8 * j for i in range(8) for j in range(8)]
    scale_perm_single: list[int] = [
        2 * i + j for i in range(4) for j in (0, 1, 8, 9, 16, 17, 24, 25)
    ]
    return scale_perm, scale_perm_single
def marlin_permute_scales(s: paddle.Tensor, size_k: int, size_n: int,
                          group_size: int) -> paddle.Tensor:
    """
    Permute one expert's scale tensor into the Marlin scale layout.

    Grouped quantization (0 < group_size < size_k) uses the 64-entry
    permutation; per-channel quantization (group_size == -1 or
    group_size >= size_k) uses the 32-entry single-row permutation.
    """
    scale_perm, scale_perm_single = get_scale_perms()
    grouped = group_size != -1 and group_size < size_k
    perm = scale_perm if grouped else scale_perm_single
    s = s.reshape([-1, len(perm)])[:, perm]
    return s.reshape((-1, size_n)).contiguous()
def marlin_moe_permute_scales(
    s: paddle.Tensor,
    size_k: int,
    size_n: int,
    group_size: int,
):
    """
    Apply marlin_permute_scales to every expert slice of a stacked scale
    tensor and return the stacked result with the same shape as *s*.
    """
    expert_count = s.shape[0]
    permuted = paddle.empty(
        [expert_count, s.shape[1], s.shape[2]],
        dtype=s.dtype,
    )
    for expert_id in range(expert_count):
        permuted[expert_id] = marlin_permute_scales(s[expert_id], size_k,
                                                    size_n, group_size)
    return permuted
class MarlinWeightOnlyMoEMethod(QuantMethodBase):
    """
    Use Marlin Group Gemm to compute Fused MoE.
    """

    def __init__(self, quant_method=None):
        """
        Marlin Group Gemm to compute Fused MoE.

        Args:
            quant_method: optional quantization method descriptor; kept for
                interface parity with the other MoE backends.
        """
        self.quant_method = quant_method
        # Parameter names registered on the layer by create_weights().
        self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        self.added_scale_attrs = [
            "moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
        ]
        self.added_zeros_attrs = ["zeros0", "zeros1"]

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Marlin MoE create weight process.

        Quantizes the stacked FFN weights to int4 (symmetric, per-channel),
        packs 8 nibbles per int32 along K, repacks weights and scales into
        the Marlin kernel layout, and registers them as layer parameters.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        assert len(ffn1_weights) == layer.num_local_experts
        assert len(ffn2_weights) == layer.num_local_experts
        assert ffn1_weights[0].shape == [
            layer.hidden_size, layer.moe_intermediate_size * 2
        ]
        assert ffn2_weights[0].shape == [
            layer.moe_intermediate_size, layer.hidden_size
        ]

        ffn1_tensor = paddle.stack(ffn1_weights, axis=0)
        ffn2_tensor = paddle.stack(ffn2_weights, axis=0)

        max_bound = 7  # int4 symmetric range is [-7, 7]
        for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
            weight_name = self.added_weight_attrs[idx]
            scale_name = self.added_scale_attrs[idx]

            # Per-(expert, output-channel) absmax scale over the K axis.
            weight_scale = weight_tensor.abs().max(axis=1)
            quanted_weight = weight_tensor / weight_scale[:,
                                                          None, :] * max_bound
            quanted_weight = paddle.round(quanted_weight).astype("int32")
            quanted_weight[quanted_weight > 7] = 7
            quanted_weight[quanted_weight < -7] = -7
            # Shift into the unsigned nibble range expected by uint4b8.
            quanted_weight += 8

            # Pack 8 consecutive int4 values along K into one int32.
            E, K, N = quanted_weight.shape
            quanted_weight = quanted_weight.reshape([0, K // 8, 8, N])
            res = paddle.zeros([E, K // 8, N], dtype='int32')
            for j in range(8):
                tmp = quanted_weight[:, :, j, :]
                res = res | (tmp << (j * 4))
            quanted_weight = paddle.assign(res)

            weight_scale = weight_scale / max_bound
            weight_scale = weight_scale[:, None, :]

            group_size = -1  # means per_channel

            g_idx_sort_indices = paddle.empty([E, 0], dtype="int32")
            quanted_weight = gptq_marlin_moe_repack(
                quanted_weight,
                g_idx_sort_indices,
                K,
                N,
                4,
            )
            weight_scale = marlin_moe_permute_scales(
                weight_scale,
                size_k=layer.moe_intermediate_size,  # unused for per-channel
                size_n=N,
                group_size=group_size)

            for (name, tensor) in [(weight_name, quanted_weight),
                                   (scale_name, weight_scale)]:
                setattr(
                    layer, name,
                    layer.create_parameter(
                        shape=tensor.shape,
                        dtype=tensor.dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ))
                getattr(layer, name).set_value(tensor)

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Marlin compute Fused MoE.

        Runs top-k routing, then two Marlin grouped GEMMs
        (ffn1 -> swiglu -> ffn2) and sums the top_k expert outputs per token.
        """
        token_num = x.shape[0]
        top_k = layer.top_k
        moe_intermediate_size = layer.moe_intermediate_size
        hidden_size = layer.hidden_size
        num_experts = layer.num_experts

        # NOTE(review): the incoming gate_out argument is recomputed here from
        # layer.gate_weight; confirm whether callers rely on passing their own.
        gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)

        topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
            gate_out,
            layer.gate_correction_bias,
            top_k,
            True,  # apply_norm_weight,
            False,
        )

        # Pick the smallest block size that keeps expert tiles reasonably full.
        block_size_m = 64
        for m in [8, 16, 32, 48, 64]:
            if token_num * top_k / num_experts / m < 0.9:
                block_size_m = m
                break

        topk = top_k
        # for H100 132 sms
        workspace = paddle.empty([528], dtype="int32")

        sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func(
            topk_ids, num_experts, block_size_m)

        ffn_out = MoeWna16MarlinGemmApi(
            x,
            c_or_none=None,
            b_q_weight=layer.moe_ffn1_weight,
            b_scales=layer.moe_ffn1_weight_scale,
            global_scale_or_none=None,
            b_zeros_or_none=None,
            g_idx_or_none=None,
            perm_or_none=None,
            workspace=workspace,
            sorted_token_ids=sorted_token_ids,
            expert_ids=expert_ids,
            num_tokens_post_padded=num_tokens_post_padded,
            topk_weights=topk_weights,
            moe_block_size=block_size_m,
            top_k=topk,
            mul_topk_weights=False,
            is_ep=False,
            b_q_type_str="uint4b8",
            size_m=token_num,
            size_n=moe_intermediate_size * 2,
            size_k=hidden_size,
            is_k_full=True,
            use_atomic_add=True,
            use_fp32_reduce=True,
            is_zp_float=False)[0]

        swiglu_out = paddle.incubate.nn.functional.swiglu(ffn_out)

        ffn_out = MoeWna16MarlinGemmApi(
            swiglu_out,
            c_or_none=None,
            b_q_weight=layer.moe_ffn2_weight,
            b_scales=layer.moe_ffn2_weight_scale,
            global_scale_or_none=None,
            b_zeros_or_none=None,
            g_idx_or_none=None,
            perm_or_none=None,
            workspace=workspace,
            sorted_token_ids=sorted_token_ids,
            expert_ids=expert_ids,
            num_tokens_post_padded=num_tokens_post_padded,
            topk_weights=topk_weights,
            moe_block_size=block_size_m,
            top_k=1,  # rows are already expanded top_k-fold
            mul_topk_weights=True,
            is_ep=False,
            b_q_type_str="uint4b8",
            size_m=token_num * topk,
            size_n=hidden_size,
            size_k=moe_intermediate_size,
            is_k_full=True,
            use_atomic_add=True,
            use_fp32_reduce=True,
            is_zp_float=False)[0]

        # Sum the top_k expert outputs back to one row per token.
        ffn_out.reshape_([token_num, -1, hidden_size])
        ffn_out = ffn_out.sum(axis=1)

        if layer.tp_size > 1:
            tensor_model_parallel_all_reduce(ffn_out)

        return ffn_out
@@ -1,57 +0,0 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import abstractmethod
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantMethodBase
class FusedMoEMethodBase(QuantMethodBase):
    """
    Abstract base class for fused-MoE compute methods.

    Every MoE backend must inherit from this class and implement both
    create_weights() and apply().
    """

    @abstractmethod
    def create_weights(self,
                       layer: nn.Layer,
                       moe_compute_params,
                       ffn1_tensor,
                       ffn2_tensor,
                       ffn1_bias=None,
                       ffn2_bias=None):
        """
        Create and register this method's weight parameters on *layer*.

        Args:
            layer: the MoE layer to attach parameters to.
            moe_compute_params: backend-specific compute parameters.
            ffn1_tensor: first-FFN weight tensor(s).
            ffn2_tensor: second-FFN weight tensor(s).
            ffn1_bias: optional first-FFN bias.
            ffn2_bias: optional second-FFN bias.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    @abstractmethod
    def apply(
        self,
        layer: nn.Layer,
        moe_compute_params,
        x: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Run the fused MoE computation on input *x* and return the result.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError
@@ -0,0 +1,479 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.layers.utils import (create_hadamard_matrix_map,
get_tensor)
from fastdeploy.utils import ceil_div
from ..quantization.quant_base import QuantMethodBase
class TritonWeightOnlyMoEMethod(QuantMethodBase):
    """
    Use Triton Group Gemm to compute Fused MoE.
    """

    def __init__(self, quant_method=None):
        """
        Triton Group Gemm to compute Fused MoE.

        Args:
            quant_method: optional quantization method descriptor; kept for
                interface parity with the other MoE backends.
        """
        self.quant_method = quant_method
        # Parameter names registered on the layer by create_weights().
        self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        self.added_scale_attrs = [
            "moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
        ]

    def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
        """process_prequanted_weights (no-op for this backend)"""
        pass

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Triton MoE create weight process.

        Applies symmetric per-(expert, output-channel) absmax quantization to
        the stacked FFN weights and registers the int8 weights and their
        float scales as layer parameters.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        assert len(ffn1_weights) == layer.num_local_experts
        assert len(ffn2_weights) == layer.num_local_experts
        assert layer.quant_method.quant_config.name() == "wint8"
        assert ffn1_weights[0].shape == [
            layer.hidden_size, layer.moe_intermediate_size * 2
        ]
        assert ffn2_weights[0].shape == [
            layer.moe_intermediate_size, layer.hidden_size
        ]

        ffn1_tensor = paddle.stack(ffn1_weights, axis=0)
        ffn2_tensor = paddle.stack(ffn2_weights, axis=0)

        # NOTE(review): self.quant_config is never assigned in __init__; this
        # relies on it being attached externally — confirm against callers.
        if self.quant_config.name() == "wint8":
            max_bound = 127
        elif self.quant_config.name() == "wint4":
            max_bound = 7
        else:
            # Previously an unknown name fell through and left max_bound
            # unbound (NameError); fail explicitly instead.
            raise ValueError(
                f"Unsupported quant type: {self.quant_config.name()}")

        for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
            weight_name = self.added_weight_attrs[idx]
            scale_name = self.added_scale_attrs[idx]

            # Per-(expert, output-channel) absmax scale over the K axis.
            quanted_weight_scale = weight_tensor.abs().max(axis=1)
            quanted_weight = weight_tensor / quanted_weight_scale[:,
                                                                  None, :] * max_bound
            quanted_weight = paddle.round(quanted_weight).astype("int8")
            quanted_weight_scale = quanted_weight_scale / max_bound

            setattr(
                layer, weight_name,
                layer.create_parameter(
                    shape=quanted_weight.shape,
                    dtype=quanted_weight.dtype,
                    default_initializer=paddle.nn.initializer.Constant(0),
                ))
            getattr(layer, weight_name).set_value(quanted_weight)

            setattr(
                layer, scale_name,
                layer.create_parameter(
                    shape=quanted_weight_scale.shape,
                    dtype=quanted_weight_scale.dtype,
                ))
            getattr(layer, scale_name).set_value(quanted_weight_scale)

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Triton compute Fused MoE.

        Softmax routing, then two Triton grouped GEMMs
        (ffn1 -> swiglu -> ffn2); the routed weights are applied inside the
        second GEMM and the top_k expert outputs are summed per token.
        """
        token_num = x.shape[0]
        top_k = layer.top_k
        num_local_experts = layer.num_local_experts
        moe_intermediate_size = layer.moe_intermediate_size
        hidden_size = layer.hidden_size

        # NOTE(review): the incoming gate_out argument is recomputed here from
        # layer.gate_weight; confirm whether callers rely on passing their own.
        gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
        scores = paddle.nn.functional.softmax(gate_out, axis=-1)
        topk_weights, topk_ids = paddle.topk(scores,
                                             k=top_k,
                                             axis=-1,
                                             sorted=False)
        topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)

        # Output buffers for the two GEMMs; the swiglu result between them is
        # produced directly by the swiglu op, so no buffer is preallocated.
        intermediate_cache1 = paddle.empty(
            [token_num * top_k, moe_intermediate_size * 2],
            dtype=x.dtype,
        )
        intermediate_cache3 = paddle.empty(
            (token_num * top_k, hidden_size),
            dtype=x.dtype,
        )

        config = {
            "BLOCK_SIZE_M": 32,
            "BLOCK_SIZE_N": 128,
            "BLOCK_SIZE_K": 128,
            "GROUP_SIZE_M": 1,
        }

        from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess

        from .triton_moe_kernels import fused_moe_kernel_paddle

        sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
            topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
        max_num_tokens_padded = sorted_token_ids.shape[0]

        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
                ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )

        fused_moe_kernel_paddle[grid](
            x,
            layer.moe_ffn1_weight,
            intermediate_cache1,
            None,
            layer.moe_ffn1_weight_scale,
            None,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            moe_intermediate_size * 2,
            hidden_size,
            max_num_tokens_padded,
            token_num * top_k,
            stride_am=x.strides[0],
            stride_ak=x.strides[1],
            stride_be=layer.moe_ffn1_weight.strides[0],
            stride_bk=layer.moe_ffn1_weight.strides[1],
            stride_bn=layer.moe_ffn1_weight.strides[2],
            stride_cm=intermediate_cache1.strides[0],
            stride_cn=intermediate_cache1.strides[1],
            # -1 marks strides that are unused for per-channel int8 scales.
            stride_asm=-1,
            stride_ask=-1,
            stride_bse=layer.moe_ffn1_weight_scale.strides[0],
            stride_bsk=-1,
            stride_bsn=layer.moe_ffn1_weight_scale.strides[1],
            group_n=-1,
            group_k=-1,
            # Meta-parameters
            BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
            BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
            BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
            GROUP_SIZE_M=config["GROUP_SIZE_M"],
            MUL_ROUTED_WEIGHT=False,
            top_k=top_k,
            compute_type_enum=1,
            use_fp8_w8a8=False,
            use_int8_w8a16=True,
            even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
        )

        intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
            intermediate_cache1)

        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
                ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )

        fused_moe_kernel_paddle[grid](
            intermediate_cache2,
            layer.moe_ffn2_weight,
            intermediate_cache3,
            None,
            layer.moe_ffn2_weight_scale,
            topk_weights,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            hidden_size,
            moe_intermediate_size,
            max_num_tokens_padded,
            token_num * top_k,
            stride_am=intermediate_cache2.strides[0],
            stride_ak=intermediate_cache2.strides[1],
            stride_be=layer.moe_ffn2_weight.strides[0],
            stride_bk=layer.moe_ffn2_weight.strides[1],
            stride_bn=layer.moe_ffn2_weight.strides[2],
            stride_cm=intermediate_cache3.strides[0],
            stride_cn=intermediate_cache3.strides[1],
            stride_asm=-1,
            stride_ask=-1,
            stride_bse=layer.moe_ffn2_weight_scale.strides[0],
            stride_bsk=-1,
            stride_bsn=layer.moe_ffn2_weight_scale.strides[1],
            group_n=-1,
            group_k=-1,
            # Meta-parameters
            BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
            BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
            BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
            GROUP_SIZE_M=config["GROUP_SIZE_M"],
            MUL_ROUTED_WEIGHT=True,  # apply routing weights on the way out
            top_k=1,  # rows are already expanded top_k-fold
            compute_type_enum=1,
            use_fp8_w8a8=False,
            use_int8_w8a16=True,
            even_Ks=moe_intermediate_size % config["BLOCK_SIZE_K"] == 0,
        )

        # Sum the top_k expert outputs back to one row per token.
        intermediate_cache3.reshape_([token_num, top_k, hidden_size])
        out = intermediate_cache3.sum(axis=1)
        # NOTE(review): unlike the other backends, no tensor-parallel
        # all-reduce is performed here — confirm this is intentional.
        return out
class TensorWiseFP8MoEMethod(QuantMethodBase):
    """
    Tensor-wise FP8 (w8a8) Fused MoE computed with the Triton grouped GEMM
    kernel; activations are rotated by a Hadamard matrix before quantization.
    """

    def __init__(self, quant_method=None):
        """
        Triton Group Gemm to compute Fused MoE.

        Args:
            quant_method: optional quantization method descriptor; kept for
                interface parity with the other MoE backends.
        """
        self.quant_method = quant_method

    def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
        """
        Load prequantized FP8 expert weights plus their weight and activation
        scales from *state_dict* and register them as layer parameters.
        """
        ffn1_tensor, ffn2_tensor = layer.extract_moe_ffn_weights(state_dict)
        assert ffn1_tensor[0].shape == [
            layer.hidden_size, layer.moe_intermediate_size * 2
        ]
        assert ffn2_tensor[0].shape == [
            layer.moe_intermediate_size, layer.hidden_size
        ]
        ffn1_tensor = paddle.stack(ffn1_tensor, axis=0)
        ffn2_tensor = paddle.stack(ffn2_tensor, axis=0)

        added_wfp8afp8_attrs = [
            "moe_ffn1_weight", "moe_ffn2_weight", "moe_ffn1_weight_scale",
            "moe_ffn2_weight_scale", "moe_ffn1_in_scale", "moe_ffn2_in_scale"
        ]

        def _extract_scale_tensor(key_template):
            # Collect one scalar scale per expert into a float32 tensor.
            result = []
            for i in range(layer.num_experts):
                result.append(
                    get_tensor(state_dict.pop(key_template.format(i))))
            return paddle.concat(result).cast("float32")

        weight_key_map = layer.weight_key_map
        moe_ffn1_weight_scale = _extract_scale_tensor(
            weight_key_map["ffn1_expert_weight_scale_key"])
        moe_ffn2_weight_scale = _extract_scale_tensor(
            weight_key_map["ffn2_expert_weight_scale_key"])
        moe_ffn1_in_scale = _extract_scale_tensor(
            weight_key_map["ffn1_expert_in_scale_key"])
        moe_ffn2_in_scale = _extract_scale_tensor(
            weight_key_map["ffn2_expert_in_scale_key"])

        for idx, weight_tensor in enumerate([
                ffn1_tensor, ffn2_tensor, moe_ffn1_weight_scale,
                moe_ffn2_weight_scale, moe_ffn1_in_scale, moe_ffn2_in_scale
        ]):
            name = added_wfp8afp8_attrs[idx]
            setattr(
                layer, name,
                layer.create_parameter(
                    shape=weight_tensor.shape,
                    dtype=weight_tensor.dtype,
                    default_initializer=paddle.nn.initializer.Constant(0),
                ))
            getattr(layer, name).set_value(weight_tensor)

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Triton MoE create weight process (no-op: weights are prequantized and
        handled by process_prequanted_weights).
        """
        pass

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Triton compute Fused MoE.

        Softmax routing, Hadamard rotation + FP8 quantization of activations,
        two Triton grouped GEMMs (ffn1 -> swiglu -> ffn2), then per-token
        reduction and (when tp_size > 1) a tensor-parallel all-reduce.
        """
        token_num = x.shape[0]
        top_k = layer.top_k
        num_local_experts = layer.num_local_experts
        moe_intermediate_size = layer.moe_intermediate_size
        hidden_size = layer.hidden_size

        # NOTE(review): the incoming gate_out argument is recomputed here from
        # layer.gate_weight; confirm whether callers rely on passing their own.
        gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
        scores = paddle.nn.functional.softmax(gate_out, axis=-1)
        topk_weights, topk_ids = paddle.topk(scores,
                                             k=top_k,
                                             axis=-1,
                                             sorted=False)
        topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)

        # Output buffers for the two GEMMs; the swiglu result between them is
        # produced directly by the swiglu op, so no buffer is preallocated.
        intermediate_cache1 = paddle.empty(
            [token_num * top_k, moe_intermediate_size * 2],
            dtype=x.dtype,
        )
        intermediate_cache3 = paddle.empty(
            (token_num * top_k, hidden_size),
            dtype=x.dtype,
        )

        config = {
            "BLOCK_SIZE_M": 32,
            "BLOCK_SIZE_N": 128,
            "BLOCK_SIZE_K": 128,
            "GROUP_SIZE_M": 1,
        }

        from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess

        sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
            topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
        max_num_tokens_padded = sorted_token_ids.shape[0]

        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
                ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )

        # Rotate activations by the Hadamard matrix, expand each token
        # top_k-fold, then quantize with the per-expert input scale.
        hadamard_matrix = create_hadamard_matrix_map[hidden_size]
        x = paddle.matmul(x.cast("float32"), hadamard_matrix)

        permute_x = x[:, None, :].tile([1, top_k, 1])
        permute_x = permute_x.reshape([-1, hidden_size])
        quant_activation_scale = layer.moe_ffn1_in_scale[topk_ids].reshape(
            [-1, 1])
        permute_x = permute_x / quant_activation_scale
        permute_x = permute_x.astype("float8_e4m3fn")

        from .triton_moe_kernels import fused_moe_kernel_paddle

        fused_moe_kernel_paddle[grid](
            permute_x,
            layer.moe_ffn1_weight.view(paddle.float8_e4m3fn),
            intermediate_cache1,
            layer.moe_ffn1_in_scale,
            layer.moe_ffn1_weight_scale,
            None,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            moe_intermediate_size * 2,
            hidden_size,
            max_num_tokens_padded,
            token_num * top_k,
            # fix: strides must come from the actual A operand (permute_x),
            # not from x, which has a different dtype/row count.
            stride_am=permute_x.strides[0],
            stride_ak=permute_x.strides[1],
            stride_be=layer.moe_ffn1_weight.strides[0],
            stride_bk=layer.moe_ffn1_weight.strides[1],
            stride_bn=layer.moe_ffn1_weight.strides[2],
            stride_cm=intermediate_cache1.strides[0],
            stride_cn=intermediate_cache1.strides[1],
            #
            stride_asm=-1,  # only used in blockwise fp8
            stride_ask=-1,  # only used in blockwise fp8
            stride_bse=-1,
            stride_bsk=-1,
            stride_bsn=-1,
            group_n=-1,
            group_k=-1,
            # Meta-parameters
            BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
            BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
            BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
            GROUP_SIZE_M=config["GROUP_SIZE_M"],
            MUL_ROUTED_WEIGHT=False,
            top_k=1,  # input rows are already expanded top_k-fold
            compute_type_enum=1,
            use_fp8_w8a8=True,
            use_int8_w8a16=False,
            even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
        )

        intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
            intermediate_cache1)

        # Rotate + quantize again before the second GEMM.
        hadamard_matrix = create_hadamard_matrix_map[moe_intermediate_size]
        intermediate_cache2 = paddle.matmul(
            intermediate_cache2.cast("float32"), hadamard_matrix)

        quant_activation_scale = layer.moe_ffn2_in_scale[topk_ids].reshape(
            [-1, 1])
        intermediate_cache2 = intermediate_cache2 / quant_activation_scale
        intermediate_cache2 = intermediate_cache2.astype("float8_e4m3fn")

        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
                ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )

        fused_moe_kernel_paddle[grid](
            intermediate_cache2,
            layer.moe_ffn2_weight.view(paddle.float8_e4m3fn),
            intermediate_cache3,
            layer.moe_ffn2_in_scale,
            layer.moe_ffn2_weight_scale,
            topk_weights,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            hidden_size,
            moe_intermediate_size,
            max_num_tokens_padded,
            token_num * top_k,
            stride_am=intermediate_cache2.strides[0],
            stride_ak=intermediate_cache2.strides[1],
            stride_be=layer.moe_ffn2_weight.strides[0],
            stride_bk=layer.moe_ffn2_weight.strides[1],
            stride_bn=layer.moe_ffn2_weight.strides[2],
            stride_cm=intermediate_cache3.strides[0],
            stride_cn=intermediate_cache3.strides[1],
            stride_asm=-1,
            stride_ask=-1,
            stride_bse=-1,
            stride_bsk=-1,
            stride_bsn=-1,
            group_n=-1,
            group_k=-1,
            # Meta-parameters
            BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
            BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
            BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
            GROUP_SIZE_M=config["GROUP_SIZE_M"],
            MUL_ROUTED_WEIGHT=True,  # apply routing weights on the way out
            top_k=1,
            compute_type_enum=1,
            use_fp8_w8a8=True,
            use_int8_w8a16=False,
            even_Ks=moe_intermediate_size % config["BLOCK_SIZE_K"] == 0,
        )

        # Sum the top_k expert outputs back to one row per token.
        intermediate_cache3.reshape_([token_num, top_k, hidden_size])
        out = intermediate_cache3.sum(axis=1)

        if layer.tp_size > 1:
            tensor_model_parallel_all_reduce(out)

        return out
@@ -0,0 +1,236 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
import fastdeploy
from ..quantization.quant_base import QuantMethodBase
from ..utils import create_and_set_parameter, get_tensor
class Wint2MoeMethod(QuantMethodBase):
    """
    Base compute method for wint2-quantized Fused MoE.
    """

    def __init__(self, quant_config):
        """Store the wint2 quant type from *quant_config*."""
        super().__init__()
        self.moe_quant_type = quant_config.moe_quant_type

    def process_loaded_weights(self, layer, weights) -> None:
        """
        process_loaded_weights (no-op in the base class)
        """
        pass

    def check(self, layer: nn.Layer, ffn1_weights, ffn2_weights):
        """
        Validate that exactly one weight tensor was provided per local expert.
        """
        expected = layer.num_local_experts
        assert len(
            ffn1_weights
        ) == expected, "ffn1_weights length should be equal to num_local_experts."
        assert len(
            ffn2_weights
        ) == expected, "ffn2_weights length should be equal to num_local_experts."

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Paddle cutlass create weight process (no-op in the base class).
        """
        pass
class TritonWint2FusedMoeMethod(Wint2MoeMethod):
    """
    Wint2 Fused MoE backed by the moe_expert_ffn_wint2 op.
    """

    def __init__(self, quant_config):
        """Forward *quant_config* to the base class."""
        # Wint2MoeMethod.__init__ already stores moe_quant_type; the previous
        # duplicate assignment here was redundant.
        super().__init__(quant_config)

    def process_loaded_weights(self, layer, weights) -> None:
        """
        process_loaded_weights (no-op: weights are prequantized)
        """
        pass

    def process_prequanted_weights(self, layer: nn.Layer, state_dict):
        """
        Paddle cutlass process prequanted weights.

        Loads the per-expert wint2 weights plus their weight/super/code
        scales and code zero-points from *state_dict* and registers each
        stacked tensor as a layer parameter.
        """
        weight_key_map = layer.weight_key_map

        ffn1_expert_weight_key = weight_key_map.get(
            "ffn1_expert_weight_key", None)
        ffn2_expert_weight_key = weight_key_map.get(
            "ffn2_expert_weight_key", None)

        ffn1_weights, ffn2_weights = layer.load_experts_weight(
            state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
        # self.check(layer, ffn1_weights, ffn2_weights)

        def _load_per_expert(key_template):
            # Pop one tensor per local expert; expert ids start at
            # layer.expert_id_offset.
            return [
                get_tensor(
                    state_dict.pop(
                        key_template.format(layer.expert_id_offset + i)))
                for i in range(layer.num_experts)
            ]

        name_tensor_map = {
            "moe_ffn1_weight": paddle.stack(ffn1_weights, axis=0),
            "moe_ffn2_weight": paddle.stack(ffn2_weights, axis=0),
        }
        # Parameter name -> weight_key_map entry for the auxiliary scale /
        # zero-point tensors; all are loaded and stacked the same way.
        scale_key_names = {
            "moe_ffn1_weight_scale": "ffn1_expert_weight_scale_key",
            "moe_ffn2_weight_scale": "ffn2_expert_weight_scale_key",
            "moe_ffn1_super_scales": "ffn1_expert_super_scales_key",
            "moe_ffn2_super_scales": "ffn2_expert_super_scales_key",
            "moe_ffn1_code_scale": "ffn1_expert_code_scale_key",
            "moe_ffn2_code_scale": "ffn2_expert_code_scale_key",
            "moe_ffn1_code_zp": "ffn1_expert_code_zp_key",
            "moe_ffn2_code_zp": "ffn2_expert_code_zp_key",
        }
        for param_name, key_name in scale_key_names.items():
            tensors = _load_per_expert(weight_key_map.get(key_name, None))
            name_tensor_map[param_name] = paddle.stack(tensors, axis=0)

        for name, tensor in name_tensor_map.items():
            create_and_set_parameter(layer, name, tensor)

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Paddle cutlass create weight process (no-op: prequantized path only).
        """
        pass

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Use Wint2 Triton Fusedmoe compute Fused MoE.

        Dispatches tokens to experts, runs the wint2 expert FFN op, then
        reduces the per-expert outputs back to one row per token.
        """
        from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch
        (
            permute_input,
            token_nums_per_expert,
            permute_indices_per_token,
            topk_weights,
            topk_idx,
            expert_idx_per_token,
        ) = moe_expert_dispatch(
            x,
            gate_out,
            layer.gate_correction_bias,
            (layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
             else None),  # if set, permute_input will be int8_t
            layer.top_k,
            False,
            topk_only_mode=False,
        )

        ffn_out = fastdeploy.model_executor.ops.gpu.moe_expert_ffn_wint2(
            permute_input,
            token_nums_per_expert,
            layer.moe_ffn1_weight,
            layer.moe_ffn2_weight,
            None,
            layer.moe_ffn1_super_scales,
            layer.moe_ffn2_super_scales,
            layer.moe_ffn1_weight_scale,
            layer.moe_ffn1_code_scale,
            layer.moe_ffn1_code_zp,
            layer.moe_ffn2_weight_scale,
            layer.moe_ffn2_code_scale,
            layer.moe_ffn2_code_zp,
            False,
        )

        from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
        fused_moe_out = moe_expert_reduce(
            ffn_out,
            topk_weights,
            permute_indices_per_token,
            topk_idx,
            None,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )

        return fused_moe_out
-273
View File
@@ -1,273 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.moe.moe import MoELayer
from fastdeploy.model_executor.layers.utils import get_tensor
class TextMoELayer(MoELayer):
    """
    MoE (Mixture of Experts) layer for text tokens; expert weights are taken
    from expert indices [0, num_experts) of the shared parameter set.
    """

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        """
        Initialize the text MoE layer.

        Args:
            args: positional arguments forwarded to MoELayer.
            kwargs: keyword arguments forwarded to MoELayer; moe_tag is
                forced to "Text".

        Returns:
            None.
        """
        kwargs["moe_tag"] = "Text"
        super().__init__(*args, **kwargs)

    def load_gate_state_dict(self, state_dict):
        """
        Pop this layer's expert weights from the state dict.

        Uses expert indices [0, num_experts) — the text experts.

        Args:
            state_dict (OrderedDict): mapping holding the expert parameters.

        Returns:
            tuple: (up_gate_proj_weight, down_proj_weight,
            up_gate_proj_weight_scale, down_proj_weight_scale); the two scale
            lists are returned empty here (no quantization scales loaded).
        """
        up_gate_proj_weight = []
        up_gate_proj_weight_scale = []
        down_proj_weight = []
        down_proj_weight_scale = []
        for j in range(0, self.num_experts):
            up_gate_proj_weight.append(
                get_tensor(state_dict.pop(self.ffn1_expert_weight_key.format(j)))
            )
            down_proj_weight.append(
                get_tensor(state_dict.pop(self.ffn2_expert_weight_key.format(j)))
            )
        return (
            up_gate_proj_weight,
            down_proj_weight,
            up_gate_proj_weight_scale,
            down_proj_weight_scale,
        )

    def load_gate_correction_bias(self, state_dict):
        """
        Load the gate correction bias, when enabled in moe_config.

        Row 0 of the stored tensor (the text slice) is set on
        self.gate_correction_bias.

        Args:
            state_dict (OrderedDict): mapping holding the model parameters.

        Returns:
            None; mutates self.gate_correction_bias in place.
        """
        if self.moe_config.moe_use_gate_correction_bias:
            gate_correction_bias_tensor = get_tensor(
                state_dict[self.gate_correction_bias_key]
            )
            self.gate_correction_bias.set_value(
                gate_correction_bias_tensor[0].unsqueeze(0)
            )
class ImageMoELayer(MoELayer):
    """
    MoE (Mixture of Experts) layer for image tokens; expert weights are taken
    from expert indices [num_experts, 2 * num_experts) of the shared set.
    """

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        """
        Initialize the image MoE layer.

        The quant type can be overridden via the ELLM_MM_IMAGE_QUANT_TYPE
        environment variable; moe_tag is forced to "Image".

        Args:
            args: positional arguments forwarded to MoELayer.
            kwargs: keyword arguments forwarded to MoELayer.

        Returns:
            None.
        """
        moe_quant_type = os.getenv("ELLM_MM_IMAGE_QUANT_TYPE", None)
        if moe_quant_type is not None:
            kwargs["moe_quant_type"] = moe_quant_type
        kwargs["moe_tag"] = "Image"
        super().__init__(*args, **kwargs)

    def load_gate_state_dict(self, state_dict):
        """
        Pop this layer's expert weights from the state dict.

        Uses expert indices [num_experts, 2 * num_experts) — the image
        experts stored after the text experts.

        Args:
            state_dict (OrderedDict): mapping holding the expert parameters.

        Returns:
            tuple: (up_gate_proj_weight, down_proj_weight,
            up_gate_proj_weight_scale, down_proj_weight_scale); the two scale
            lists are returned empty here (no quantization scales loaded).
        """
        up_gate_proj_weight = []
        up_gate_proj_weight_scale = []
        down_proj_weight = []
        down_proj_weight_scale = []
        for j in range(self.num_experts, self.num_experts + self.num_experts):
            up_gate_proj_weight.append(
                get_tensor(state_dict.pop(self.ffn1_expert_weight_key.format(j)))
            )
            down_proj_weight.append(
                get_tensor(state_dict.pop(self.ffn2_expert_weight_key.format(j)))
            )
        return (
            up_gate_proj_weight,
            down_proj_weight,
            up_gate_proj_weight_scale,
            down_proj_weight_scale,
        )

    def load_gate_correction_bias(self, state_dict):
        """
        Load the gate correction bias, when enabled in moe_config.

        Row 1 of the stored tensor (the image slice) is set on
        self.gate_correction_bias.

        Args:
            state_dict (OrderedDict): mapping holding the model parameters.

        Returns:
            None; mutates self.gate_correction_bias in place.
        """
        if self.moe_config.moe_use_gate_correction_bias:
            gate_correction_bias_tensor = get_tensor(
                state_dict[self.gate_correction_bias_key]
            )
            self.gate_correction_bias.set_value(
                gate_correction_bias_tensor[1].unsqueeze(0)
            )
class MultimodalityMoeLayer(nn.Layer):
    """
    Multimodal MoE layer.

    Routes text tokens and image tokens through two separate expert pools
    (a ``TextMoELayer`` and an ``ImageMoELayer``) that are loaded from the
    same checkpoint weight keys.
    """
    def __init__(
        self,
        inference_args,
        layer_name,
        layer_idx,
    ):
        """
        Build the paired text/image MoE sub-layers.

        Args:
            inference_args: Inference configuration; ``moe_config`` is used
                for the text sub-layer and ``moe_config_1`` for the image
                sub-layer.
            layer_name (str): Name of this MoE layer; ".text" / ".image"
                suffixes are appended for the two sub-layers.
            layer_idx (int): Index of this layer in the model, used to build
                the checkpoint weight keys.

        Returns:
            None.
        """
        super().__init__()
        # NOTE(review): both sub-layers are given the same expert-weight key
        # templates; the image sub-layer is expected to consume the second
        # half of the expert index range — confirm against
        # ImageMoELayer.load_gate_state_dict.
        self.text_moe_layer = TextMoELayer(
            inference_args=inference_args,
            moe_config=inference_args.moe_config,
            layer_name=layer_name + ".text",
            gate_weight_key=f"ernie.layers.{layer_idx}.mlp.gate.weight",
            ffn1_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
            + ".{}.up_gate_proj.weight",
            ffn2_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
            + ".{}.down_proj.weight",
            gate_correction_bias_key=f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias",
            ffn1_bias_key=None,
            ffn2_bias_key=None,
            ffn1_shared_weight_key=None,
            ffn1_shared_bias_key=None,
            ffn2_shared_weight_key=None,
            ffn2_shared_bias_key=None,
            layer_idx=layer_idx,
        )
        # The image gate uses its own weight ("gate.weight_1") but shares the
        # expert weight keys and the gate-correction-bias key with the text
        # sub-layer.
        self.image_moe_layer = ImageMoELayer(
            inference_args=inference_args,
            moe_config=inference_args.moe_config_1,
            layer_name=layer_name + ".image",
            gate_weight_key=f"ernie.layers.{layer_idx}.mlp.gate.weight_1",
            ffn1_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
            + ".{}.up_gate_proj.weight",
            ffn2_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
            + ".{}.down_proj.weight",
            gate_correction_bias_key=f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias",
            ffn1_bias_key=None,
            ffn2_bias_key=None,
            ffn1_shared_weight_key=None,
            ffn1_shared_bias_key=None,
            ffn2_shared_weight_key=None,
            ffn2_shared_bias_key=None,
            layer_idx=layer_idx,
        )
    def load_state_dict(self, state_dict):
        """
        Load parameters for both sub-layers from ``state_dict``.

        Consumed entries are popped from ``state_dict`` in place.

        Args:
            state_dict (dict): Mapping from checkpoint keys to weights.

        Returns:
            None.
        """
        self.text_moe_layer.load_state_dict(state_dict)
        self.image_moe_layer.load_state_dict(state_dict)
        # The gate-correction-bias entry is shared: each sub-layer reads it
        # without popping, so it is removed here once both have loaded it.
        state_dict.pop(self.text_moe_layer.gate_correction_bias_key)
    def forward(self, x, **kwargs):
        """
        Route each token to the text or the image MoE sub-layer.

        Args:
            x (Tensor): Input tensor of shape [token_num, hidden_size].
            **kwargs: Must contain ``token_type_ids`` (Tensor of shape
                [token_num]) in which 0 marks a text token and 1 marks an
                image token.

        Returns:
            Tensor: Output tensor with the same shape as ``x``; text and
            image positions are filled from their respective sub-layers,
            any other positions stay zero.

        Raises:
            AssertionError: If ``token_type_ids`` is not provided.
        """
        token_type_ids = kwargs.get("token_type_ids", None)
        assert token_type_ids is not None
        # x.shape is [token_num, hidden_size]
        fused_moe_out = paddle.zeros_like(x)
        text_mask = token_type_ids == 0  # [token_num]
        image_mask = token_type_ids == 1
        # Run each sub-layer only on its own tokens and scatter the results
        # back into the zero-initialized output.
        if text_mask.any():
            text_out = self.text_moe_layer(x[text_mask])
            fused_moe_out[text_mask] = text_out
        if image_mask.any():
            image_out = self.image_moe_layer(x[image_mask])
            fused_moe_out[image_mask] = image_out
        return fused_moe_out
+156 -160
View File
@@ -1,5 +1,5 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,34 +14,13 @@
# limitations under the License.
"""
from dataclasses import dataclass
import paddle
from paddle import nn
from paddlenlp.utils.log import logger
from paddleformers.utils.log import logger
from fastdeploy import envs
from fastdeploy.model_executor.layers.utils import get_tensor
from .cutlass_fused_moe import CutlassFusedMoeMethod
@dataclass
class MoEComputeParams:
"""
some params for computing MoE.
it is given to different compute methods.
"""
global_num_experts: int = -1
top_k: int = -1
hidden_size: int = -1
num_local_experts: int = -1
moe_intermediate_size: int = -1
tp_size: int = -1
ep_size: int = -1
dp_size: int = -1
moe_quant_type: str = ""
class FusedMoE(nn.Layer):
"""
@@ -50,174 +29,195 @@ class FusedMoE(nn.Layer):
def __init__(
self,
llm_config,
fd_config,
moe_intermediate_size: int = -1,
num_experts: int = -1,
expert_id_offset: int = 0,
top_k: int = -1,
moe_use_gate_correction_bias: bool = False,
moe_quant_type: str = "weight_only_int4",
layer_idx: int = -1,
gate_weight_key=None,
gate_correction_bias_key=None,
ffn1_expert_weight_key=None,
ffn2_expert_weight_key=None,
moe_ffn1_bias_keys=None,
moe_ffn2_bias_keys=None,
moe_ffn1_weight_scale_keys=None,
moe_ffn2_weight_scale_keys=None,
moe_ffn1_in_scale_keys=None,
moe_ffn2_in_scale_keys=None,
moe_tag: str = "",
weight_key_map: dict = {},
):
"""
Initialize the Moe layer with given parameters.
Args:
llm_config (LLMConfig): Arguments related to inference, containing
fd_config (FDConfig): Arguments related to inference, containing
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
num_attention_heads, and ffn_hidden_size.
"""
super().__init__()
self.llm_config = llm_config
self.fd_config = fd_config
self.layer_idx = layer_idx
self.tp_size = llm_config.parallel_config.mp_size
self.ep_size = llm_config.parallel_config.ep_size
self.moe_use_gate_correction_bias = moe_use_gate_correction_bias
self.tp_size = fd_config.parallel_config.tensor_parallel_degree
self.ep_size = fd_config.parallel_config.expert_parallel_degree
self.ep_rank = fd_config.parallel_config.expert_parallel_rank
assert (self.tp_size >= 1 and self.ep_size == 1) or \
(self.tp_size == 1 and self.ep_size > 1), \
'MoE only support parallelism on TP or EP dimension.'
self.hidden_size = fd_config.model_config.hidden_size
self.moe_config = fd_config.moe_config
self.hidden_size = llm_config.model_config.hidden_size
self.moe_config = llm_config.moe_config
self.use_offline_quant = llm_config.tmp_config.use_offline_quant
moe_tag = self.llm_config.moe_config.moe_tag
logger.info(f"{moe_tag}MoE is running in {moe_quant_type} mode")
self.moe_quant_type = moe_quant_type
self.num_experts = num_experts
self.num_local_experts = self.num_experts // self.ep_size
logger.info(f'''MoE config is num_experts:{num_experts},
top_k:{top_k},
hidden_size:{self.hidden_size},
moe_intermediate_size:{moe_intermediate_size}''')
logger.info(
f"MoE is running on moe_quant_type: {self.moe_quant_type}, ep:{self.ep_size}, tp:{self.tp_size} mode"
)
self.moe_intermediate_size = moe_intermediate_size // self.tp_size
self.gate_weight_key = gate_weight_key
self.gate_correction_bias_key = gate_correction_bias_key
self.top_k = top_k
self.hidden_size = self.hidden_size
self.moe_intermediate_size = moe_intermediate_size // self.tp_size
self.weight_key_map = weight_key_map
self.ffn1_expert_weight_key = ffn1_expert_weight_key
self.ffn2_expert_weight_key = ffn2_expert_weight_key
self.ffn1_bias_key = moe_ffn1_bias_keys
self.ffn2_bias_key = moe_ffn2_bias_keys
self.use_method = envs.FD_MOE_BACKEND.lower()
self.gate_correction_bias = None
self.moe_tag = moe_tag
if self.moe_quant_type == "w4a8":
# below keys are only used in MoE W4A8!
self.ffn1_expert_weight_scale_key = moe_ffn1_weight_scale_keys
self.ffn2_expert_weight_scale_key = moe_ffn2_weight_scale_keys
self.ffn1_expert_in_scale_key = moe_ffn1_in_scale_keys
self.ffn2_expert_in_scale_key = moe_ffn2_in_scale_keys
if self.ep_size > 1:
expert_id_offset = expert_id_offset + self.ep_rank * self.num_local_experts
self.compute_method = CutlassFusedMoeMethod()
self.expert_id_offset = expert_id_offset
self.moe_compute_params = MoEComputeParams()
self.moe_compute_params.global_num_experts = self.num_experts
self.moe_compute_params.top_k = top_k
self.moe_compute_params.hidden_size = self.hidden_size
self.moe_compute_params.num_local_experts = self.num_local_experts
self.moe_compute_params.moe_quant_type = self.moe_quant_type
self.moe_compute_params.moe_intermediate_size = self.moe_intermediate_size
self.moe_compute_params.ep_size = self.ep_size
self.moe_compute_params.tp_size = self.tp_size
if fd_config.quant_config:
self.quant_method = fd_config.quant_config.get_quant_method(self)
else:
# now, no quant method(w_fp16 a_fp16) can't get from quant_config, we will optimize it in future
from .fused_moe_cutlass_backend import CutlassMoEMethod
self.quant_method = CutlassMoEMethod(None)
def load_gate_state_dict(self, state_dict):
if self.ep_size > 1:
self.quant_method.init_ep(self)
logger.info(
f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset+self.num_local_experts}), \
{top_k=}, hidden_size={self.hidden_size}, {moe_intermediate_size=}, \
, ep_size={self.ep_size}, \
tp_size={self.tp_size}.")
def load_experts_weight(self, state_dict: dict,
ffn1_expert_weight_key: str,
ffn2_expert_weight_key: str):
"""
load_gate_state_dict function.
Load experts weight from state_dict.
Args:
state_dict (dict): The state_dict of model.
ffn1_expert_weight_key (str): The key of ffn1 expert weight.
ffn2_expert_weight_key (str): The key of ffn2 expert weight.
"""
up_gate_proj_weight = []
up_gate_proj_weight_scale = []
down_proj_weight = []
down_proj_weight_scale = []
for j in range(self.num_experts):
up_gate_proj_weight.append(
get_tensor(
state_dict.pop(self.ffn1_expert_weight_key.format(j))))
down_proj_weight.append(
get_tensor(
state_dict.pop(self.ffn2_expert_weight_key.format(j))))
return up_gate_proj_weight, down_proj_weight
ffn1_weights = []
ffn2_weights = []
is_ffn_merged = ffn1_expert_weight_key.format(
self.expert_id_offset) in state_dict
if is_ffn_merged:
for i in range(self.num_local_experts):
expert_idx = self.expert_id_offset + i
ffn1_weights.append(
get_tensor(
state_dict.pop(
ffn1_expert_weight_key.format(expert_idx))))
ffn2_weights.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_key.format(expert_idx))))
else:
gate_expert_weight_key = ffn1_expert_weight_key.replace(
"up_gate_proj", "gate_proj")
up_expert_weight_key = ffn1_expert_weight_key.replace(
"up_gate_proj", "up_proj")
for j in range(self.num_local_experts):
expert_idx = self.expert_id_offset + j
gate = get_tensor(
state_dict.pop(gate_expert_weight_key.format(expert_idx)))
up = get_tensor(
state_dict.pop(up_expert_weight_key.format(expert_idx)))
ffn1_weights.append(paddle.concat([gate, up], axis=-1))
ffn2_weights.append(
get_tensor(
state_dict.pop(
ffn2_expert_weight_key.format(expert_idx))))
return ffn1_weights, ffn2_weights
def load_state_dict(self, state_dict, is_update: bool = False):
def extract_moe_ffn_weights(self, state_dict: dict):
"""
Extract MoE FFN weights from state dict based on weight key mapping.
Args:
state_dict (dict): Model state dictionary containing the weights.
Returns:
tuple: A tuple containing two lists:
- ffn1_weights: List of tensors for first FFN layer weights
- ffn2_weights: List of tensors for second FFN layer weights
Raises:
AssertionError: If required weight keys are missing or number of weights
doesn't match number of local experts.
"""
ffn1_expert_weight_key = self.weight_key_map.get(
"ffn1_expert_weight_key", None)
ffn2_expert_weight_key = self.weight_key_map.get(
"ffn2_expert_weight_key", None)
assert ffn1_expert_weight_key is not None, "ffn1_expert_weight_key should not be none."
assert ffn2_expert_weight_key is not None, "ffn2_expert_weight_key should not be none."
ffn1_weights, ffn2_weights = self.load_experts_weight(
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
assert len(
ffn1_weights
) == self.num_local_experts, "ffn1_weights length should be equal to num_local_experts."
assert len(
ffn2_weights
) == self.num_local_experts, "ffn2_weights length should be equal to num_local_experts."
return ffn1_weights, ffn2_weights
def extract_gate_correction_bias(self, gate_correction_bias_key,
state_dict):
"""
extract_gate_correction_bias function.
"""
gate_correction_bias_tensor = get_tensor(
state_dict.pop(gate_correction_bias_key)).astype("float32")
return gate_correction_bias_tensor
def load_state_dict(self, state_dict):
"""
load_state_dict function.
"""
# gate
if not is_update:
gate_weight_tensor = get_tensor(state_dict.pop(self.gate_weight_key))
self.gate_weight = self.create_parameter(
shape=gate_weight_tensor.shape,
dtype="float32",
)
self.gate_weight.set_value(gate_weight_tensor)
# gate_correction_bias
self.gate_correction_bias_key = self.weight_key_map.get(
"gate_correction_bias_key", None)
if self.gate_correction_bias_key is not None and self.gate_correction_bias_key in state_dict:
self.moe_use_gate_correction_bias = True
else:
self.moe_use_gate_correction_bias = False
if self.moe_use_gate_correction_bias:
gate_correction_bias_tensor = get_tensor(
state_dict.pop(self.gate_correction_bias_key))
gate_correction_bias_tensor = self.extract_gate_correction_bias(
self.gate_correction_bias_key, state_dict)
self.gate_correction_bias = self.create_parameter(
shape=gate_correction_bias_tensor.shape,
dtype="float32",
)
self.gate_correction_bias.set_value(gate_correction_bias_tensor)
gate_weight_key = self.weight_key_map.get("gate_weight_key", None)
assert gate_weight_key is not None, "gate_weight_key should not be None, please check model checkpoints"
gate_weight_tensor = get_tensor(state_dict.pop(gate_weight_key))
self.gate_weight = self.create_parameter(
shape=gate_weight_tensor.shape,
dtype="float32",
)
self.gate_weight.set_value(gate_weight_tensor.astype("float32"))
if self.fd_config.model_config.is_quantized:
self.quant_method.process_prequanted_weights(self, state_dict)
else:
self.gate_correction_bias = None
self.quant_method.create_weights(self, state_dict)
up_gate_proj_weight, down_proj_weight = self.load_gate_state_dict(
state_dict)
weight1_scale = None
weight2_scale = None
ffn1_in_scale = None
ffn2_in_scale = None
if self.moe_quant_type == "w4a8":
weight1_scale = []
weight2_scale = []
ffn1_in_scale = []
ffn2_in_scale = []
for j in range(self.num_experts):
weight1_scale.append(
get_tensor(
state_dict.pop(
self.ffn1_expert_weight_scale_key.format(
self.layer_idx, j))))
weight2_scale.append(
get_tensor(
state_dict.pop(
self.ffn2_expert_weight_scale_key.format(
self.layer_idx, j))))
ffn1_in_scale.append(
get_tensor(
state_dict.pop(
self.ffn1_expert_in_scale_key.format(
self.layer_idx, j))))
ffn2_in_scale.append(
get_tensor(
state_dict.pop(
self.ffn2_expert_in_scale_key.format(
self.layer_idx, j))))
# other weight is with compute_method
# different method may have different way to create weights
self.compute_method.create_weights(self, self.moe_compute_params,
up_gate_proj_weight,
down_proj_weight, None, None,
weight1_scale, weight2_scale,
ffn1_in_scale, ffn2_in_scale)
def forward(self, x, **kwargs):
def forward(self, x: paddle.Tensor):
"""
Defines the forward computation of the moe layer.
@@ -225,13 +225,9 @@ class FusedMoE(nn.Layer):
x (Tensor): Input tensor to the moe layer.
Returns:
Tensor: Output tensor.
Tensor: Output tensor.s
"""
out = self.compute_method.apply(self, self.moe_compute_params, x)
if self.tp_size > 1:
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
tensor_model_parallel_all_reduce(out)
gate_out = paddle.matmul(x.cast("float32"), self.gate_weight)
out = self.quant_method.apply(self, x, gate_out)
return out
-126
View File
@@ -1,126 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import paddle
import fastdeploy
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.model_executor.layers.moe.moe import MoELayer
class MoeTPDecoerDeepDeepGEMMLayer(MoELayer):
"""
MoeTPDecoerDeepDeepGEMMLayer
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, x, **kwargs):
"""
forward
"""
gate_out = paddle.matmul(x.cast("float32"), self.gate_weight)
if os.getenv("EP_DECODER_PERF_TEST", "False") == "True":
gate_out = paddle.rand(shape=gate_out.shape, dtype=gate_out.dtype)
ffn1_out = paddle.empty(
[
self.num_local_experts,
self.max_batch_size,
self.moe_intermediate_size * 2,
],
dtype=self._dtype,
)
ffn_out = paddle.empty(
[
self.num_local_experts,
self.max_batch_size,
self.embed_dim,
],
dtype=self._dtype,
)
topk_idx, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
gate_out,
(
self.gate_correction_bias
if self.moe_config.moe_use_gate_correction_bias
else None
),
self.top_k,
True, # apply_norm_weight
False,
)
permute_input, token_nums_per_expert, permute_indices_per_token = (
fastdeploy.model_executor.ops.gpu.moe_deepgemm_permute(
x, topk_idx, self.num_local_experts, self.max_batch_size
)
)
expected_m = 128
permute_input_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
permute_input, token_nums_per_expert, 128
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
(permute_input_fp8, scale),
(
self.moe_ffn1_weight,
self.moe_ffn1_weight_scale,
),
ffn1_out,
token_nums_per_expert,
expected_m,
)
act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked(
ffn1_out, token_nums_per_expert
)
act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
act_out, token_nums_per_expert, 128
)
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
(act_out_fp8, scale),
(
self.moe_ffn2_weight,
self.moe_ffn2_weight_scale,
),
ffn_out,
token_nums_per_expert,
expected_m,
)
fused_moe_out = fastdeploy.model_executor.ops.gpu.moe_deepgemm_depermute(
ffn_out, permute_indices_per_token, topk_idx, topk_weights
)[0]
return fused_moe_out
class MoeTPPrefillDeepDeepGEMMLayer(MoELayer):
"""
MoeTPPrefillDeepDeepGEMMLayer
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, x, **kwargs):
"""
forward
"""
raise NotImplementedError("Prefill is comming soon...")
@@ -0,0 +1,198 @@
"""
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import triton
import triton.language as tl
@triton.jit
def fused_moe_kernel_paddle(
    a_ptr,
    b_ptr,
    c_ptr,
    a_scale_ptr,
    b_scale_ptr,
    topk_weights_ptr,
    sorted_token_ids_ptr,
    expert_ids_ptr,
    num_tokens_post_padded_ptr,
    # Matrix dimensions
    N,
    K,
    num_tokens_post_padded,
    num_valid_tokens,
    stride_am,
    stride_ak,
    stride_be,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_asm,
    stride_ask,
    stride_bse,
    stride_bsk,
    stride_bsn,
    # Block size for block-wise fp8 quantization
    group_n: tl.constexpr,
    group_k: tl.constexpr,
    # Meta-parameters
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
    MUL_ROUTED_WEIGHT: tl.constexpr,
    top_k: tl.constexpr,
    compute_type_enum: tl.constexpr,
    use_fp8_w8a8: tl.constexpr,
    use_int8_w8a16: tl.constexpr,
    even_Ks: tl.constexpr,
):
    """
    Fused MoE expert GEMM kernel (one program computes one
    BLOCK_SIZE_M x BLOCK_SIZE_N output tile).

    Key Parameters:
    - A: The input tensor representing tokens with shape (*, K), where '*' can
        be any shape representing batches and K is the feature dimension of
        each token.
    - B: The stacked MOE weight tensor with shape (E, N, K), where E is
        the number of experts, K is the input feature dimension, and N is
        the output feature dimension.
    - C: The output cache tensor with shape (M, topk, N), where M is the
        total number of tokens post padding, topk is the number of times
        each token is repeated, and N is the output feature dimension.
    - sorted_token_ids: A tensor containing the sorted indices of tokens,
        repeated topk times and arranged by the expert index they are
        assigned to.
    - expert_ids: A tensor containing the indices of the expert for each
        block. It determines which expert matrix from B should be used for
        each block in A.

    Quantization modes (mutually exclusive constexpr flags):
    - use_int8_w8a16: int8 weights with a per-(expert, N-column) scale.
    - use_fp8_w8a8: fp8 activations and weights; group-wise scales when
        group_k > 0 and group_n > 0, otherwise one scale per expert.

    This kernel performs the multiplication of a token by its corresponding
    expert matrix as determined by `expert_ids`. The sorting of
    `sorted_token_ids` by expert index and padding ensures divisibility by
    BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
    multiplication across different blocks processed by the same expert.
    """
    # Map the 1-D program id onto a (pid_m, pid_n) tile. Tiles are grouped
    # GROUP_SIZE_M-deep along M so consecutive programs reuse the same B
    # columns (better L2 behavior).
    pid = tl.program_id(axis=0)
    # NOTE: this uses the scalar `num_tokens_post_padded` argument (the
    # host-side upper bound); the actual runtime count is loaded from
    # `num_tokens_post_padded_ptr` below for the early-exit check.
    num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m
    # Only bfloat16 output is supported (enum value 1).
    assert compute_type_enum == 1
    compute_type = tl.bfloat16
    # Rebind to the runtime value and skip tiles past the padded token count.
    num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
    if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
        return
    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
    # Padding entries have token ids >= num_valid_tokens; they are masked
    # out of every load/store below.
    token_mask = offs_token < num_valid_tokens
    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # A is indexed by the original token row (`offs_token // top_k` undoes
    # the topk repetition).
    a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
                      offs_k[None, :] * stride_ak)
    # One expert id per M-tile selects the weight matrix within B.
    off_experts = tl.load(expert_ids_ptr + pid_m)
    b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
                                                offs_bn[None, :] * stride_bn)
    if use_int8_w8a16:
        # Per-(expert, output-column) weight scale, loaded once.
        b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[
            None, :] * stride_bsn
        b_scale = tl.load(b_scale_ptrs)
    if use_fp8_w8a8:
        if group_k > 0 and group_n > 0:
            # Group-wise scales: per-token A scales along K groups, and
            # per-(K-group, N-group) B scales; loaded inside the K loop.
            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
            offs_bsn = offs_bn // group_n
            b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
        else:
            # (Zkk): every expert has one activation scale and weight scale.
            a_scale = tl.load(a_scale_ptr + off_experts)
            b_scale = tl.load(b_scale_ptr + off_experts)
    # Accumulate in fp32 over K in BLOCK_SIZE_K steps.
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
        if even_Ks:
            # K divisible by BLOCK_SIZE_K: no K-boundary mask needed.
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None],
                other=0.0,
            )
            # Streaming-style B load: ".cv"/evict_first hint that weights
            # are not reused by this program after the tile is consumed.
            b = tl.load(b_ptrs,
                        cache_modifier=".cv",
                        eviction_policy='evict_first')
        else:
            a = tl.load(
                a_ptrs,
                mask=token_mask[:, None] &
                (offs_k[None, :] < K - k * BLOCK_SIZE_K),
                other=0.0,
            )
            b = tl.load(b_ptrs,
                        mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
                        other=0.0)
        # We accumulate along the K dimension.
        if use_int8_w8a16:
            accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
        elif use_fp8_w8a8:
            if group_k > 0 and group_n > 0:
                # Dequantize this K-group's partial product immediately.
                k_start = k * BLOCK_SIZE_K
                offs_ks = k_start // group_k
                a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
                                  mask=token_mask,
                                  other=0.0)
                b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
                accumulator += tl.dot(a, b) * a_scale[:,
                                                      None] * b_scale[None, :]
            else:
                # Per-expert scales are applied once after the loop.
                accumulator = tl.dot(a, b, acc=accumulator)
        else:
            accumulator += tl.dot(a, b)
        a_ptrs += BLOCK_SIZE_K * stride_ak
        b_ptrs += BLOCK_SIZE_K * stride_bk
    if MUL_ROUTED_WEIGHT:
        # Scale each row by its routing (top-k gate) weight.
        moe_weight = tl.load(topk_weights_ptr + offs_token,
                             mask=token_mask,
                             other=0)
        accumulator = accumulator * moe_weight[:, None]
    # Final dequantization / cast to the output dtype.
    if use_int8_w8a16:
        accumulator = (accumulator * b_scale).to(compute_type)
    elif use_fp8_w8a8:
        if group_k > 0 and group_n > 0:
            # Already dequantized group-by-group inside the loop.
            accumulator = accumulator.to(compute_type)
        else:
            accumulator = (accumulator * a_scale * b_scale).to(compute_type)
    else:
        accumulator = accumulator.to(compute_type)
    # Write back the block of the output
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
        None, :]
    c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
    tl.store(c_ptrs, accumulator, mask=c_mask)