mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-24 09:44:10 +08:00
Sync v2.0 version of code to github repo
This commit is contained in:
@@ -11,3 +11,13 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .fused_moe_cutlass_backend import (CutlassW4A8MoEMethod,
                                        CutlassWeightOnlyMoEMethod)
from .fused_moe_triton_backend import TritonWeightOnlyMoEMethod
from .moe import FusedMoE

# NOTE: __all__ must contain *names* (strings), not the objects themselves.
# Listing the classes/objects directly breaks tools that treat __all__ as a
# list of identifiers (e.g. `from package import *`, linters, doc builders).
__all__ = [
    "CutlassWeightOnlyMoEMethod",
    "CutlassW4A8MoEMethod",
    "FusedMoE",
    "TritonWeightOnlyMoEMethod",
]
|
||||
|
||||
@@ -1,222 +0,0 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from paddle.distributed import fleet
|
||||
from paddle.framework import in_dynamic_or_pir_mode
|
||||
from paddle.nn.quant import weight_quantize
|
||||
|
||||
from fastdeploy.model_executor.ops.gpu import (moe_expert_dispatch,
|
||||
moe_expert_ffn,
|
||||
moe_expert_reduce)
|
||||
|
||||
from .fused_moe_method_base import FusedMoEMethodBase
|
||||
|
||||
|
||||
class CutlassFusedMoeMethod(FusedMoEMethodBase):
    """
    Use Cutlass Group Gemm to compute Fused MoE.
    This method is the oldest way to compute MoE in Paddle.
    """

    def create_weights(
            self,
            layer: nn.Layer,
            moe_compute_params,
            ffn1_tensor,
            ffn2_tensor,
            ffn1_bias=None,
            ffn2_bias=None,
            # belows only used in w4a8.
            moe_ffn1_weight_scale=None,
            moe_ffn2_weight_scale=None,
            moe_ffn1_in_scale=None,
            moe_ffn2_in_scale=None):
        """
        Paddle cutlass create weight process.

        Quantizes the per-expert FFN weights according to
        ``moe_compute_params.moe_quant_type`` and registers the quantized
        weights (and, for w4a8, the activation/weight scales) as parameters
        on ``layer``.

        Args:
            layer (nn.Layer): Layer that receives the created parameters.
            moe_compute_params: Provides ``num_local_experts``,
                ``hidden_size``, ``moe_intermediate_size`` and
                ``moe_quant_type``.
            ffn1_tensor / ffn2_tensor: Lists (one entry per local expert) of
                unquantized FFN1/FFN2 weight tensors.
            ffn1_bias / ffn2_bias: Unused here; kept for interface parity.
            moe_ffn1_weight_scale .. moe_ffn2_in_scale: w4a8-only scale
                tensors (lists, one per local expert).
        """
        num_local_experts = moe_compute_params.num_local_experts
        moe_quant_type = moe_compute_params.moe_quant_type

        # Sanity-check the expected per-expert shapes; FFN1 fuses two
        # projections, hence the factor of 2 on the intermediate size.
        assert len(ffn1_tensor) == num_local_experts
        assert len(ffn2_tensor) == num_local_experts
        assert ffn1_tensor[0].shape == [
            moe_compute_params.hidden_size,
            moe_compute_params.moe_intermediate_size * 2
        ]
        assert ffn2_tensor[0].shape == [
            moe_compute_params.moe_intermediate_size,
            moe_compute_params.hidden_size
        ]

        added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        added_scale_attrs = ["moe_ffn1_weight_scale", "moe_ffn2_weight_scale"]

        if moe_quant_type == "w4a8":
            # Input scales are stored as reciprocals so the kernel can
            # multiply instead of divide.
            moe_ffn1_in_scale = paddle.concat(moe_ffn1_in_scale)
            moe_ffn2_in_scale = paddle.concat(moe_ffn2_in_scale)
            moe_ffn1_in_scale = 1 / moe_ffn1_in_scale
            moe_ffn2_in_scale = 1 / moe_ffn2_in_scale
            moe_ffn1_weight_scale = paddle.stack(moe_ffn1_weight_scale, axis=0)
            moe_ffn2_weight_scale = paddle.stack(moe_ffn2_weight_scale, axis=0)

            # 127 is the int8 max; the extra factor 112 matches the w4a8
            # kernel's fixed rescale -- TODO confirm its exact origin.
            moe_ffn1_weight_scale = moe_ffn1_weight_scale / (127 * 112)
            moe_ffn2_weight_scale = moe_ffn2_weight_scale / (127 * 112)
            moe_ffn1_weight_scale = moe_ffn1_weight_scale / moe_ffn1_in_scale[:,
                                                                              None]
            moe_ffn2_weight_scale = moe_ffn2_weight_scale / moe_ffn2_in_scale[:,
                                                                              None]
            moe_ffn1_weight_scale = moe_ffn1_weight_scale.cast(
                paddle.get_default_dtype())
            moe_ffn2_weight_scale = moe_ffn2_weight_scale.cast(
                paddle.get_default_dtype())

        if moe_quant_type in ["weight_only_int4", "weight_only_int8", "w4a8"]:

            for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
                weight_name = added_weight_attrs[idx]
                scale_name = added_scale_attrs[idx]

                weight_list = []
                weight_scale_list = []
                for i in range(num_local_experts):
                    quant_weight, scale = weight_quantize(weight_tensor[i],
                                                          algo=moe_quant_type,
                                                          arch=80)
                    weight_list.append(quant_weight)
                    if moe_quant_type != "w4a8":
                        # scale holds no memory in w4a8, don't touch it!
                        weight_scale_list.append(scale)
                quanted_weight = paddle.stack(weight_list, axis=0)
                setattr(
                    layer, weight_name,
                    layer.create_parameter(
                        shape=quanted_weight.shape,
                        dtype=quanted_weight.dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ))
                getattr(layer, weight_name).set_value(quanted_weight)

                # this scale only useful for wint8/4.
                if moe_quant_type != "w4a8":
                    quanted_weight_scale = paddle.stack(weight_scale_list,
                                                        axis=0)
                    setattr(
                        layer, scale_name,
                        layer.create_parameter(
                            shape=quanted_weight_scale.shape,
                            dtype=quanted_weight_scale.dtype,
                        ))
                    getattr(layer, scale_name).set_value(quanted_weight_scale)

        if moe_quant_type == "w4a8":
            assert moe_ffn1_weight_scale is not None
            assert moe_ffn2_weight_scale is not None
            assert moe_ffn1_in_scale is not None
            assert moe_ffn2_in_scale is not None
            added_w4a8_attrs = [
                "moe_ffn1_weight_scale", "moe_ffn2_weight_scale",
                "moe_ffn1_in_scale", "moe_ffn2_in_scale"
            ]
            for idx, weight_tensor in enumerate([
                    moe_ffn1_weight_scale, moe_ffn2_weight_scale,
                    moe_ffn1_in_scale, moe_ffn2_in_scale
            ]):
                name = added_w4a8_attrs[idx]
                setattr(
                    layer, name,
                    layer.create_parameter(
                        shape=weight_tensor.shape,
                        dtype=weight_tensor.dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ))
                getattr(layer, name).set_value(weight_tensor)

    def apply(
        self,
        layer: nn.Layer,
        moe_compute_params,
        x: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Paddle Cutlass compute Fused MoE.

        Pipeline: gate matmul -> dispatch tokens to experts -> grouped FFN ->
        reduce expert outputs back to token order.
        """
        # Gating is computed in fp32 regardless of the model dtype.
        gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)

        (
            permute_input,
            token_nums_per_expert,
            permute_indices_per_token,
            topk_weights,
            topk_idx,
            expert_idx_per_token,
        ) = moe_expert_dispatch(
            x,
            gate_out,
            layer.gate_correction_bias,
            (layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
             else None),  # if set, permute_input will be int8_t
            moe_compute_params.top_k,
            False,
            topk_only_mode=False,
        )

        if moe_compute_params.moe_quant_type != "w4a8":
            # only w4a8 need expert_idx_per_token
            # Other need not this tensor, so we make it None.
            expert_idx_per_token = None
        else:
            expert_idx_per_token = expert_idx_per_token.cast("int64")

        ffn_out = moe_expert_ffn(
            permute_input,
            token_nums_per_expert,
            layer.moe_ffn1_weight,
            layer.moe_ffn2_weight,
            None,
            (layer.moe_ffn1_weight_scale
             if hasattr(layer, "moe_ffn1_weight_scale") else None),
            (layer.moe_ffn2_weight_scale
             if hasattr(layer, "moe_ffn2_weight_scale") else None),
            (layer.moe_ffn2_in_scale
             if hasattr(layer, "moe_ffn2_in_scale") else None),
            expert_idx_per_token,
            moe_compute_params.moe_quant_type,
            False,  # used_in_ep_low_latency
        )

        # NOTE: removed an unreachable `if False:` tensor-parallel all-reduce
        # block here. Besides being dead code, its else-branch referenced
        # `mp_group` before assignment and would have raised if ever enabled.

        # The reduce op also applies the top-k weight normalization and the
        # routed_scaling_factor.
        fused_moe_out = moe_expert_reduce(
            ffn_out,
            topk_weights,
            permute_indices_per_token,
            topk_idx,
            None,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )
        return fused_moe_out
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,135 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from abc import abstractmethod
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
from fastdeploy.config import MoEPhase
|
||||
|
||||
from ..quantization.quant_base import QuantMethodBase
|
||||
|
||||
|
||||
class MoEMethodBase(QuantMethodBase):
    """
    Common base for fused-MoE compute backends.

    Concrete backends implement weight creation plus the tensor-parallel and
    expert-parallel (prefill / decode) forward paths; :meth:`apply`
    dispatches to the right one based on the layer's parallel configuration.
    """

    def __init__(self, quant_config):
        super().__init__()
        if quant_config is None:
            # No quantization configured: plain 16-bit compute.
            self.moe_quant_type = "w16a16"
        else:
            self.quant_config = quant_config
        # Attribute names under which backends register expert weights and
        # their scales on the layer.
        self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        self.added_scale_attrs = [
            "moe_ffn1_weight_scale",
            "moe_ffn2_weight_scale",
        ]
        # How many logical values are packed per stored element (e.g. 2 for
        # int4); used by shape checks.
        self.pack_num = 1

    def init_ep(self, layer: nn.Layer) -> None:
        """
        Init EP related module.

        Creates the prefill or decoder runner depending on the configured
        MoE phase; a no-op when expert parallelism is disabled.
        """
        if layer.ep_size <= 1:
            return
        if layer.fd_config.parallel_config.moe_phase == MoEPhase.DECODER:
            from .ep import EPDecoderRunner
            self.ep_decoder_runner = EPDecoderRunner(
                layer.top_k, layer.hidden_size, layer.num_experts,
                layer.moe_config.num_max_dispatch_tokens_per_rank,
                layer.ep_size, layer.ep_rank)
        else:
            from .ep import EPPrefillRunner
            self.ep_prefill_runner = EPPrefillRunner(layer.top_k,
                                                     layer.hidden_size,
                                                     layer.num_experts,
                                                     layer.ep_size,
                                                     layer.ep_rank)

    def process_loaded_weights(self, layer, weights) -> None:
        """
        Hook called after weights are loaded; default implementation does
        nothing.
        """
        pass

    def check(self, layer: nn.Layer, ffn1_weights, ffn2_weights):
        """
        Check that the first expert's FFN weights have the shapes this
        backend expects (FFN1 fuses two projections, hence the * 2;
        pack_num accounts for sub-byte packing).
        """
        expected_ffn1 = [
            layer.hidden_size // self.pack_num, layer.moe_intermediate_size * 2
        ]
        expected_ffn2 = [
            layer.moe_intermediate_size // self.pack_num, layer.hidden_size
        ]
        assert ffn1_weights[0].shape == expected_ffn1
        assert ffn2_weights[0].shape == expected_ffn2

    @abstractmethod
    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Create and register this backend's parameters on ``layer`` from the
        checkpoint ``state_dict``.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Expert-parallel forward pass for the prefill phase.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Expert-parallel forward pass for the decode phase.
        """
        raise NotImplementedError

    @abstractmethod
    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Tensor-parallel (non-EP) forward pass.
        """
        raise NotImplementedError

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Dispatch to the TP path, or to the EP prefill/decode path according
        to the layer's parallel configuration.
        """
        if layer.ep_size <= 1:
            return self.apply_tp(layer, x, gate_out)
        if layer.fd_config.parallel_config.moe_phase == MoEPhase.PREFILL:
            return self.apply_ep_prefill(layer, x, gate_out)
        return self.apply_ep_decode(layer, x, gate_out)
|
||||
@@ -0,0 +1,431 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from paddle.nn.quant import weight_quantize
|
||||
from paddleformers.utils.log import logger
|
||||
|
||||
import fastdeploy
|
||||
from fastdeploy.distributed.communication_op import \
|
||||
tensor_model_parallel_all_reduce
|
||||
from ..utils import get_tensor, create_and_set_parameter
|
||||
from .fused_moe_backend_base import MoEMethodBase
|
||||
|
||||
from fastdeploy.platforms import current_platform
|
||||
if current_platform.is_cuda():
|
||||
from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch
|
||||
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
|
||||
|
||||
|
||||
class CutlassMoEMethod(MoEMethodBase):
    """
    Use Cutlass Group Gemm to compute Fused MoE.
    This method is the oldest way to compute MoE in Paddle.
    """

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Paddle cutlass create weight process (unquantized bf16/fp16 path).

        Stacks the per-expert FFN weights and registers them on ``layer``.
        """
        # bf16
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        stacked_ffn1_weights = paddle.stack(ffn1_weights, axis=0)
        stacked_ffn2_weights = paddle.stack(ffn2_weights, axis=0)
        for idx, weight_tensor in enumerate(
                [stacked_ffn1_weights, stacked_ffn2_weights]):
            weight_name = self.added_weight_attrs[idx]
            # Consistent with the other backends in this file: use the shared
            # helper instead of hand-rolled create_parameter/set_value calls.
            create_and_set_parameter(layer, weight_name, weight_tensor)

    def compute_ffn(
        self,
        layer: nn.Layer,
        permute_input: paddle.Tensor,
        token_nums_per_expert: paddle.Tensor,
        expert_idx_per_token: paddle.Tensor,
        used_in_ep_low_latency: bool = False,
    ):
        """
        Run the grouped Cutlass FFN over the dispatched (permuted) tokens.

        Scale tensors are fetched from ``layer`` only when present, so the
        same call site serves the bf16, weight-only and w4a8 paths.
        """
        return fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
            permute_input,
            token_nums_per_expert,
            layer.moe_ffn1_weight,
            layer.moe_ffn2_weight,
            None,
            (layer.moe_ffn1_weight_scale
             if hasattr(layer, "moe_ffn1_weight_scale") else None),
            (layer.moe_ffn2_weight_scale
             if hasattr(layer, "moe_ffn2_weight_scale") else None),
            (layer.moe_ffn2_in_scale
             if hasattr(layer, "moe_ffn2_in_scale") else None),
            expert_idx_per_token,
            self.moe_quant_type,
            used_in_ep_low_latency,
        )

    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP prefill method.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_prefill_runner.moe_select(
            layer, gate_out)
        # 2. EP Dispatch
        (
            recv_x,
            recv_topk_idx,
            recv_topk_weights,
            recv_num_tokens_per_expert_list,
            handle,
            _,
        ) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights)
        token_all_num = sum(recv_num_tokens_per_expert_list)

        # 3. Compute ffn
        if token_all_num > 0:
            logger.info(f"token_all_num {token_all_num}")
            (
                permute_input,
                permute_indices_per_token,
                recv_num_tokens_per_expert_list_cumsum,
                dst_weights,
                dst_indices,
                cumsum_idx_gpu,
                expert_idx_per_token,
            ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch(
                recv_x,
                recv_topk_idx,
                recv_topk_weights,
                # BUGFIX: the in-scale is registered on `layer` (see
                # CutlassW4A8MoEMethod.create_w4a8_scale_weights), not on
                # `self`; the old `hasattr(self, ...)` check was always
                # False, so the w4a8 prefill path never passed its input
                # scale to the dispatch kernel.
                (layer.moe_ffn1_in_scale
                 if hasattr(layer, "moe_ffn1_in_scale") else None),
                recv_num_tokens_per_expert_list,
                token_all_num,
                self.moe_quant_type,
            )
            if self.moe_quant_type != "w4a8":
                # only w4a8 need expert_idx_per_token
                # Other need not this tensor, so we make it None.
                expert_idx_per_token = None
            else:
                expert_idx_per_token = expert_idx_per_token.cast("int64")

            ffn_out = self.compute_ffn(layer, permute_input,
                                       recv_num_tokens_per_expert_list_cumsum,
                                       expert_idx_per_token)

            # prmt back per rank
            tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
                ffn_out,
                dst_weights,
                permute_indices_per_token,
                dst_indices,
                None,  # moe_ffn2_bias,
                False,  # norm_topk_prob
                1.0,
            )[0]
        else:
            # No tokens landed on this rank; forward the (empty) input.
            tmp_ffn_out = recv_x

        # 4. EP combine
        return self.ep_prefill_runner.combine(tmp_ffn_out, handle,
                                              recv_topk_weights)

    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP decoder method.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_decoder_runner.moe_select(
            layer, gate_out)
        # 2. EP Dispatch
        permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
            x, topk_idx, topk_weights)
        # 3. Compute ffn
        if self.moe_quant_type == "w4a8":
            # permute_input is [num_local_experts, max_num, dim] here, so
            # each slot's expert index is simply its row index.
            num_local_experts, max_num, _ = permute_input.shape
            expert_idx_per_token = paddle.arange(
                num_local_experts)[:, None].tile([1, max_num])
        elif self.moe_quant_type in ["weight_only_int8", "weight_only_int4"]:
            expert_idx_per_token = None
        else:
            raise NotImplementedError

        ffn_out = self.compute_ffn(layer, permute_input,
                                   token_nums_per_expert.cast("int64"),
                                   expert_idx_per_token, True)

        # 4. EP combine
        return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights,
                                              handle)

    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Paddle Cutlass compute Fused MoE (tensor-parallel path).
        """
        (
            permute_input,
            token_nums_per_expert,
            permute_indices_per_token,
            topk_weights,
            topk_idx,
            expert_idx_per_token,
        ) = moe_expert_dispatch(
            x,
            gate_out,
            layer.gate_correction_bias,
            (layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
             else None),  # if set, permute_input will be int8_t
            layer.top_k,
            False,
            topk_only_mode=False,
        )

        if self.moe_quant_type != "w4a8":
            # only w4a8 need expert_idx_per_token
            # Other need not this tensor, so we make it None.
            expert_idx_per_token = None
        else:
            expert_idx_per_token = expert_idx_per_token.cast("int64")

        ffn_out = self.compute_ffn(layer, permute_input, token_nums_per_expert,
                                   expert_idx_per_token)

        # The reduce op also applies the top-k weight normalization and the
        # routed_scaling_factor.
        fused_moe_out = moe_expert_reduce(
            ffn_out,
            topk_weights,
            permute_indices_per_token,
            topk_idx,
            None,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )

        if layer.tp_size > 1:
            tensor_model_parallel_all_reduce(fused_moe_out)

        return fused_moe_out
|
||||
|
||||
|
||||
class CutlassW4A8MoEMethod(CutlassMoEMethod):
    """
    w4a8 MoE Method (int4 weights, int8 activations) on the Cutlass backend.
    """

    def __init__(self, quant_config):
        super().__init__(quant_config)
        self.quant_config = quant_config
        self.moe_quant_type = "w4a8"
        # int4 packs two logical values per stored element.
        self.pack_num = 2

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Quantize the per-expert FFN weights to w4a8 and register them, then
        load and attach the corresponding scales.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        self.check(layer, ffn1_weights, ffn2_weights)
        for attr_name, expert_weights in zip(self.added_weight_attrs,
                                             [ffn1_weights, ffn2_weights]):
            quantized = []
            for expert_weight in expert_weights:
                packed, _ = weight_quantize(expert_weight,
                                            algo=self.moe_quant_type,
                                            arch=80)
                quantized.append(packed)
            create_and_set_parameter(layer, attr_name,
                                     paddle.stack(quantized, axis=0))

        self.create_w4a8_scale_weights(layer, layer.weight_key_map, state_dict)

    def create_w4a8_scale_weights(self, layer: nn.Layer, weight_key_map: dict,
                                  state_dict: dict):
        """
        Get w4a8 weights from state dict and process them.

        Args:
            layer (nn.Layer): The layer to add parameters to.
            weight_key_map (dict): The weight key map.
            state_dict (dict): The state dict.
        """

        def _pop_scale(key_template, expert_idx):
            # Pop so the tensor is not consumed twice from the state dict.
            return get_tensor(state_dict.pop(key_template.format(expert_idx)))

        def _set_in_scale(name: str, in_scales: list[paddle.Tensor]):
            # Stored as reciprocals so the kernel multiplies.
            processed = 1 / paddle.concat(in_scales)
            create_and_set_parameter(layer, name, processed)
            return processed

        def _set_weight_scale(name: str, weight_scales: list[paddle.Tensor],
                              processed_in_scale: paddle.Tensor):
            processed = (paddle.stack(weight_scales, axis=0) / (127 * 112) /
                         processed_in_scale[:, None]).cast(
                             paddle.get_default_dtype())
            create_and_set_parameter(layer, name, processed)

        # 1. Map parameter names to their checkpoint key templates and
        # validate that all of them are present.
        scale_key_map = {
            "moe_ffn1_weight_scale":
            weight_key_map.get("ffn1_expert_weight_scale_key", None),
            "moe_ffn2_weight_scale":
            weight_key_map.get("ffn2_expert_weight_scale_key", None),
            "moe_ffn1_in_scale":
            weight_key_map.get("ffn1_expert_in_scale_key", None),
            "moe_ffn2_in_scale":
            weight_key_map.get("ffn2_expert_in_scale_key", None),
        }
        for name, key_template in scale_key_map.items():
            if key_template is None:
                raise ValueError(
                    f"scale {name} should not be none in w4a8 mode.")

        # 2. Gather the per-expert scale tensors from the state dict.
        gathered = {name: [] for name in scale_key_map}
        for local_expert_idx in range(layer.num_local_experts):
            # NOTE(review): the sibling loaders compute the global expert
            # index as `layer.expert_id_offset + i`; the multiplication here
            # looks inconsistent — confirm which convention
            # `expert_id_offset` actually uses.
            expert_idx = local_expert_idx + layer.expert_id_offset * layer.num_local_experts
            for name, key_template in scale_key_map.items():
                gathered[name].append(_pop_scale(key_template, expert_idx))

        # 3. Process and attach: in-scales first, since the weight scales
        # are normalized by them.
        in_scales = [
            _set_in_scale(name, gathered[name])
            for name in ("moe_ffn1_in_scale", "moe_ffn2_in_scale")
        ]
        for i, name in enumerate(
            ("moe_ffn1_weight_scale", "moe_ffn2_weight_scale")):
            _set_weight_scale(name, gathered[name], in_scales[i])
|
||||
|
||||
|
||||
class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
    """
    Weight-only quantized MoE (wint4 / wint8) on the Cutlass backend.
    """

    def __init__(self, quant_config):
        super().__init__(quant_config)
        self.quant_config = quant_config
        # The concrete algo ("weight_only_int4" / "weight_only_int8") comes
        # from the quantization config.
        self.moe_quant_type = self.quant_config.algo
        self.pack_num = 1

    def process_prequanted_weights(self, layer: nn.Layer, state_dict):
        """
        Load already-quantized expert weights plus their scales from the
        checkpoint and register them on ``layer``.
        """
        key_map = layer.weight_key_map
        ffn1_expert_weight_key = key_map.get("ffn1_expert_weight_key", None)
        ffn2_expert_weight_key = key_map.get("ffn2_expert_weight_key", None)
        ffn1_expert_weight_scale_key = key_map.get(
            "ffn1_expert_weight_scale_key", None)
        ffn2_expert_weight_scale_key = key_map.get(
            "ffn2_expert_weight_scale_key", None)

        ffn1_weights, ffn2_weights = layer.load_experts_weight(
            state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
        # self.check(layer, ffn1_weights, ffn2_weights)
        ffn1_scales = []
        ffn2_scales = []
        for i in range(layer.num_local_experts):
            expert_idx = layer.expert_id_offset + i
            ffn1_scales.append(
                get_tensor(
                    state_dict.pop(
                        ffn1_expert_weight_scale_key.format(expert_idx))))
            ffn2_scales.append(
                get_tensor(
                    state_dict.pop(
                        ffn2_expert_weight_scale_key.format(expert_idx))))

        name_tensor_map = {
            "moe_ffn1_weight": paddle.stack(ffn1_weights, axis=0),
            "moe_ffn2_weight": paddle.stack(ffn2_weights, axis=0),
            "moe_ffn1_weight_scale": paddle.stack(ffn1_scales, axis=0),
            "moe_ffn2_weight_scale": paddle.stack(ffn2_scales, axis=0),
        }
        for name, tensor in name_tensor_map.items():
            create_and_set_parameter(layer, name, tensor)

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Quantize bf16 expert weights to the configured weight-only format
        and register both the packed weights and their scales on ``layer``.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        self.check(layer, ffn1_weights, ffn2_weights)

        for attr_idx, expert_weights in enumerate(
            [ffn1_weights, ffn2_weights]):
            quantized = []
            scales = []
            for expert_weight in expert_weights:
                packed, scale = weight_quantize(expert_weight,
                                                algo=self.moe_quant_type)
                quantized.append(packed)
                scales.append(scale)
            create_and_set_parameter(layer, self.added_weight_attrs[attr_idx],
                                     paddle.stack(quantized, axis=0))
            create_and_set_parameter(layer, self.added_scale_attrs[attr_idx],
                                     paddle.stack(scales, axis=0))
|
||||
@@ -0,0 +1,380 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from paddleformers.utils.log import logger
|
||||
|
||||
import fastdeploy
|
||||
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
|
||||
from fastdeploy.distributed.communication_op import \
|
||||
tensor_model_parallel_all_reduce
|
||||
from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
|
||||
from fastdeploy.model_executor.layers.utils import get_tensor
|
||||
|
||||
from ..utils import create_and_set_parameter
|
||||
from .fused_moe_backend_base import MoEMethodBase
|
||||
|
||||
|
||||
class DeepGemmFusedMoeMethod(MoEMethodBase):
|
||||
"""
|
||||
DeepGemmFusedMoeMethod is a class that implements the MoEMethodBase interface for DeepGemm backend.
|
||||
"""
|
||||
|
||||
def create_weights(self, layer: nn.Layer, state_dict):
|
||||
"""
|
||||
deepgemm create weight process.
|
||||
"""
|
||||
|
||||
ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
|
||||
|
||||
self.check(layer, ffn1_weights, ffn2_weights)
|
||||
|
||||
for idx, weight_tensor in enumerate([ffn1_weights, ffn2_weights]):
|
||||
weight_name = self.added_weight_attrs[idx]
|
||||
scale_name = self.added_scale_attrs[idx]
|
||||
|
||||
weight_list = []
|
||||
weight_scale_list = []
|
||||
for i in range(layer.num_local_experts):
|
||||
from fastdeploy.model_executor.layers.utils import \
|
||||
per_block_cast_to_fp8
|
||||
quant_weight, scale = per_block_cast_to_fp8(
|
||||
weight_tensor[i], self.quant_config.weight_block_size)
|
||||
|
||||
weight_list.append(quant_weight)
|
||||
weight_scale_list.append(scale)
|
||||
quanted_weight = paddle.stack(weight_list, axis=0)
|
||||
quanted_weight = quanted_weight.transpose([0, 2, 1]).contiguous()
|
||||
create_and_set_parameter(layer, weight_name, quanted_weight)
|
||||
|
||||
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
|
||||
quanted_weight_scale = quanted_weight_scale.transpose(
|
||||
[0, 2, 1]).contiguous()
|
||||
create_and_set_parameter(layer, scale_name, quanted_weight_scale)
|
||||
|
||||
def process_prequanted_weights(self, layer: nn.Layer, state_dict):
|
||||
"""
|
||||
Paddle cutlass process prequanted weights.
|
||||
"""
|
||||
ffn1_expert_weight_key = layer.weight_key_map.get(
|
||||
"ffn1_expert_weight_key", None)
|
||||
ffn2_expert_weight_key = layer.weight_key_map.get(
|
||||
"ffn2_expert_weight_key", None)
|
||||
ffn1_expert_weight_scale_key = layer.weight_key_map.get(
|
||||
"ffn1_expert_weight_scale_key", None)
|
||||
ffn2_expert_weight_scale_key = layer.weight_key_map.get(
|
||||
"ffn2_expert_weight_scale_key", None)
|
||||
|
||||
ffn1_weights, ffn2_weights = layer.load_experts_weight(
|
||||
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
|
||||
# self.check(layer, ffn1_weights, ffn2_weights)
|
||||
ffn1_weight_scale = []
|
||||
ffn2_weight_scale = []
|
||||
for i in range(layer.num_local_experts):
|
||||
expert_idx = layer.expert_id_offset + i
|
||||
ffn1_weight_scale.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn1_expert_weight_scale_key.format(expert_idx))))
|
||||
ffn2_weight_scale.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn2_expert_weight_scale_key.format(expert_idx))))
|
||||
|
||||
ffn1_weight = paddle.stack(ffn1_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn")
|
||||
ffn2_weight = paddle.stack(ffn2_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn")
|
||||
ffn1_weight_scale = paddle.stack(ffn1_weight_scale, axis=0).transpose([0, 2, 1]).contiguous()
|
||||
ffn2_weight_scale = paddle.stack(ffn2_weight_scale, axis=0).transpose([0, 2, 1]).contiguous()
|
||||
|
||||
name_tensor_map = {
|
||||
"moe_ffn1_weight": ffn1_weight,
|
||||
"moe_ffn2_weight": ffn2_weight,
|
||||
"moe_ffn1_weight_scale": ffn1_weight_scale,
|
||||
"moe_ffn2_weight_scale": ffn2_weight_scale
|
||||
}
|
||||
for name, tensor in name_tensor_map.items():
|
||||
create_and_set_parameter(layer, name, tensor)
|
||||
|
||||
    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP prefill method.

        Pipeline: top-k routing -> per-token FP8 quant -> EP dispatch ->
        grouped FP8 GEMM (ffn1) -> swiglu -> grouped FP8 GEMM (ffn2) ->
        local combine -> EP combine across ranks.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_prefill_runner.moe_select(
            layer, gate_out)
        # 2. Dynamic compute blockwise quantization scales
        x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
            x, self.quant_config.weight_block_size[0])
        # 3. EP Dispatch
        (
            recv_x,
            recv_topk_idx,
            recv_topk_weights,
            recv_num_tokens_per_expert_list,
            handle,
            _,
        ) = self.ep_prefill_runner.dispatch(x,
                                            topk_idx,
                                            topk_weights,
                                            x_scale_tensor=x_scale_tensor)

        token_all_num = sum(recv_num_tokens_per_expert_list)

        # 4. Compute ffn
        if token_all_num > 0:
            # NOTE(review): info-level log on every forward step — possibly
            # noisy in production; consider debug level.
            logger.info(f"token_all_num {token_all_num}")
            (recv_x, recv_x_scale) = recv_x
            tmp = count_tokens_per_expert_func(recv_topk_idx,
                                               layer.num_local_experts)
            (
                permute_input,
                permute_scale,
                permute_indices_per_token,
                recv_num_tokens_per_expert_list_cumsum,
                recv_num_tokens_per_expert_list_padded_cumsum,
                dst_weights,
                dst_indices,
                cumsum_idx_gpu,
                m_indices,
            ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
                recv_x,
                recv_x_scale,
                recv_topk_idx,
                recv_topk_weights,
                tmp[0],
                tmp[1]
            )

            # transpose+contiguous then transpose back: makes the scale tensor
            # column-major in memory while keeping its logical shape —
            # presumably the layout deep_gemm expects; TODO confirm.
            permute_scale = permute_scale.transpose([1, 0]).contiguous()
            permute_scale = permute_scale.transpose([1, 0])

            # ffn1
            ffn_out = paddle.empty(
                (permute_input.shape[0], layer.moe_ffn1_weight.shape[1]),
                dtype=paddle.bfloat16,
            )
            deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
                (permute_input, permute_scale),
                (layer.moe_ffn1_weight, layer.moe_ffn1_weight_scale),
                ffn_out,
                m_indices,
            )
            # swiglu
            ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out, None)

            # ffn2: re-quantize activations to FP8 with the same block size.
            ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
                ffn_out, self.quant_config.weight_block_size[0])
            ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose(
                [1, 0]).contiguous()
            ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])

            ffn_out = paddle.empty(
                (ffn_out.shape[0], layer.moe_ffn2_weight.shape[1]),
                dtype=paddle.bfloat16)
            deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
                (ffn_in_x, ffn_in_x_scale_tensor),
                (layer.moe_ffn2_weight, layer.moe_ffn2_weight_scale),
                ffn_out,
                m_indices,
            )
            # prmt back per rank
            tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
                ffn_out,
                dst_weights,
                permute_indices_per_token,
                dst_indices,
                None,  # moe_ffn2_bias
                False,  # norm_topk_prob
                1.0,
            )[0]

        else:
            # No tokens routed to this rank: pass the (empty) received payload
            # through as bf16 so the EP combine still participates.
            tmp_ffn_out = paddle.cast(recv_x[0], paddle.bfloat16)

        # 5. EP combine
        return self.ep_prefill_runner.combine(tmp_ffn_out, handle,
                                              recv_topk_weights)
|
||||
|
||||
    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Apply the EP decoder method.

        Decode-time path: dispatch with FP8 payloads, run masked grouped GEMMs
        sized for the worst-case tokens-per-rank, then combine.
        """
        # 1. Select topk experts and weights
        topk_idx, topk_weights = self.ep_decoder_runner.moe_select(
            layer, gate_out)
        # 2. EP Dispatch
        permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
            x, topk_idx, topk_weights, use_fp8=True)

        # 3. Compute ffn
        # use_fp8=True means the dispatched payload is a (values, scales) pair.
        assert isinstance(permute_input, tuple)
        # Worst-case-sized output buffers: [experts, ep_size * max_tokens, dim].
        ffn1_out = paddle.empty(
            [
                layer.num_local_experts,
                layer.ep_size *
                layer.moe_config.num_max_dispatch_tokens_per_rank,
                layer.moe_intermediate_size * 2,
            ],
            dtype=paddle.bfloat16,
        )

        ffn_out = paddle.empty(
            [
                layer.num_local_experts,
                layer.ep_size *
                layer.moe_config.num_max_dispatch_tokens_per_rank,
                layer.hidden_size,
            ],
            dtype=paddle.bfloat16,
        )

        # Hint for the masked grouped GEMM scheduler — TODO confirm 128 matches
        # the expected average rows per expert.
        expected_m = 128
        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
            permute_input,
            (
                layer.moe_ffn1_weight,
                layer.moe_ffn1_weight_scale,
            ),
            ffn1_out,
            token_nums_per_expert,
            expected_m,
        )

        # swiglu over the masked (ragged) expert rows.
        act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked(
            ffn1_out, token_nums_per_expert)

        # Re-quantize activations to FP8 for the second grouped GEMM.
        act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
            act_out, token_nums_per_expert,
            self.quant_config.weight_block_size[0])

        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
            (act_out_fp8, scale),
            (
                layer.moe_ffn2_weight,
                layer.moe_ffn2_weight_scale,
            ),
            ffn_out,
            token_nums_per_expert,
            expected_m,
        )

        # 4. EP combine
        return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights,
                                              handle)
|
||||
|
||||
    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Paddle Use DeepGemm compute Fused MoE.
        below is TP compute method.

        All experts are local in this path; routing, FP8 quantization, two
        grouped GEMMs with swiglu in between, then an all-reduce across the
        tensor-parallel group when tp_size > 1.
        """

        topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
            gate_out,
            layer.gate_correction_bias,
            layer.top_k,
            True,  # apply_norm_weight
            False,
        )

        tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts)

        # NOTE(review): block size 128 is hard-coded here, while other paths use
        # self.quant_config.weight_block_size[0] — confirm these always agree.
        recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
            x, 128)

        (
            permute_input,
            permute_scale,
            permute_indices_per_token,
            recv_num_tokens_per_expert_list_cumsum,
            recv_num_tokens_per_expert_list_padded_cumsum,
            dst_weights,
            dst_indices,
            cumsum_idx_gpu,
            m_indices,
        ) = fastdeploy.model_executor.ops.gpu.ep_moe_expert_dispatch_fp8(
            recv_x,
            recv_x_scale,
            topk_ids,
            topk_weights,
            tmp[0],
            tmp[1],
        )

        # Make the scale tensor column-major in memory while keeping its
        # logical shape — presumably the layout deep_gemm expects; TODO confirm.
        permute_scale = permute_scale.transpose([1, 0]).contiguous()
        permute_scale = permute_scale.transpose([1, 0])

        # ffn1
        ffn_out = paddle.empty(
            (permute_input.shape[0], layer.moe_ffn1_weight.shape[1]),
            dtype=paddle.bfloat16,
        )
        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
            (permute_input, permute_scale),
            (layer.moe_ffn1_weight, layer.moe_ffn1_weight_scale),
            ffn_out,
            m_indices,
        )
        # swiglu
        ffn_out = paddle.incubate.nn.functional.swiglu(ffn_out)

        # ffn2: re-quantize activations to FP8.
        ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant(
            ffn_out, self.quant_config.weight_block_size[0])

        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose(
            [1, 0]).contiguous()
        ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0])

        ffn_out = paddle.empty(
            (ffn_out.shape[0], layer.moe_ffn2_weight.shape[1]),
            dtype=paddle.bfloat16)
        deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous(
            (ffn_in_x, ffn_in_x_scale_tensor),
            (layer.moe_ffn2_weight, layer.moe_ffn2_weight_scale),
            ffn_out,
            m_indices,
        )
        # prmt back per rank
        tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine(
            ffn_out,
            dst_weights,
            permute_indices_per_token,
            dst_indices,
            None,
            False,  # norm_topk_prob
            1.0,
        )[0]
        if layer.tp_size > 1:
            # Sum partial expert outputs across the tensor-parallel group.
            tensor_model_parallel_all_reduce(tmp_ffn_out)

        return tmp_ffn_out
|
||||
@@ -0,0 +1,285 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
import fastdeploy
|
||||
from fastdeploy.distributed.communication_op import \
|
||||
tensor_model_parallel_all_reduce
|
||||
from fastdeploy.model_executor.ops.gpu import (MoeWna16MarlinGemmApi,
|
||||
tritonmoe_preprocess_func)
|
||||
|
||||
from ..quantization.quant_base import QuantMethodBase
|
||||
|
||||
|
||||
def gptq_marlin_moe_repack(b_q_weight: paddle.Tensor, perm: paddle.Tensor,
                           size_k: int, size_n: int,
                           num_bits: int) -> paddle.Tensor:
    """
    Util function.

    Repack each expert's GPTQ-packed weight into Marlin layout by calling
    ``gptq_marlin_repack`` once per expert slice.
    """
    from fastdeploy.model_executor.ops.gpu import gptq_marlin_repack
    assert size_k % 16 == 0
    num_experts = b_q_weight.shape[0]
    packed = paddle.empty(
        [num_experts, size_k // 16, size_n * (num_bits // 2)],
        dtype=b_q_weight.dtype)
    for expert in range(num_experts):
        packed[expert] = gptq_marlin_repack(b_q_weight[expert], perm[expert],
                                            size_k, size_n, num_bits)
    return packed
|
||||
|
||||
|
||||
def get_scale_perms():
    """
    Util function.

    Build the two Marlin scale permutations: one for grouped scales
    (64 entries) and one for per-channel scales (32 entries).
    """
    scale_perm: list[int] = [i + 8 * j for i in range(8) for j in range(8)]
    scale_perm_single: list[int] = [
        2 * i + j for i in range(4)
        for j in (0, 1, 8, 9, 16, 17, 24, 25)
    ]
    return scale_perm, scale_perm_single
|
||||
|
||||
|
||||
def marlin_permute_scales(s: paddle.Tensor, size_k: int, size_n: int,
                          group_size: int) -> paddle.Tensor:
    """
    Util function.

    Reorder a scale tensor into Marlin order, using the grouped permutation
    when a real group size is in effect and the per-channel permutation
    otherwise (group_size == -1 or >= size_k).
    """
    grouped_perm, channel_perm = get_scale_perms()
    is_grouped = group_size != -1 and group_size < size_k
    perm = grouped_perm if is_grouped else channel_perm
    s = s.reshape([-1, len(perm)])[:, perm]
    return s.reshape((-1, size_n)).contiguous()
|
||||
|
||||
|
||||
def marlin_moe_permute_scales(
    s: paddle.Tensor,
    size_k: int,
    size_n: int,
    group_size: int,
):
    """
    Util function.

    Apply ``marlin_permute_scales`` independently to each expert's scale
    slice and return the stacked result with the same shape as ``s``.
    """
    expert_count = s.shape[0]
    permuted = paddle.empty(
        [expert_count, s.shape[1], s.shape[2]],
        dtype=s.dtype,
    )
    for expert in range(expert_count):
        permuted[expert] = marlin_permute_scales(s[expert], size_k, size_n,
                                                 group_size)
    return permuted
|
||||
|
||||
|
||||
class MarlinWeightOnlyMoEMethod(QuantMethodBase):
    """
    Use Marlin Group Gemm to compute Fused MoE.
    """

    def __init__(self, quant_method=None):
        """
        Marlin Group Gemm to compute Fused MoE.

        Args:
            quant_method: owning quantization method/config object.
        """
        self.quant_method = quant_method
        # Parameter names registered on the layer for the two FFN projections.
        self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        self.added_scale_attrs = [
            "moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
        ]
        self.added_zeros_attrs = ["zeros0", "zeros1"]

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Marlin MoE create weight process.

        Quantizes FP weights to symmetric int4 (per expert, per output
        channel), packs 8 nibbles per int32 along K, repacks into Marlin
        layout, and registers weights + permuted scales on ``layer``.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        assert len(ffn1_weights) == layer.num_local_experts
        assert len(ffn2_weights) == layer.num_local_experts
        assert ffn1_weights[0].shape == [
            layer.hidden_size, layer.moe_intermediate_size * 2
        ]
        assert ffn2_weights[0].shape == [
            layer.moe_intermediate_size, layer.hidden_size
        ]

        ffn1_tensor = paddle.stack(ffn1_weights, axis=0)
        ffn2_tensor = paddle.stack(ffn2_weights, axis=0)

        max_bound = 7  # int4 symmetric range is [-7, 7]

        for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
            weight_name = self.added_weight_attrs[idx]
            scale_name = self.added_scale_attrs[idx]

            # Per-(expert, output-channel) abs-max scale over the K axis.
            weight_scale = weight_tensor.abs().max(axis=1)
            quanted_weight = weight_tensor / weight_scale[:,
                                                          None, :] * max_bound
            quanted_weight = paddle.round(quanted_weight).astype("int32")

            # Clamp, then shift to the unsigned "uint4b8" encoding (zero at 8).
            quanted_weight[quanted_weight > 7] = 7
            quanted_weight[quanted_weight < -7] = -7
            quanted_weight += 8

            # Pack 8 consecutive int4 values along K into one int32.
            E, K, N = quanted_weight.shape
            quanted_weight = quanted_weight.reshape([0, K // 8, 8, N])
            res = paddle.zeros([E, K // 8, N], dtype='int32')
            for j in range(8):
                tmp = quanted_weight[:, :, j, :]
                res = res | (tmp << (j * 4))
            quanted_weight = paddle.assign(res)
            weight_scale = weight_scale / max_bound
            weight_scale = weight_scale[:, None, :]

            group_size = -1  # means per_channel

            # No activation reordering: empty g_idx sort indices per expert.
            g_idx_sort_indices = paddle.empty([E, 0], dtype="int32")
            quanted_weight = gptq_marlin_moe_repack(
                quanted_weight,
                g_idx_sort_indices,
                K,
                N,
                4,
            )

            weight_scale = marlin_moe_permute_scales(
                weight_scale,
                size_k=layer.moe_intermediate_size,  # useless for per-channel
                size_n=N,
                group_size=group_size)

            for (name, tensor) in [(weight_name, quanted_weight),
                                   (scale_name, weight_scale)]:
                setattr(
                    layer, name,
                    layer.create_parameter(
                        shape=tensor.shape,
                        dtype=tensor.dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ))
                getattr(layer, name).set_value(tensor)

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Marlin compute Fused MoE.

        Routes tokens, runs both FFN projections through the Marlin grouped
        GEMM kernel with swiglu in between, sums the top-k expert outputs,
        and all-reduces across the TP group when needed.
        """
        token_num = x.shape[0]
        # Fix: the original assigned top_k twice; one assignment suffices.
        top_k = layer.top_k
        moe_intermediate_size = layer.moe_intermediate_size
        hidden_size = layer.hidden_size
        num_experts = layer.num_experts

        # NOTE(review): the incoming ``gate_out`` argument is overwritten here
        # by a fresh matmul with layer.gate_weight — confirm the parameter is
        # intentionally ignored on this path.
        gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)

        topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
            gate_out,
            layer.gate_correction_bias,
            top_k,
            True,  # apply_norm_weight,
            False,
        )

        # Pick the smallest M block size whose expert tiles stay under-filled.
        block_size_m = 64
        for m in [8, 16, 32, 48, 64]:
            if token_num * top_k / num_experts / m < 0.9:
                block_size_m = m
                break

        topk = top_k

        # for H100 132 sms
        workspace = paddle.empty([528], dtype="int32")

        sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func(
            topk_ids, num_experts, block_size_m)

        # ffn1: [token_num, hidden] x int4 weights -> [token_num * top_k, 2 * inter]
        ffn_out = MoeWna16MarlinGemmApi(
            x,
            c_or_none=None,
            b_q_weight=layer.moe_ffn1_weight,
            b_scales=layer.moe_ffn1_weight_scale,
            global_scale_or_none=None,
            b_zeros_or_none=None,
            g_idx_or_none=None,
            perm_or_none=None,
            workspace=workspace,
            sorted_token_ids=sorted_token_ids,
            expert_ids=expert_ids,
            num_tokens_post_padded=num_tokens_post_padded,
            topk_weights=topk_weights,
            moe_block_size=block_size_m,
            top_k=topk,
            mul_topk_weights=False,
            is_ep=False,
            b_q_type_str="uint4b8",
            size_m=token_num,
            size_n=moe_intermediate_size * 2,
            size_k=hidden_size,
            is_k_full=True,
            use_atomic_add=True,
            use_fp32_reduce=True,
            is_zp_float=False)[0]

        swiglu_out = paddle.incubate.nn.functional.swiglu(ffn_out)

        # ffn2: routed weights are multiplied in here (mul_topk_weights=True).
        ffn_out = MoeWna16MarlinGemmApi(
            swiglu_out,
            c_or_none=None,
            b_q_weight=layer.moe_ffn2_weight,
            b_scales=layer.moe_ffn2_weight_scale,
            global_scale_or_none=None,
            b_zeros_or_none=None,
            g_idx_or_none=None,
            perm_or_none=None,
            workspace=workspace,
            sorted_token_ids=sorted_token_ids,
            expert_ids=expert_ids,
            num_tokens_post_padded=num_tokens_post_padded,
            topk_weights=topk_weights,
            moe_block_size=block_size_m,
            top_k=1,
            mul_topk_weights=True,
            is_ep=False,
            b_q_type_str="uint4b8",
            size_m=token_num * topk,
            size_n=hidden_size,
            size_k=moe_intermediate_size,
            is_k_full=True,
            use_atomic_add=True,
            use_fp32_reduce=True,
            is_zp_float=False)[0]

        # Sum the top-k expert contributions per token.
        ffn_out.reshape_([token_num, -1, hidden_size])
        ffn_out = ffn_out.sum(axis=1)

        if layer.tp_size > 1:
            tensor_model_parallel_all_reduce(ffn_out)

        return ffn_out
|
||||
@@ -1,57 +0,0 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from abc import abstractmethod
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
from fastdeploy.model_executor.layers.quantization.quant_base import \
|
||||
QuantMethodBase
|
||||
|
||||
|
||||
class FusedMoEMethodBase(QuantMethodBase):
    """
    Base class for all fused-MoE compute methods.

    Every MoE method must inherit this class and implement the abstract
    methods below.
    """

    @abstractmethod
    def create_weights(self,
                       layer: nn.Layer,
                       moe_compute_params,
                       ffn1_tensor,
                       ffn2_tensor,
                       ffn1_bias=None,
                       ffn2_bias=None):
        """
        Create and register this method's weights on ``layer``.

        Subclasses must implement this.
        """
        raise NotImplementedError

    @abstractmethod
    def apply(
        self,
        layer: nn.Layer,
        moe_compute_params,
        x: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Run the fused-MoE forward computation on ``x`` and return the result.

        Subclasses must implement this.
        """

        raise NotImplementedError
|
||||
@@ -0,0 +1,479 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
from fastdeploy.distributed.communication_op import \
|
||||
tensor_model_parallel_all_reduce
|
||||
from fastdeploy.model_executor.layers.utils import (create_hadamard_matrix_map,
|
||||
get_tensor)
|
||||
from fastdeploy.utils import ceil_div
|
||||
|
||||
from ..quantization.quant_base import QuantMethodBase
|
||||
|
||||
|
||||
class TritonWeightOnlyMoEMethod(QuantMethodBase):
    """
    Use Triton Group Gemm to compute Fused MoE.
    """

    def __init__(self, quant_method=None):
        """
        Triton Group Gemm to compute Fused MoE.

        Args:
            quant_method: owning quantization method/config object.
        """
        self.quant_method = quant_method
        # Parameter names registered on the layer for the two FFN projections.
        self.added_weight_attrs = ["moe_ffn1_weight", "moe_ffn2_weight"]
        self.added_scale_attrs = [
            "moe_ffn1_weight_scale", "moe_ffn2_weight_scale"
        ]

    def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
        """process_prequanted_weights (no-op: weights are quantized online)."""
        pass

    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Triton MoE create weight process.

        Quantizes the FP expert weights to symmetric int8 per output channel
        and registers weight + scale parameters on ``layer``.
        """
        ffn1_weights, ffn2_weights = layer.extract_moe_ffn_weights(state_dict)
        assert len(ffn1_weights) == layer.num_local_experts
        assert len(ffn2_weights) == layer.num_local_experts
        assert layer.quant_method.quant_config.name() == "wint8"
        assert ffn1_weights[0].shape == [
            layer.hidden_size, layer.moe_intermediate_size * 2
        ]
        assert ffn2_weights[0].shape == [
            layer.moe_intermediate_size, layer.hidden_size
        ]

        ffn1_tensor = paddle.stack(ffn1_weights, axis=0)
        ffn2_tensor = paddle.stack(ffn2_weights, axis=0)

        # NOTE(review): the assert above pins "wint8", so the wint4 branch is
        # currently unreachable; also ``self.quant_config`` is presumed to be
        # provided by the base class / caller — confirm, __init__ only stores
        # quant_method.
        if self.quant_config.name() == "wint8":
            max_bound = 127
        elif self.quant_config.name() == "wint4":
            max_bound = 7

        for idx, weight_tensor in enumerate([ffn1_tensor, ffn2_tensor]):
            weight_name = self.added_weight_attrs[idx]
            scale_name = self.added_scale_attrs[idx]

            # Per-(expert, output-channel) abs-max scale over the K axis.
            quanted_weight_scale = weight_tensor.abs().max(axis=1)
            quanted_weight = weight_tensor / quanted_weight_scale[:,
                                                                  None, :] * max_bound
            quanted_weight = paddle.round(quanted_weight).astype("int8")
            quanted_weight_scale = quanted_weight_scale / max_bound

            setattr(
                layer, weight_name,
                layer.create_parameter(
                    shape=quanted_weight.shape,
                    dtype=quanted_weight.dtype,
                    default_initializer=paddle.nn.initializer.Constant(0),
                ))
            getattr(layer, weight_name).set_value(quanted_weight)

            setattr(
                layer, scale_name,
                layer.create_parameter(
                    shape=quanted_weight_scale.shape,
                    dtype=quanted_weight_scale.dtype,
                ))
            getattr(layer, scale_name).set_value(quanted_weight_scale)

    def apply(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate_out: paddle.Tensor,
    ) -> paddle.Tensor:
        """
        Triton compute Fused MoE.

        Softmax routing with top-k selection, two Triton grouped-GEMM kernel
        launches with swiglu in between, then a per-token sum over the top-k
        expert outputs.
        """
        token_num = x.shape[0]
        # Fix: the original assigned top_k twice; one assignment suffices.
        top_k = layer.top_k
        num_local_experts = layer.num_local_experts
        moe_intermediate_size = layer.moe_intermediate_size
        hidden_size = layer.hidden_size

        # NOTE(review): the incoming ``gate_out`` argument is overwritten here
        # by a fresh matmul with layer.gate_weight — confirm the parameter is
        # intentionally ignored on this path.
        gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
        scores = paddle.nn.functional.softmax(gate_out, axis=-1)

        topk_weights, topk_ids = paddle.topk(scores,
                                             k=top_k,
                                             axis=-1,
                                             sorted=False)
        # Renormalize routing weights over the selected experts.
        topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)

        intermediate_cache1 = paddle.empty(
            [token_num * top_k, moe_intermediate_size * 2],
            dtype=x.dtype,
        )
        intermediate_cache2 = paddle.empty(
            (token_num * top_k, moe_intermediate_size),
            dtype=x.dtype,
        )
        intermediate_cache3 = paddle.empty(
            (token_num * top_k, hidden_size),
            dtype=x.dtype,
        )

        config = {
            "BLOCK_SIZE_M": 32,
            "BLOCK_SIZE_N": 128,
            "BLOCK_SIZE_K": 128,
            "GROUP_SIZE_M": 1,
        }
        from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess

        from .triton_moe_kernels import fused_moe_kernel_paddle
        sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
            topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
        max_num_tokens_padded = sorted_token_ids.shape[0]
        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
                ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )

        fused_moe_kernel_paddle[grid](
            x,
            layer.moe_ffn1_weight,
            intermediate_cache1,
            None,
            layer.moe_ffn1_weight_scale,
            None,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            moe_intermediate_size * 2,
            hidden_size,
            max_num_tokens_padded,
            token_num * top_k,
            stride_am=x.strides[0],
            stride_ak=x.strides[1],
            stride_be=layer.moe_ffn1_weight.strides[0],
            stride_bk=layer.moe_ffn1_weight.strides[1],
            stride_bn=layer.moe_ffn1_weight.strides[2],
            stride_cm=intermediate_cache1.strides[0],
            stride_cn=intermediate_cache1.strides[1],
            #
            stride_asm=-1,
            stride_ask=-1,
            stride_bse=layer.moe_ffn1_weight_scale.strides[0],
            stride_bsk=-1,
            stride_bsn=layer.moe_ffn1_weight_scale.strides[1],
            group_n=-1,
            group_k=-1,
            # Meta-parameters
            BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
            BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
            BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
            GROUP_SIZE_M=config["GROUP_SIZE_M"],
            MUL_ROUTED_WEIGHT=False,
            top_k=top_k,
            compute_type_enum=1,
            use_fp8_w8a8=False,
            use_int8_w8a16=True,
            even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
        )

        intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
            intermediate_cache1)

        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
                ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )
        fused_moe_kernel_paddle[grid](
            intermediate_cache2,
            layer.moe_ffn2_weight,
            intermediate_cache3,
            None,
            layer.moe_ffn2_weight_scale,
            topk_weights,
            sorted_token_ids,
            expert_ids,
            num_tokens_post_padded,
            hidden_size,
            moe_intermediate_size,
            max_num_tokens_padded,
            token_num * top_k,
            stride_am=intermediate_cache2.strides[0],
            stride_ak=intermediate_cache2.strides[1],
            stride_be=layer.moe_ffn2_weight.strides[0],
            stride_bk=layer.moe_ffn2_weight.strides[1],
            stride_bn=layer.moe_ffn2_weight.strides[2],
            stride_cm=intermediate_cache3.strides[0],
            stride_cn=intermediate_cache3.strides[1],
            stride_asm=-1,
            stride_ask=-1,
            stride_bse=layer.moe_ffn2_weight_scale.strides[0],
            stride_bsk=-1,
            stride_bsn=layer.moe_ffn2_weight_scale.strides[1],
            group_n=-1,
            group_k=-1,
            # Meta-parameters
            BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
            BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
            BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
            GROUP_SIZE_M=config["GROUP_SIZE_M"],
            MUL_ROUTED_WEIGHT=True,
            top_k=1,
            compute_type_enum=1,
            use_fp8_w8a8=False,
            use_int8_w8a16=True,
            even_Ks=moe_intermediate_size % config["BLOCK_SIZE_K"] == 0,
        )

        # Sum the top-k expert contributions per token.
        intermediate_cache3.reshape_([token_num, top_k, hidden_size])
        out = intermediate_cache3.sum(axis=1)
        return out
|
||||
|
||||
|
||||
class TensorWiseFP8MoEMethod(QuantMethodBase):
|
||||
"""
|
||||
Use Triton Group Gemm to compute Fused MoE.
|
||||
"""
|
||||
|
||||
    def __init__(self, quant_method=None):
        """
        Tensor-wise FP8 fused-MoE method; stores the owning quant method.
        """
        # Back-reference to the quantization method/config that created this.
        self.quant_method = quant_method
|
||||
|
||||
    def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None:
        """process_prequanted_weights

        Loads pre-quantized FP8 expert weights plus four scale tensors
        (weight scales and activation in-scales for both FFN projections)
        and registers all six as parameters on ``layer``.
        """

        ffn1_tensor, ffn2_tensor = layer.extract_moe_ffn_weights(state_dict)
        assert ffn1_tensor[0].shape == [
            layer.hidden_size, layer.moe_intermediate_size * 2
        ]
        assert ffn2_tensor[0].shape == [
            layer.moe_intermediate_size, layer.hidden_size
        ]

        ffn1_tensor = paddle.stack(ffn1_tensor, axis=0)
        ffn2_tensor = paddle.stack(ffn2_tensor, axis=0)

        # Parameter names, in the same order as the tensor list below.
        added_wfp8afp8_attrs = [
            "moe_ffn1_weight", "moe_ffn2_weight", "moe_ffn1_weight_scale",
            "moe_ffn2_weight_scale", "moe_ffn1_in_scale", "moe_ffn2_in_scale"
        ]

        def _extract_scale_tensor(key_template):
            # Pop one scalar scale per expert and concatenate into a fp32 vector.
            result = []
            for i in range(layer.num_experts):
                result.append(
                    get_tensor(state_dict.pop(key_template.format(i))))
            return paddle.concat(result).cast("float32")

        weight_key_map = layer.weight_key_map
        moe_ffn1_weight_scale = _extract_scale_tensor(
            weight_key_map["ffn1_expert_weight_scale_key"])
        moe_ffn2_weight_scale = _extract_scale_tensor(
            weight_key_map["ffn2_expert_weight_scale_key"])
        moe_ffn1_in_scale = _extract_scale_tensor(
            weight_key_map["ffn1_expert_in_scale_key"])
        moe_ffn2_in_scale = _extract_scale_tensor(
            weight_key_map["ffn2_expert_in_scale_key"])

        for idx, weight_tensor in enumerate([
                ffn1_tensor, ffn2_tensor, moe_ffn1_weight_scale,
                moe_ffn2_weight_scale, moe_ffn1_in_scale, moe_ffn2_in_scale
        ]):
            name = added_wfp8afp8_attrs[idx]
            setattr(
                layer, name,
                layer.create_parameter(
                    shape=weight_tensor.shape,
                    dtype=weight_tensor.dtype,
                    default_initializer=paddle.nn.initializer.Constant(0),
                ))
            getattr(layer, name).set_value(weight_tensor)
|
||||
|
||||
    def create_weights(self, layer: nn.Layer, state_dict):
        """
        Triton MoE create weight process.

        No-op here: this method only supports pre-quantized checkpoints, whose
        weights are registered in ``process_prequanted_weights``.
        """
        pass
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate_out: paddle.Tensor,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
Triton compute Fused MoE.
|
||||
"""
|
||||
|
||||
token_num = x.shape[0]
|
||||
top_k = layer.top_k
|
||||
num_local_experts = layer.num_local_experts
|
||||
moe_intermediate_size = layer.moe_intermediate_size
|
||||
hidden_size = layer.hidden_size
|
||||
|
||||
gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight)
|
||||
scores = paddle.nn.functional.softmax(gate_out, axis=-1)
|
||||
|
||||
topk_weights, topk_ids = paddle.topk(scores,
|
||||
k=top_k,
|
||||
axis=-1,
|
||||
sorted=False)
|
||||
topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True)
|
||||
|
||||
intermediate_cache1 = paddle.empty(
|
||||
[token_num * top_k, moe_intermediate_size * 2],
|
||||
dtype=x.dtype,
|
||||
)
|
||||
intermediate_cache2 = paddle.empty(
|
||||
(token_num * top_k, moe_intermediate_size),
|
||||
dtype=x.dtype,
|
||||
)
|
||||
intermediate_cache3 = paddle.empty(
|
||||
(token_num * top_k, hidden_size),
|
||||
dtype=x.dtype,
|
||||
)
|
||||
|
||||
config = {
|
||||
"BLOCK_SIZE_M": 32,
|
||||
"BLOCK_SIZE_N": 128,
|
||||
"BLOCK_SIZE_K": 128,
|
||||
"GROUP_SIZE_M": 1,
|
||||
}
|
||||
from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess
|
||||
|
||||
sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(
|
||||
topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
|
||||
max_num_tokens_padded = sorted_token_ids.shape[0]
|
||||
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
|
||||
ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), )
|
||||
|
||||
adamard_matrix = create_hadamard_matrix_map[hidden_size]
|
||||
x = paddle.matmul(x.cast("float32"), adamard_matrix)
|
||||
|
||||
permute_x = x[:, None, :].tile([1, top_k, 1])
|
||||
permute_x = permute_x.reshape([-1, hidden_size])
|
||||
|
||||
quant_activation_scale = layer.moe_ffn1_in_scale[topk_ids].reshape(
|
||||
[-1, 1])
|
||||
permute_x = permute_x / quant_activation_scale
|
||||
permute_x = permute_x.astype("float8_e4m3fn")
|
||||
|
||||
from .triton_moe_kernels import fused_moe_kernel_paddle
|
||||
|
||||
fused_moe_kernel_paddle[grid](
|
||||
permute_x,
|
||||
layer.moe_ffn1_weight.view(paddle.float8_e4m3fn),
|
||||
intermediate_cache1,
|
||||
layer.moe_ffn1_in_scale,
|
||||
layer.moe_ffn1_weight_scale,
|
||||
None,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
num_tokens_post_padded,
|
||||
moe_intermediate_size * 2,
|
||||
hidden_size,
|
||||
max_num_tokens_padded,
|
||||
token_num * top_k,
|
||||
stride_am=x.strides[0],
|
||||
stride_ak=x.strides[1],
|
||||
stride_be=layer.moe_ffn1_weight.strides[0],
|
||||
stride_bk=layer.moe_ffn1_weight.strides[1],
|
||||
stride_bn=layer.moe_ffn1_weight.strides[2],
|
||||
stride_cm=intermediate_cache1.strides[0],
|
||||
stride_cn=intermediate_cache1.strides[1],
|
||||
#
|
||||
stride_asm=-1, # only used in blockwise fp8
|
||||
stride_ask=-1, # only used in blockwise fp8
|
||||
stride_bse=-1,
|
||||
stride_bsk=-1,
|
||||
stride_bsn=-1,
|
||||
group_n=-1,
|
||||
group_k=-1,
|
||||
# Meta-parameters
|
||||
BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
|
||||
BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
|
||||
BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
|
||||
GROUP_SIZE_M=config["GROUP_SIZE_M"],
|
||||
MUL_ROUTED_WEIGHT=False,
|
||||
top_k=1,
|
||||
compute_type_enum=1,
|
||||
use_fp8_w8a8=True,
|
||||
use_int8_w8a16=False,
|
||||
even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0,
|
||||
)
|
||||
|
||||
intermediate_cache2 = paddle.incubate.nn.functional.swiglu(
|
||||
intermediate_cache1)
|
||||
|
||||
hadamard_matrix = create_hadamard_matrix_map[moe_intermediate_size]
|
||||
intermediate_cache2 = paddle.matmul(
|
||||
intermediate_cache2.cast("float32"), hadamard_matrix)
|
||||
quant_activation_scale = layer.moe_ffn2_in_scale[topk_ids].reshape(
|
||||
[-1, 1])
|
||||
intermediate_cache2 = intermediate_cache2 / quant_activation_scale
|
||||
intermediate_cache2 = intermediate_cache2.astype("float8_e4m3fn")
|
||||
|
||||
grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
|
||||
ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )
|
||||
|
||||
fused_moe_kernel_paddle[grid](
|
||||
intermediate_cache2,
|
||||
layer.moe_ffn2_weight.view(paddle.float8_e4m3fn),
|
||||
intermediate_cache3,
|
||||
layer.moe_ffn2_in_scale,
|
||||
layer.moe_ffn2_weight_scale,
|
||||
topk_weights,
|
||||
sorted_token_ids,
|
||||
expert_ids,
|
||||
num_tokens_post_padded,
|
||||
hidden_size,
|
||||
moe_intermediate_size,
|
||||
max_num_tokens_padded,
|
||||
token_num * top_k,
|
||||
stride_am=intermediate_cache2.strides[0],
|
||||
stride_ak=intermediate_cache2.strides[1],
|
||||
stride_be=layer.moe_ffn2_weight.strides[0],
|
||||
stride_bk=layer.moe_ffn2_weight.strides[1],
|
||||
stride_bn=layer.moe_ffn2_weight.strides[2],
|
||||
stride_cm=intermediate_cache3.strides[0],
|
||||
stride_cn=intermediate_cache3.strides[1],
|
||||
stride_asm=-1,
|
||||
stride_ask=-1,
|
||||
stride_bse=-1,
|
||||
stride_bsk=-1,
|
||||
stride_bsn=-1,
|
||||
group_n=-1,
|
||||
group_k=-1,
|
||||
# Meta-parameters
|
||||
BLOCK_SIZE_M=config["BLOCK_SIZE_M"],
|
||||
BLOCK_SIZE_N=config["BLOCK_SIZE_N"],
|
||||
BLOCK_SIZE_K=config["BLOCK_SIZE_K"],
|
||||
GROUP_SIZE_M=config["GROUP_SIZE_M"],
|
||||
MUL_ROUTED_WEIGHT=True,
|
||||
top_k=1,
|
||||
compute_type_enum=1,
|
||||
use_fp8_w8a8=True,
|
||||
use_int8_w8a16=False,
|
||||
even_Ks=moe_intermediate_size % config["BLOCK_SIZE_K"] == 0,
|
||||
)
|
||||
|
||||
intermediate_cache3.reshape_([token_num, top_k, hidden_size])
|
||||
out = intermediate_cache3.sum(axis=1)
|
||||
|
||||
if layer.tp_size > 1:
|
||||
tensor_model_parallel_all_reduce(out)
|
||||
|
||||
return out
|
||||
@@ -0,0 +1,236 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
|
||||
import fastdeploy
|
||||
|
||||
from ..quantization.quant_base import QuantMethodBase
|
||||
from ..utils import create_and_set_parameter, get_tensor
|
||||
|
||||
|
||||
class Wint2MoeMethod(QuantMethodBase):
|
||||
"""
|
||||
Use compute Fused MoE.
|
||||
"""
|
||||
|
||||
def __init__(self, quant_config):
|
||||
super().__init__()
|
||||
self.moe_quant_type = quant_config.moe_quant_type
|
||||
|
||||
def process_loaded_weights(self, layer, weights) -> None:
|
||||
"""
|
||||
process_loaded_weights
|
||||
"""
|
||||
pass
|
||||
|
||||
def check(self, layer: nn.Layer, ffn1_weights, ffn2_weights):
|
||||
"""
|
||||
check layer is valid for this method
|
||||
"""
|
||||
assert len(
|
||||
ffn1_weights
|
||||
) == layer.num_local_experts, "ffn1_weights length should be equal to num_local_experts."
|
||||
assert len(
|
||||
ffn2_weights
|
||||
) == layer.num_local_experts, "ffn2_weights length should be equal to num_local_experts."
|
||||
|
||||
def create_weights(self, layer: nn.Layer, state_dict):
|
||||
"""
|
||||
Paddle cutlass create weight process.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class TritonWint2FusedMoeMethod(Wint2MoeMethod):
|
||||
"""
|
||||
Use Triton Group Gemm to compute Fused MoE.
|
||||
"""
|
||||
|
||||
def __init__(self, quant_config):
|
||||
super().__init__(quant_config)
|
||||
self.moe_quant_type = quant_config.moe_quant_type
|
||||
|
||||
def process_loaded_weights(self, layer, weights) -> None:
|
||||
"""
|
||||
process_loaded_weights
|
||||
"""
|
||||
pass
|
||||
|
||||
def process_prequanted_weights(self, layer: nn.Layer, state_dict):
|
||||
"""
|
||||
Paddle cutlass process prequanted weights.
|
||||
"""
|
||||
ffn1_expert_weight_key = layer.weight_key_map.get(
|
||||
"ffn1_expert_weight_key", None)
|
||||
ffn2_expert_weight_key = layer.weight_key_map.get(
|
||||
"ffn2_expert_weight_key", None)
|
||||
ffn1_expert_weight_scale_key = layer.weight_key_map.get(
|
||||
"ffn1_expert_weight_scale_key", None)
|
||||
ffn2_expert_weight_scale_key = layer.weight_key_map.get(
|
||||
"ffn2_expert_weight_scale_key", None)
|
||||
ffn1_expert_super_scales_key = layer.weight_key_map.get(
|
||||
"ffn1_expert_super_scales_key", None)
|
||||
ffn2_expert_super_scales_key = layer.weight_key_map.get(
|
||||
"ffn2_expert_super_scales_key", None)
|
||||
ffn1_expert_code_scale_key = layer.weight_key_map.get(
|
||||
"ffn1_expert_code_scale_key", None)
|
||||
ffn2_expert_code_scale_key = layer.weight_key_map.get(
|
||||
"ffn2_expert_code_scale_key", None)
|
||||
ffn1_expert_code_zp_key = layer.weight_key_map.get(
|
||||
"ffn1_expert_code_zp_key", None)
|
||||
ffn2_expert_code_zp_key = layer.weight_key_map.get(
|
||||
"ffn2_expert_code_zp_key", None)
|
||||
|
||||
ffn1_weights, ffn2_weights = layer.load_experts_weight(
|
||||
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
|
||||
# self.check(layer, ffn1_weights, ffn2_weights)
|
||||
|
||||
ffn1_weight_scale = []
|
||||
ffn2_weight_scale = []
|
||||
ffn1_super_scales = []
|
||||
ffn2_super_scales = []
|
||||
ffn1_code_scale = []
|
||||
ffn2_code_scale = []
|
||||
ffn1_code_zp = []
|
||||
ffn2_code_zp = []
|
||||
for i in range(layer.num_experts):
|
||||
expert_idx = layer.expert_id_offset + i
|
||||
ffn1_weight_scale.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn1_expert_weight_scale_key.format(expert_idx))))
|
||||
ffn2_weight_scale.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn2_expert_weight_scale_key.format(expert_idx))))
|
||||
ffn1_super_scales.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn1_expert_super_scales_key.format(expert_idx))))
|
||||
ffn2_super_scales.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn2_expert_super_scales_key.format(expert_idx))))
|
||||
ffn1_code_scale.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn1_expert_code_scale_key.format(expert_idx))))
|
||||
ffn2_code_scale.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn2_expert_code_scale_key.format(expert_idx))))
|
||||
ffn1_code_zp.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn1_expert_code_zp_key.format(expert_idx))))
|
||||
ffn2_code_zp.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn2_expert_code_zp_key.format(expert_idx))))
|
||||
|
||||
ffn1_weight = paddle.stack(ffn1_weights, axis=0)
|
||||
ffn2_weight = paddle.stack(ffn2_weights, axis=0)
|
||||
ffn1_weight_scale = paddle.stack(ffn1_weight_scale, axis=0)
|
||||
ffn2_weight_scale = paddle.stack(ffn2_weight_scale, axis=0)
|
||||
ffn1_super_scales = paddle.stack(ffn1_super_scales, axis=0)
|
||||
ffn2_super_scales = paddle.stack(ffn2_super_scales, axis=0)
|
||||
ffn1_code_scale = paddle.stack(ffn1_code_scale, axis=0)
|
||||
ffn2_code_scale = paddle.stack(ffn2_code_scale, axis=0)
|
||||
ffn1_code_zp = paddle.stack(ffn1_code_zp, axis=0)
|
||||
ffn2_code_zp = paddle.stack(ffn2_code_zp, axis=0)
|
||||
|
||||
name_tensor_map = {
|
||||
"moe_ffn1_weight": ffn1_weight,
|
||||
"moe_ffn2_weight": ffn2_weight,
|
||||
"moe_ffn1_weight_scale": ffn1_weight_scale,
|
||||
"moe_ffn2_weight_scale": ffn2_weight_scale,
|
||||
"moe_ffn1_super_scales": ffn1_super_scales,
|
||||
"moe_ffn2_super_scales": ffn2_super_scales,
|
||||
"moe_ffn1_code_scale": ffn1_code_scale,
|
||||
"moe_ffn2_code_scale": ffn2_code_scale,
|
||||
"moe_ffn1_code_zp": ffn1_code_zp,
|
||||
"moe_ffn2_code_zp": ffn2_code_zp
|
||||
}
|
||||
for name, tensor in name_tensor_map.items():
|
||||
create_and_set_parameter(layer, name, tensor)
|
||||
|
||||
def create_weights(self, layer: nn.Layer, state_dict):
|
||||
"""
|
||||
Paddle cutlass create weight process.
|
||||
"""
|
||||
pass
|
||||
|
||||
def apply(
|
||||
self,
|
||||
layer: nn.Layer,
|
||||
x: paddle.Tensor,
|
||||
gate_out: paddle.Tensor,
|
||||
) -> paddle.Tensor:
|
||||
"""
|
||||
Use Wint2 Triton Fusedmoe compute Fused MoE.
|
||||
"""
|
||||
|
||||
from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch
|
||||
(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
permute_indices_per_token,
|
||||
topk_weights,
|
||||
topk_idx,
|
||||
expert_idx_per_token,
|
||||
) = moe_expert_dispatch(
|
||||
x,
|
||||
gate_out,
|
||||
layer.gate_correction_bias,
|
||||
(layer.moe_ffn1_in_scale if hasattr(layer, "moe_ffn1_in_scale")
|
||||
else None), # if set, permute_input will be int8_t
|
||||
layer.top_k,
|
||||
False,
|
||||
topk_only_mode=False,
|
||||
)
|
||||
|
||||
ffn_out = fastdeploy.model_executor.ops.gpu.moe_expert_ffn_wint2(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
layer.moe_ffn1_weight,
|
||||
layer.moe_ffn2_weight,
|
||||
None,
|
||||
layer.moe_ffn1_super_scales,
|
||||
layer.moe_ffn2_super_scales,
|
||||
layer.moe_ffn1_weight_scale,
|
||||
layer.moe_ffn1_code_scale,
|
||||
layer.moe_ffn1_code_zp,
|
||||
layer.moe_ffn2_weight_scale,
|
||||
layer.moe_ffn2_code_scale,
|
||||
layer.moe_ffn2_code_zp,
|
||||
False,
|
||||
)
|
||||
|
||||
from fastdeploy.model_executor.ops.gpu import moe_expert_reduce
|
||||
|
||||
fused_moe_out = moe_expert_reduce(
|
||||
ffn_out,
|
||||
topk_weights,
|
||||
permute_indices_per_token,
|
||||
topk_idx,
|
||||
None,
|
||||
norm_topk_prob=True,
|
||||
routed_scaling_factor=1.0,
|
||||
)
|
||||
|
||||
return fused_moe_out
|
||||
@@ -1,273 +0,0 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import os
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from fastdeploy.model_executor.layers.moe.moe import MoELayer
|
||||
from fastdeploy.model_executor.layers.utils import get_tensor
|
||||
|
||||
|
||||
class TextMoELayer(MoELayer):
|
||||
"""
|
||||
MoELayer is a layer that performs MoE (Mixture of Experts) computation.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
初始化函数,用于设置类的属性和方法。
|
||||
参数:
|
||||
- args (tuple, optional): 可变长度的位置参数列表,默认为空元组。
|
||||
- kwargs (dict, optional): 关键字参数字典,默认为空字典。
|
||||
返回值:
|
||||
无返回值,直接修改类的属性和方法。
|
||||
"""
|
||||
kwargs["moe_tag"] = "Text"
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def load_gate_state_dict(self, state_dict):
|
||||
"""
|
||||
加载门状态字典,用于初始化网络参数。
|
||||
将从给定的状态字典中弹出的参数赋值给网络的门参数。
|
||||
|
||||
Args:
|
||||
state_dict (OrderedDict): 包含网络门参数的字典。
|
||||
|
||||
Returns:
|
||||
tuple (list, list): 返回两个列表,分别代表上阶网关投影和下阶投影的参数。
|
||||
每个元素都是一个列表,长度为网络的专家数量。
|
||||
"""
|
||||
up_gate_proj_weight = []
|
||||
up_gate_proj_weight_scale = []
|
||||
down_proj_weight = []
|
||||
down_proj_weight_scale = []
|
||||
for j in range(0, self.num_experts):
|
||||
up_gate_proj_weight.append(
|
||||
get_tensor(state_dict.pop(self.ffn1_expert_weight_key.format(j)))
|
||||
)
|
||||
down_proj_weight.append(
|
||||
get_tensor(state_dict.pop(self.ffn2_expert_weight_key.format(j)))
|
||||
)
|
||||
return (
|
||||
up_gate_proj_weight,
|
||||
down_proj_weight,
|
||||
up_gate_proj_weight_scale,
|
||||
down_proj_weight_scale,
|
||||
)
|
||||
|
||||
def load_gate_correction_bias(self, state_dict):
|
||||
"""
|
||||
加载网关校正偏置。如果使用了网关校正偏置,则从state_dict中获取相应的张量并设置到网关校正偏置上。
|
||||
参数:
|
||||
state_dict (OrderedDict): 包含模型参数和状态的字典。
|
||||
返回值:
|
||||
无返回值,直接修改了网关校正偏置的值。
|
||||
"""
|
||||
if self.moe_config.moe_use_gate_correction_bias:
|
||||
gate_correction_bias_tensor = get_tensor(
|
||||
state_dict[self.gate_correction_bias_key]
|
||||
)
|
||||
self.gate_correction_bias.set_value(
|
||||
gate_correction_bias_tensor[0].unsqueeze(0)
|
||||
)
|
||||
|
||||
|
||||
class ImageMoELayer(MoELayer):
|
||||
"""
|
||||
MoELayer is a layer that performs MoE (Mixture of Experts) computation.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*args,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
初始化函数,用于设置类的属性和方法。
|
||||
参数:
|
||||
- args (tuple, optional): 可变长度的位置参数列表,默认为空元组。
|
||||
- kwargs (dict, optional): 关键字参数字典,默认为空字典。
|
||||
返回值:
|
||||
无返回值,直接修改类的属性和方法。
|
||||
"""
|
||||
moe_quant_type = os.getenv("ELLM_MM_IMAGE_QUANT_TYPE", None)
|
||||
if moe_quant_type is not None:
|
||||
kwargs["moe_quant_type"] = moe_quant_type
|
||||
kwargs["moe_tag"] = "Image"
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def load_gate_state_dict(self, state_dict):
|
||||
"""
|
||||
加载门状态字典。
|
||||
从给定的状态字典中提取并返回两个专家的上下关门投影权重,以及两个专家的下降投影权重。
|
||||
参数:
|
||||
state_dict (OrderedDict): 包含网络参数的有序字典。
|
||||
返回值:
|
||||
tuple (list, list),分别是两个专家的上下关门投影权重和两个专家的下降投影权重,都是列表类型。
|
||||
"""
|
||||
up_gate_proj_weight = []
|
||||
up_gate_proj_weight_scale = []
|
||||
down_proj_weight = []
|
||||
down_proj_weight_scale = []
|
||||
for j in range(self.num_experts, self.num_experts + self.num_experts):
|
||||
up_gate_proj_weight.append(
|
||||
get_tensor(state_dict.pop(self.ffn1_expert_weight_key.format(j)))
|
||||
)
|
||||
down_proj_weight.append(
|
||||
get_tensor(state_dict.pop(self.ffn2_expert_weight_key.format(j)))
|
||||
)
|
||||
return (
|
||||
up_gate_proj_weight,
|
||||
down_proj_weight,
|
||||
up_gate_proj_weight_scale,
|
||||
down_proj_weight_scale,
|
||||
)
|
||||
|
||||
def load_gate_correction_bias(self, state_dict):
|
||||
"""
|
||||
加载门级别校正偏置参数,如果使用门级别校正偏置则从state_dict中获取并设置到gate_correction_bias中。
|
||||
参数:
|
||||
state_dict (OrderedDict): 模型的状态字典,包含所有需要被加载的参数。
|
||||
返回值:
|
||||
无返回值,直接修改了gate_correction_bias的值。
|
||||
"""
|
||||
if self.moe_config.moe_use_gate_correction_bias:
|
||||
gate_correction_bias_tensor = get_tensor(
|
||||
state_dict[self.gate_correction_bias_key]
|
||||
)
|
||||
self.gate_correction_bias.set_value(
|
||||
gate_correction_bias_tensor[1].unsqueeze(0)
|
||||
)
|
||||
|
||||
|
||||
class MultimodalityMoeLayer(nn.Layer):
|
||||
"""
|
||||
Multimodality MOE Layer
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
inference_args,
|
||||
layer_name,
|
||||
layer_idx,
|
||||
):
|
||||
"""
|
||||
初始化一个 MoELayer。
|
||||
|
||||
Args:
|
||||
inference_args (InferenceArgs): 推理参数类,包含了所有必要的配置信息。
|
||||
layer_name (str): 当前 MoE Layer 的名称。
|
||||
layer_idx (int): 当前 MoE Layer 在模型中的索引。
|
||||
|
||||
Returns:
|
||||
None, 无返回值。
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self.text_moe_layer = TextMoELayer(
|
||||
inference_args=inference_args,
|
||||
moe_config=inference_args.moe_config,
|
||||
layer_name=layer_name + ".text",
|
||||
gate_weight_key=f"ernie.layers.{layer_idx}.mlp.gate.weight",
|
||||
ffn1_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
|
||||
+ ".{}.up_gate_proj.weight",
|
||||
ffn2_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
|
||||
+ ".{}.down_proj.weight",
|
||||
gate_correction_bias_key=f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias",
|
||||
ffn1_bias_key=None,
|
||||
ffn2_bias_key=None,
|
||||
ffn1_shared_weight_key=None,
|
||||
ffn1_shared_bias_key=None,
|
||||
ffn2_shared_weight_key=None,
|
||||
ffn2_shared_bias_key=None,
|
||||
layer_idx=layer_idx,
|
||||
)
|
||||
|
||||
self.image_moe_layer = ImageMoELayer(
|
||||
inference_args=inference_args,
|
||||
moe_config=inference_args.moe_config_1,
|
||||
layer_name=layer_name + ".image",
|
||||
gate_weight_key=f"ernie.layers.{layer_idx}.mlp.gate.weight_1",
|
||||
ffn1_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
|
||||
+ ".{}.up_gate_proj.weight",
|
||||
ffn2_expert_weight_key=f"ernie.layers.{layer_idx}.mlp.experts"
|
||||
+ ".{}.down_proj.weight",
|
||||
gate_correction_bias_key=f"ernie.layers.{layer_idx}.mlp.moe_statics.e_score_correction_bias",
|
||||
ffn1_bias_key=None,
|
||||
ffn2_bias_key=None,
|
||||
ffn1_shared_weight_key=None,
|
||||
ffn1_shared_bias_key=None,
|
||||
ffn2_shared_weight_key=None,
|
||||
ffn2_shared_bias_key=None,
|
||||
layer_idx=layer_idx,
|
||||
)
|
||||
|
||||
def load_state_dict(self, state_dict):
|
||||
"""
|
||||
加载模型参数。
|
||||
将给定的字典中的参数覆盖到当前模型上,并返回一个新的字典,其中包含未被覆盖的键值对。
|
||||
|
||||
Args:
|
||||
state_dict (dict): 包含了要加载的模型参数的字典。
|
||||
|
||||
Returns:
|
||||
dict: 包含未被覆盖的键值对的字典。
|
||||
"""
|
||||
self.text_moe_layer.load_state_dict(state_dict)
|
||||
self.image_moe_layer.load_state_dict(state_dict)
|
||||
state_dict.pop(self.text_moe_layer.gate_correction_bias_key)
|
||||
|
||||
def forward(self, x, **kwargs):
|
||||
"""
|
||||
前向计算函数,将输入的张量进行处理并返回结果。
|
||||
该函数接受以下键值对参数:
|
||||
- token_type_ids (Optional, Tensor, default=None): 一个bool型Tensor,用于指定每个元素是否为文本类型(值为0)或图像类型(值为1)。
|
||||
如果未提供此参数,则会引发AssertionError。
|
||||
返回值是一个Tensor,形状与输入相同,表示处理后的结果。
|
||||
|
||||
Args:
|
||||
x (Tensor): 输入张量,形状为[token_num, hidden_size],其中token_num是序列长度,hidden_size是隐藏状态维度。
|
||||
kwargs (dict, optional): 可选参数字典,默认为None,包含以下键值对:
|
||||
- token_type_ids (Tensor, optional): 一个bool型Tensor,用于指定每个元素是否为文本类型(值为0)或图像类型(值为1),默认为None。
|
||||
|
||||
Returns:
|
||||
Tensor: 一个Tensor,形状与输入相同,表示处理后的结果。
|
||||
|
||||
Raises:
|
||||
AssertionError: 当未提供token_type_ids参数时会引发此错误。
|
||||
"""
|
||||
token_type_ids = kwargs.get("token_type_ids", None)
|
||||
assert token_type_ids is not None
|
||||
|
||||
# x.shape is [token_num, hidden_size]
|
||||
fused_moe_out = paddle.zeros_like(x)
|
||||
|
||||
text_mask = token_type_ids == 0 # [token_num]
|
||||
image_mask = token_type_ids == 1
|
||||
|
||||
if text_mask.any():
|
||||
text_out = self.text_moe_layer(x[text_mask])
|
||||
fused_moe_out[text_mask] = text_out
|
||||
|
||||
if image_mask.any():
|
||||
image_out = self.image_moe_layer(x[image_mask])
|
||||
fused_moe_out[image_mask] = image_out
|
||||
|
||||
return fused_moe_out
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -14,34 +14,13 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from paddlenlp.utils.log import logger
|
||||
from paddleformers.utils.log import logger
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.model_executor.layers.utils import get_tensor
|
||||
|
||||
from .cutlass_fused_moe import CutlassFusedMoeMethod
|
||||
|
||||
|
||||
@dataclass
|
||||
class MoEComputeParams:
|
||||
"""
|
||||
some params for computing MoE.
|
||||
it is given to different compute methods.
|
||||
"""
|
||||
global_num_experts: int = -1
|
||||
top_k: int = -1
|
||||
hidden_size: int = -1
|
||||
num_local_experts: int = -1
|
||||
moe_intermediate_size: int = -1
|
||||
|
||||
tp_size: int = -1
|
||||
ep_size: int = -1
|
||||
dp_size: int = -1
|
||||
|
||||
moe_quant_type: str = ""
|
||||
|
||||
|
||||
class FusedMoE(nn.Layer):
|
||||
"""
|
||||
@@ -50,174 +29,195 @@ class FusedMoE(nn.Layer):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
llm_config,
|
||||
fd_config,
|
||||
moe_intermediate_size: int = -1,
|
||||
num_experts: int = -1,
|
||||
expert_id_offset: int = 0,
|
||||
top_k: int = -1,
|
||||
moe_use_gate_correction_bias: bool = False,
|
||||
moe_quant_type: str = "weight_only_int4",
|
||||
layer_idx: int = -1,
|
||||
gate_weight_key=None,
|
||||
gate_correction_bias_key=None,
|
||||
ffn1_expert_weight_key=None,
|
||||
ffn2_expert_weight_key=None,
|
||||
moe_ffn1_bias_keys=None,
|
||||
moe_ffn2_bias_keys=None,
|
||||
moe_ffn1_weight_scale_keys=None,
|
||||
moe_ffn2_weight_scale_keys=None,
|
||||
moe_ffn1_in_scale_keys=None,
|
||||
moe_ffn2_in_scale_keys=None,
|
||||
moe_tag: str = "",
|
||||
weight_key_map: dict = {},
|
||||
):
|
||||
"""
|
||||
Initialize the Moe layer with given parameters.
|
||||
Args:
|
||||
llm_config (LLMConfig): Arguments related to inference, containing
|
||||
fd_config (FDConfig): Arguments related to inference, containing
|
||||
attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
|
||||
num_attention_heads, and ffn_hidden_size.
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self.llm_config = llm_config
|
||||
self.fd_config = fd_config
|
||||
self.layer_idx = layer_idx
|
||||
self.tp_size = llm_config.parallel_config.mp_size
|
||||
self.ep_size = llm_config.parallel_config.ep_size
|
||||
|
||||
self.moe_use_gate_correction_bias = moe_use_gate_correction_bias
|
||||
self.tp_size = fd_config.parallel_config.tensor_parallel_degree
|
||||
self.ep_size = fd_config.parallel_config.expert_parallel_degree
|
||||
self.ep_rank = fd_config.parallel_config.expert_parallel_rank
|
||||
|
||||
assert (self.tp_size >= 1 and self.ep_size == 1) or \
|
||||
(self.tp_size == 1 and self.ep_size > 1), \
|
||||
'MoE only support parallelism on TP or EP dimension.'
|
||||
|
||||
self.hidden_size = fd_config.model_config.hidden_size
|
||||
self.moe_config = fd_config.moe_config
|
||||
|
||||
self.hidden_size = llm_config.model_config.hidden_size
|
||||
self.moe_config = llm_config.moe_config
|
||||
self.use_offline_quant = llm_config.tmp_config.use_offline_quant
|
||||
moe_tag = self.llm_config.moe_config.moe_tag
|
||||
logger.info(f"{moe_tag}MoE is running in {moe_quant_type} mode")
|
||||
|
||||
self.moe_quant_type = moe_quant_type
|
||||
self.num_experts = num_experts
|
||||
self.num_local_experts = self.num_experts // self.ep_size
|
||||
|
||||
logger.info(f'''MoE config is num_experts:{num_experts},
|
||||
top_k:{top_k},
|
||||
hidden_size:{self.hidden_size},
|
||||
moe_intermediate_size:{moe_intermediate_size}''')
|
||||
logger.info(
|
||||
f"MoE is running on moe_quant_type: {self.moe_quant_type}, ep:{self.ep_size}, tp:{self.tp_size} mode"
|
||||
)
|
||||
self.moe_intermediate_size = moe_intermediate_size // self.tp_size
|
||||
|
||||
self.gate_weight_key = gate_weight_key
|
||||
self.gate_correction_bias_key = gate_correction_bias_key
|
||||
self.top_k = top_k
|
||||
self.hidden_size = self.hidden_size
|
||||
self.moe_intermediate_size = moe_intermediate_size // self.tp_size
|
||||
self.weight_key_map = weight_key_map
|
||||
|
||||
self.ffn1_expert_weight_key = ffn1_expert_weight_key
|
||||
self.ffn2_expert_weight_key = ffn2_expert_weight_key
|
||||
self.ffn1_bias_key = moe_ffn1_bias_keys
|
||||
self.ffn2_bias_key = moe_ffn2_bias_keys
|
||||
self.use_method = envs.FD_MOE_BACKEND.lower()
|
||||
self.gate_correction_bias = None
|
||||
self.moe_tag = moe_tag
|
||||
|
||||
if self.moe_quant_type == "w4a8":
|
||||
# below keys are only used in MoE W4A8!
|
||||
self.ffn1_expert_weight_scale_key = moe_ffn1_weight_scale_keys
|
||||
self.ffn2_expert_weight_scale_key = moe_ffn2_weight_scale_keys
|
||||
self.ffn1_expert_in_scale_key = moe_ffn1_in_scale_keys
|
||||
self.ffn2_expert_in_scale_key = moe_ffn2_in_scale_keys
|
||||
if self.ep_size > 1:
|
||||
expert_id_offset = expert_id_offset + self.ep_rank * self.num_local_experts
|
||||
|
||||
self.compute_method = CutlassFusedMoeMethod()
|
||||
self.expert_id_offset = expert_id_offset
|
||||
|
||||
self.moe_compute_params = MoEComputeParams()
|
||||
self.moe_compute_params.global_num_experts = self.num_experts
|
||||
self.moe_compute_params.top_k = top_k
|
||||
self.moe_compute_params.hidden_size = self.hidden_size
|
||||
self.moe_compute_params.num_local_experts = self.num_local_experts
|
||||
self.moe_compute_params.moe_quant_type = self.moe_quant_type
|
||||
self.moe_compute_params.moe_intermediate_size = self.moe_intermediate_size
|
||||
self.moe_compute_params.ep_size = self.ep_size
|
||||
self.moe_compute_params.tp_size = self.tp_size
|
||||
if fd_config.quant_config:
|
||||
self.quant_method = fd_config.quant_config.get_quant_method(self)
|
||||
else:
|
||||
# now, no quant method(w_fp16 a_fp16) can't get from quant_config, we will optimize it in future
|
||||
from .fused_moe_cutlass_backend import CutlassMoEMethod
|
||||
self.quant_method = CutlassMoEMethod(None)
|
||||
|
||||
def load_gate_state_dict(self, state_dict):
|
||||
if self.ep_size > 1:
|
||||
self.quant_method.init_ep(self)
|
||||
|
||||
logger.info(
|
||||
f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset+self.num_local_experts}), \
|
||||
{top_k=}, hidden_size={self.hidden_size}, {moe_intermediate_size=}, \
|
||||
, ep_size={self.ep_size}, \
|
||||
tp_size={self.tp_size}.")
|
||||
|
||||
def load_experts_weight(self, state_dict: dict,
|
||||
ffn1_expert_weight_key: str,
|
||||
ffn2_expert_weight_key: str):
|
||||
"""
|
||||
load_gate_state_dict function.
|
||||
Load experts weight from state_dict.
|
||||
Args:
|
||||
state_dict (dict): The state_dict of model.
|
||||
ffn1_expert_weight_key (str): The key of ffn1 expert weight.
|
||||
ffn2_expert_weight_key (str): The key of ffn2 expert weight.
|
||||
"""
|
||||
up_gate_proj_weight = []
|
||||
up_gate_proj_weight_scale = []
|
||||
down_proj_weight = []
|
||||
down_proj_weight_scale = []
|
||||
for j in range(self.num_experts):
|
||||
up_gate_proj_weight.append(
|
||||
get_tensor(
|
||||
state_dict.pop(self.ffn1_expert_weight_key.format(j))))
|
||||
down_proj_weight.append(
|
||||
get_tensor(
|
||||
state_dict.pop(self.ffn2_expert_weight_key.format(j))))
|
||||
return up_gate_proj_weight, down_proj_weight
|
||||
ffn1_weights = []
|
||||
ffn2_weights = []
|
||||
is_ffn_merged = ffn1_expert_weight_key.format(
|
||||
self.expert_id_offset) in state_dict
|
||||
if is_ffn_merged:
|
||||
for i in range(self.num_local_experts):
|
||||
expert_idx = self.expert_id_offset + i
|
||||
ffn1_weights.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn1_expert_weight_key.format(expert_idx))))
|
||||
ffn2_weights.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn2_expert_weight_key.format(expert_idx))))
|
||||
else:
|
||||
gate_expert_weight_key = ffn1_expert_weight_key.replace(
|
||||
"up_gate_proj", "gate_proj")
|
||||
up_expert_weight_key = ffn1_expert_weight_key.replace(
|
||||
"up_gate_proj", "up_proj")
|
||||
for j in range(self.num_local_experts):
|
||||
expert_idx = self.expert_id_offset + j
|
||||
gate = get_tensor(
|
||||
state_dict.pop(gate_expert_weight_key.format(expert_idx)))
|
||||
up = get_tensor(
|
||||
state_dict.pop(up_expert_weight_key.format(expert_idx)))
|
||||
ffn1_weights.append(paddle.concat([gate, up], axis=-1))
|
||||
ffn2_weights.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
ffn2_expert_weight_key.format(expert_idx))))
|
||||
return ffn1_weights, ffn2_weights
|
||||
|
||||
def load_state_dict(self, state_dict, is_update: bool = False):
|
||||
def extract_moe_ffn_weights(self, state_dict: dict):
|
||||
"""
|
||||
Extract MoE FFN weights from state dict based on weight key mapping.
|
||||
|
||||
Args:
|
||||
state_dict (dict): Model state dictionary containing the weights.
|
||||
|
||||
Returns:
|
||||
tuple: A tuple containing two lists:
|
||||
- ffn1_weights: List of tensors for first FFN layer weights
|
||||
- ffn2_weights: List of tensors for second FFN layer weights
|
||||
|
||||
Raises:
|
||||
AssertionError: If required weight keys are missing or number of weights
|
||||
doesn't match number of local experts.
|
||||
"""
|
||||
ffn1_expert_weight_key = self.weight_key_map.get(
|
||||
"ffn1_expert_weight_key", None)
|
||||
ffn2_expert_weight_key = self.weight_key_map.get(
|
||||
"ffn2_expert_weight_key", None)
|
||||
assert ffn1_expert_weight_key is not None, "ffn1_expert_weight_key should not be none."
|
||||
assert ffn2_expert_weight_key is not None, "ffn2_expert_weight_key should not be none."
|
||||
|
||||
ffn1_weights, ffn2_weights = self.load_experts_weight(
|
||||
state_dict, ffn1_expert_weight_key, ffn2_expert_weight_key)
|
||||
assert len(
|
||||
ffn1_weights
|
||||
) == self.num_local_experts, "ffn1_weights length should be equal to num_local_experts."
|
||||
assert len(
|
||||
ffn2_weights
|
||||
) == self.num_local_experts, "ffn2_weights length should be equal to num_local_experts."
|
||||
|
||||
return ffn1_weights, ffn2_weights
|
||||
|
||||
def extract_gate_correction_bias(self, gate_correction_bias_key,
|
||||
state_dict):
|
||||
"""
|
||||
extract_gate_correction_bias function.
|
||||
"""
|
||||
gate_correction_bias_tensor = get_tensor(
|
||||
state_dict.pop(gate_correction_bias_key)).astype("float32")
|
||||
return gate_correction_bias_tensor
|
||||
|
||||
def load_state_dict(self, state_dict):
|
||||
"""
|
||||
load_state_dict function.
|
||||
"""
|
||||
# gate
|
||||
if not is_update:
|
||||
gate_weight_tensor = get_tensor(state_dict.pop(self.gate_weight_key))
|
||||
self.gate_weight = self.create_parameter(
|
||||
shape=gate_weight_tensor.shape,
|
||||
dtype="float32",
|
||||
)
|
||||
self.gate_weight.set_value(gate_weight_tensor)
|
||||
|
||||
# gate_correction_bias
|
||||
self.gate_correction_bias_key = self.weight_key_map.get(
|
||||
"gate_correction_bias_key", None)
|
||||
if self.gate_correction_bias_key is not None and self.gate_correction_bias_key in state_dict:
|
||||
self.moe_use_gate_correction_bias = True
|
||||
else:
|
||||
self.moe_use_gate_correction_bias = False
|
||||
if self.moe_use_gate_correction_bias:
|
||||
gate_correction_bias_tensor = get_tensor(
|
||||
state_dict.pop(self.gate_correction_bias_key))
|
||||
|
||||
gate_correction_bias_tensor = self.extract_gate_correction_bias(
|
||||
self.gate_correction_bias_key, state_dict)
|
||||
self.gate_correction_bias = self.create_parameter(
|
||||
shape=gate_correction_bias_tensor.shape,
|
||||
dtype="float32",
|
||||
)
|
||||
|
||||
self.gate_correction_bias.set_value(gate_correction_bias_tensor)
|
||||
|
||||
gate_weight_key = self.weight_key_map.get("gate_weight_key", None)
|
||||
assert gate_weight_key is not None, "gate_weight_key should not be None, please check model checkpoints"
|
||||
|
||||
gate_weight_tensor = get_tensor(state_dict.pop(gate_weight_key))
|
||||
|
||||
self.gate_weight = self.create_parameter(
|
||||
shape=gate_weight_tensor.shape,
|
||||
dtype="float32",
|
||||
)
|
||||
self.gate_weight.set_value(gate_weight_tensor.astype("float32"))
|
||||
|
||||
if self.fd_config.model_config.is_quantized:
|
||||
self.quant_method.process_prequanted_weights(self, state_dict)
|
||||
else:
|
||||
self.gate_correction_bias = None
|
||||
self.quant_method.create_weights(self, state_dict)
|
||||
|
||||
up_gate_proj_weight, down_proj_weight = self.load_gate_state_dict(
|
||||
state_dict)
|
||||
|
||||
weight1_scale = None
|
||||
weight2_scale = None
|
||||
ffn1_in_scale = None
|
||||
ffn2_in_scale = None
|
||||
if self.moe_quant_type == "w4a8":
|
||||
weight1_scale = []
|
||||
weight2_scale = []
|
||||
ffn1_in_scale = []
|
||||
ffn2_in_scale = []
|
||||
|
||||
for j in range(self.num_experts):
|
||||
weight1_scale.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
self.ffn1_expert_weight_scale_key.format(
|
||||
self.layer_idx, j))))
|
||||
weight2_scale.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
self.ffn2_expert_weight_scale_key.format(
|
||||
self.layer_idx, j))))
|
||||
ffn1_in_scale.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
self.ffn1_expert_in_scale_key.format(
|
||||
self.layer_idx, j))))
|
||||
ffn2_in_scale.append(
|
||||
get_tensor(
|
||||
state_dict.pop(
|
||||
self.ffn2_expert_in_scale_key.format(
|
||||
self.layer_idx, j))))
|
||||
|
||||
# other weight is with compute_method
|
||||
# different method may have different way to create weights
|
||||
self.compute_method.create_weights(self, self.moe_compute_params,
|
||||
up_gate_proj_weight,
|
||||
down_proj_weight, None, None,
|
||||
weight1_scale, weight2_scale,
|
||||
ffn1_in_scale, ffn2_in_scale)
|
||||
|
||||
def forward(self, x, **kwargs):
|
||||
def forward(self, x: paddle.Tensor):
|
||||
"""
|
||||
Defines the forward computation of the moe layer.
|
||||
|
||||
@@ -225,13 +225,9 @@ class FusedMoE(nn.Layer):
|
||||
x (Tensor): Input tensor to the moe layer.
|
||||
|
||||
Returns:
|
||||
Tensor: Output tensor.
|
||||
Tensor: Output tensor.s
|
||||
|
||||
"""
|
||||
|
||||
out = self.compute_method.apply(self, self.moe_compute_params, x)
|
||||
if self.tp_size > 1:
|
||||
from fastdeploy.distributed.communication_op import \
|
||||
tensor_model_parallel_all_reduce
|
||||
tensor_model_parallel_all_reduce(out)
|
||||
gate_out = paddle.matmul(x.cast("float32"), self.gate_weight)
|
||||
out = self.quant_method.apply(self, x, gate_out)
|
||||
return out
|
||||
|
||||
@@ -1,126 +0,0 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import os
|
||||
import paddle
|
||||
import fastdeploy
|
||||
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
|
||||
from fastdeploy.model_executor.layers.moe.moe import MoELayer
|
||||
|
||||
|
||||
class MoeTPDecoerDeepDeepGEMMLayer(MoELayer):
|
||||
"""
|
||||
MoeTPDecoerDeepDeepGEMMLayer
|
||||
"""
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def forward(self, x, **kwargs):
|
||||
"""
|
||||
forward
|
||||
"""
|
||||
gate_out = paddle.matmul(x.cast("float32"), self.gate_weight)
|
||||
if os.getenv("EP_DECODER_PERF_TEST", "False") == "True":
|
||||
gate_out = paddle.rand(shape=gate_out.shape, dtype=gate_out.dtype)
|
||||
ffn1_out = paddle.empty(
|
||||
[
|
||||
self.num_local_experts,
|
||||
self.max_batch_size,
|
||||
self.moe_intermediate_size * 2,
|
||||
],
|
||||
dtype=self._dtype,
|
||||
)
|
||||
|
||||
ffn_out = paddle.empty(
|
||||
[
|
||||
self.num_local_experts,
|
||||
self.max_batch_size,
|
||||
self.embed_dim,
|
||||
],
|
||||
dtype=self._dtype,
|
||||
)
|
||||
|
||||
topk_idx, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
|
||||
gate_out,
|
||||
(
|
||||
self.gate_correction_bias
|
||||
if self.moe_config.moe_use_gate_correction_bias
|
||||
else None
|
||||
),
|
||||
self.top_k,
|
||||
True, # apply_norm_weight
|
||||
False,
|
||||
)
|
||||
permute_input, token_nums_per_expert, permute_indices_per_token = (
|
||||
fastdeploy.model_executor.ops.gpu.moe_deepgemm_permute(
|
||||
x, topk_idx, self.num_local_experts, self.max_batch_size
|
||||
)
|
||||
)
|
||||
|
||||
expected_m = 128
|
||||
|
||||
permute_input_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
|
||||
permute_input, token_nums_per_expert, 128
|
||||
)
|
||||
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
|
||||
(permute_input_fp8, scale),
|
||||
(
|
||||
self.moe_ffn1_weight,
|
||||
self.moe_ffn1_weight_scale,
|
||||
),
|
||||
ffn1_out,
|
||||
token_nums_per_expert,
|
||||
expected_m,
|
||||
)
|
||||
|
||||
act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked(
|
||||
ffn1_out, token_nums_per_expert
|
||||
)
|
||||
|
||||
act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant(
|
||||
act_out, token_nums_per_expert, 128
|
||||
)
|
||||
|
||||
deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
|
||||
(act_out_fp8, scale),
|
||||
(
|
||||
self.moe_ffn2_weight,
|
||||
self.moe_ffn2_weight_scale,
|
||||
),
|
||||
ffn_out,
|
||||
token_nums_per_expert,
|
||||
expected_m,
|
||||
)
|
||||
|
||||
fused_moe_out = fastdeploy.model_executor.ops.gpu.moe_deepgemm_depermute(
|
||||
ffn_out, permute_indices_per_token, topk_idx, topk_weights
|
||||
)[0]
|
||||
|
||||
return fused_moe_out
|
||||
|
||||
|
||||
class MoeTPPrefillDeepDeepGEMMLayer(MoELayer):
|
||||
"""
|
||||
MoeTPPrefillDeepDeepGEMMLayer
|
||||
"""
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def forward(self, x, **kwargs):
|
||||
"""
|
||||
forward
|
||||
"""
|
||||
raise NotImplementedError("Prefill is comming soon...")
|
||||
@@ -0,0 +1,198 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import triton
|
||||
import triton.language as tl
|
||||
|
||||
|
||||
@triton.jit
|
||||
def fused_moe_kernel_paddle(
|
||||
a_ptr,
|
||||
b_ptr,
|
||||
c_ptr,
|
||||
a_scale_ptr,
|
||||
b_scale_ptr,
|
||||
topk_weights_ptr,
|
||||
sorted_token_ids_ptr,
|
||||
expert_ids_ptr,
|
||||
num_tokens_post_padded_ptr,
|
||||
|
||||
# Matrix dimensions
|
||||
N,
|
||||
K,
|
||||
num_tokens_post_padded,
|
||||
num_valid_tokens,
|
||||
stride_am,
|
||||
stride_ak,
|
||||
stride_be,
|
||||
stride_bk,
|
||||
stride_bn,
|
||||
stride_cm,
|
||||
stride_cn,
|
||||
stride_asm,
|
||||
stride_ask,
|
||||
stride_bse,
|
||||
stride_bsk,
|
||||
stride_bsn,
|
||||
# Block size for block-wise fp8 quantization
|
||||
group_n: tl.constexpr,
|
||||
group_k: tl.constexpr,
|
||||
# Meta-parameters
|
||||
BLOCK_SIZE_M: tl.constexpr,
|
||||
BLOCK_SIZE_N: tl.constexpr,
|
||||
BLOCK_SIZE_K: tl.constexpr,
|
||||
GROUP_SIZE_M: tl.constexpr,
|
||||
MUL_ROUTED_WEIGHT: tl.constexpr,
|
||||
top_k: tl.constexpr,
|
||||
compute_type_enum: tl.constexpr,
|
||||
use_fp8_w8a8: tl.constexpr,
|
||||
use_int8_w8a16: tl.constexpr,
|
||||
even_Ks: tl.constexpr,
|
||||
):
|
||||
"""
|
||||
|
||||
Key Parameters:
|
||||
- A: The input tensor representing tokens with shape (*, K), where '*' can
|
||||
be any shape representing batches and K is the feature dimension of
|
||||
each token.
|
||||
- B: The stacked MOE weight tensor with shape (E, N, K), where E is
|
||||
the number of experts, K is the input feature dimension, and N is
|
||||
the output feature dimension.
|
||||
- C: The output cache tensor with shape (M, topk, N), where M is the
|
||||
total number of tokens post padding, topk is the number of times
|
||||
each token is repeated, and N is the output feature dimension.
|
||||
- sorted_token_ids: A tensor containing the sorted indices of tokens,
|
||||
repeated topk times and arranged by the expert index they are
|
||||
assigned to.
|
||||
- expert_ids: A tensor containing the indices of the expert for each
|
||||
block. It determines which expert matrix from B should be used for
|
||||
each block in A.
|
||||
This kernel performs the multiplication of a token by its corresponding
|
||||
expert matrix as determined by `expert_ids`. The sorting of
|
||||
`sorted_token_ids` by expert index and padding ensures divisibility by
|
||||
BLOCK_SIZE_M, which is necessary to maintain consistency in block matrix
|
||||
multiplication across different blocks processed by the same expert.
|
||||
"""
|
||||
pid = tl.program_id(axis=0)
|
||||
num_pid_m = tl.cdiv(num_tokens_post_padded, BLOCK_SIZE_M)
|
||||
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
|
||||
num_pid_in_group = GROUP_SIZE_M * num_pid_n
|
||||
group_id = pid // num_pid_in_group
|
||||
first_pid_m = group_id * GROUP_SIZE_M
|
||||
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
|
||||
pid_m = first_pid_m + ((pid % num_pid_in_group) % group_size_m)
|
||||
pid_n = (pid % num_pid_in_group) // group_size_m
|
||||
|
||||
assert compute_type_enum == 1
|
||||
compute_type = tl.bfloat16
|
||||
|
||||
num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
|
||||
if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
|
||||
return
|
||||
offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
|
||||
offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
|
||||
token_mask = offs_token < num_valid_tokens
|
||||
|
||||
offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
|
||||
offs_k = tl.arange(0, BLOCK_SIZE_K)
|
||||
a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am +
|
||||
offs_k[None, :] * stride_ak)
|
||||
|
||||
off_experts = tl.load(expert_ids_ptr + pid_m)
|
||||
b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
|
||||
offs_bn[None, :] * stride_bn)
|
||||
|
||||
if use_int8_w8a16:
|
||||
b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[
|
||||
None, :] * stride_bsn
|
||||
b_scale = tl.load(b_scale_ptrs)
|
||||
|
||||
if use_fp8_w8a8:
|
||||
if group_k > 0 and group_n > 0:
|
||||
a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
|
||||
offs_bsn = offs_bn // group_n
|
||||
b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
|
||||
else:
|
||||
# (Zkk): every expert has one activation scale and weight scale.
|
||||
a_scale = tl.load(a_scale_ptr + off_experts)
|
||||
b_scale = tl.load(b_scale_ptr + off_experts)
|
||||
|
||||
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
|
||||
|
||||
for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
|
||||
if even_Ks:
|
||||
a = tl.load(
|
||||
a_ptrs,
|
||||
mask=token_mask[:, None],
|
||||
other=0.0,
|
||||
)
|
||||
b = tl.load(b_ptrs,
|
||||
cache_modifier=".cv",
|
||||
eviction_policy='evict_first')
|
||||
else:
|
||||
a = tl.load(
|
||||
a_ptrs,
|
||||
mask=token_mask[:, None] &
|
||||
(offs_k[None, :] < K - k * BLOCK_SIZE_K),
|
||||
other=0.0,
|
||||
)
|
||||
b = tl.load(b_ptrs,
|
||||
mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
|
||||
other=0.0)
|
||||
|
||||
# We accumulate along the K dimension.
|
||||
if use_int8_w8a16:
|
||||
accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
|
||||
elif use_fp8_w8a8:
|
||||
if group_k > 0 and group_n > 0:
|
||||
k_start = k * BLOCK_SIZE_K
|
||||
offs_ks = k_start // group_k
|
||||
a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask,
|
||||
mask=token_mask,
|
||||
other=0.0)
|
||||
b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
|
||||
|
||||
accumulator += tl.dot(a, b) * a_scale[:,
|
||||
None] * b_scale[None, :]
|
||||
else:
|
||||
accumulator = tl.dot(a, b, acc=accumulator)
|
||||
else:
|
||||
accumulator += tl.dot(a, b)
|
||||
|
||||
a_ptrs += BLOCK_SIZE_K * stride_ak
|
||||
b_ptrs += BLOCK_SIZE_K * stride_bk
|
||||
|
||||
if MUL_ROUTED_WEIGHT:
|
||||
moe_weight = tl.load(topk_weights_ptr + offs_token,
|
||||
mask=token_mask,
|
||||
other=0)
|
||||
accumulator = accumulator * moe_weight[:, None]
|
||||
if use_int8_w8a16:
|
||||
accumulator = (accumulator * b_scale).to(compute_type)
|
||||
elif use_fp8_w8a8:
|
||||
if group_k > 0 and group_n > 0:
|
||||
accumulator = accumulator.to(compute_type)
|
||||
else:
|
||||
accumulator = (accumulator * a_scale * b_scale).to(compute_type)
|
||||
else:
|
||||
accumulator = accumulator.to(compute_type)
|
||||
# Write back the block of the output
|
||||
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
|
||||
c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[
|
||||
None, :]
|
||||
c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
|
||||
|
||||
tl.store(c_ptrs, accumulator, mask=c_mask)
|
||||
Reference in New Issue
Block a user