mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Iluvatar] refactor attn and moe code (#6887)
This commit is contained in:
@@ -20,7 +20,6 @@ from .block_multihead_attn_backend import BlockAttentionBackend
|
|||||||
from .dsa_attention_backend import DSAAttentionBackend
|
from .dsa_attention_backend import DSAAttentionBackend
|
||||||
from .flash_attn_backend import FlashAttentionBackend
|
from .flash_attn_backend import FlashAttentionBackend
|
||||||
from .flash_mask_attn_backend import FlashMaskAttentionBackend
|
from .flash_mask_attn_backend import FlashMaskAttentionBackend
|
||||||
from .iluvatar_attn_backend import IluvatarAttnBackend
|
|
||||||
from .mla_attention_backend import MLAAttentionBackend
|
from .mla_attention_backend import MLAAttentionBackend
|
||||||
from .moba_attention_backend import PlasAttentionBackend
|
from .moba_attention_backend import PlasAttentionBackend
|
||||||
from .native_paddle_backend import PaddleNativeAttnBackend
|
from .native_paddle_backend import PaddleNativeAttnBackend
|
||||||
@@ -33,7 +32,6 @@ __all__ = [
|
|||||||
"MLAAttentionBackend",
|
"MLAAttentionBackend",
|
||||||
"DSAAttentionBackend",
|
"DSAAttentionBackend",
|
||||||
"FlashAttentionBackend",
|
"FlashAttentionBackend",
|
||||||
"IluvatarAttnBackend",
|
|
||||||
"BlockAttentionBackend",
|
"BlockAttentionBackend",
|
||||||
"Attention",
|
"Attention",
|
||||||
"PlasAttentionBackend",
|
"PlasAttentionBackend",
|
||||||
|
|||||||
@@ -62,3 +62,10 @@ if current_platform.is_intel_hpu():
|
|||||||
if hasattr(intel_hpu, "__all__"):
|
if hasattr(intel_hpu, "__all__"):
|
||||||
globals().update({name: getattr(intel_hpu, name) for name in intel_hpu.__all__})
|
globals().update({name: getattr(intel_hpu, name) for name in intel_hpu.__all__})
|
||||||
__all__.extend(intel_hpu.__all__)
|
__all__.extend(intel_hpu.__all__)
|
||||||
|
|
||||||
|
if current_platform.is_iluvatar():
    from . import iluvatar

    # Mirror every name the iluvatar package lists in its ``__all__`` into this
    # module's namespace and into this module's own ``__all__``.
    if hasattr(iluvatar, "__all__"):
        globals().update(dict((name, getattr(iluvatar, name)) for name in iluvatar.__all__))
        __all__.extend(iluvatar.__all__)
|
||||||
|
|||||||
@@ -0,0 +1,30 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
iluvatar gpu backend methods
|
||||||
|
"""
|
||||||
|
from .attention.mha_attn_backend import MhaAttnBackend
|
||||||
|
from .moe.fuse_moe_cutlass_iluvatar_backend import (
|
||||||
|
IluvatarCutlassMoEMethod,
|
||||||
|
IluvatarCutlassWeightOnlyMoEMethod,
|
||||||
|
)
|
||||||
|
from .quantization.weight_only import IluvatarWeightOnlyLinearMethod
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"MhaAttnBackend",
|
||||||
|
"IluvatarCutlassMoEMethod",
|
||||||
|
"IluvatarCutlassWeightOnlyMoEMethod",
|
||||||
|
"IluvatarWeightOnlyLinearMethod",
|
||||||
|
]
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
iluvatar gpu backend attention methods
|
||||||
|
"""
|
||||||
+4
-4
@@ -39,9 +39,9 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class IluvatarAttentionMetadata(AttentionMetadata):
|
class MhaAttentionMetadata(AttentionMetadata):
|
||||||
"""
|
"""
|
||||||
IluvatarAttentionMetadata
|
MhaAttentionMetadata
|
||||||
"""
|
"""
|
||||||
|
|
||||||
alibi_slopes: Optional[paddle.Tensor] = None
|
alibi_slopes: Optional[paddle.Tensor] = None
|
||||||
@@ -60,7 +60,7 @@ class IluvatarAttentionMetadata(AttentionMetadata):
|
|||||||
decode_block_tables: paddle.Tensor = None
|
decode_block_tables: paddle.Tensor = None
|
||||||
|
|
||||||
|
|
||||||
class IluvatarAttnBackend(AttentionBackend):
|
class MhaAttnBackend(AttentionBackend):
|
||||||
"""
|
"""
|
||||||
The backend class that uses paddle native attention implementation.
|
The backend class that uses paddle native attention implementation.
|
||||||
Which is used only for testing purpose.
|
Which is used only for testing purpose.
|
||||||
@@ -76,7 +76,7 @@ class IluvatarAttnBackend(AttentionBackend):
|
|||||||
decoder_block_shape_q: int = -1,
|
decoder_block_shape_q: int = -1,
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.attention_metadata = IluvatarAttentionMetadata()
|
self.attention_metadata = MhaAttentionMetadata()
|
||||||
self.block_size = fd_config.cache_config.block_size
|
self.block_size = fd_config.cache_config.block_size
|
||||||
assert self.block_size == 16, "Iluvatar paged attn requires block_size must be 16."
|
assert self.block_size == 16, "Iluvatar paged attn requires block_size must be 16."
|
||||||
self.max_context_len = fd_config.model_config.max_model_len
|
self.max_context_len = fd_config.model_config.max_model_len
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""
|
||||||
|
iluvatar gpu backend moe methods
|
||||||
|
"""
|
||||||
+510
@@ -0,0 +1,510 @@
|
|||||||
|
"""
|
||||||
|
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle import nn
|
||||||
|
from paddle.nn.quant import weight_quantize
|
||||||
|
|
||||||
|
from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import (
|
||||||
|
UnquantizedFusedMoEMethod,
|
||||||
|
)
|
||||||
|
from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
|
||||||
|
from fastdeploy.model_executor.layers.utils import get_tensor
|
||||||
|
from fastdeploy.model_executor.ops.iluvatar import (
|
||||||
|
moe_expert_dispatch,
|
||||||
|
moe_expert_ffn,
|
||||||
|
moe_expert_reduce,
|
||||||
|
)
|
||||||
|
from fastdeploy.model_executor.utils import (
|
||||||
|
TensorTracker,
|
||||||
|
free_tensor,
|
||||||
|
process_weight_transpose,
|
||||||
|
set_weight_attrs,
|
||||||
|
weight_fully_copied,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class IluvatarCutlassMoEMethod(UnquantizedFusedMoEMethod):
    """
    Fused MoE method built on the Iluvatar ``moe_expert_dispatch`` /
    ``moe_expert_ffn`` / ``moe_expert_reduce`` ops.

    Only the tensor-parallel path (``apply_tp``) is implemented; both
    expert-parallel entry points raise ``NotImplementedError``.
    """

    def process_loaded_weights(self, layer: nn.Layer, state_dict):
        """
        Stack the per-expert FFN weights (and biases when ``layer.with_bias``)
        extracted from ``state_dict`` along axis 0 and store them on ``layer``.
        """
        up_gate_list, down_list, _, _ = layer.extract_moe_ffn_weights(state_dict)
        stacked_up_gate = paddle.stack(up_gate_list, axis=0)
        stacked_down = paddle.stack(down_list, axis=0)

        layer.up_gate_proj_weight.set_value(stacked_up_gate)
        layer.down_proj_weight.set_value(stacked_down)

        if not layer.with_bias:
            return

        up_gate_bias_list, down_bias_list = layer.extract_moe_ffn_bias(state_dict)
        stacked_up_gate_bias = paddle.stack(up_gate_bias_list, axis=0)
        stacked_down_bias = paddle.stack(down_bias_list, axis=0)

        layer.up_gate_proj_bias.set_value(stacked_up_gate_bias)
        layer.down_proj_bias.set_value(stacked_down_bias)

    def compute_ffn(
        self,
        layer: nn.Layer,
        permute_input: paddle.Tensor,
        token_nums_per_expert: paddle.Tensor,
        expert_idx_per_token: paddle.Tensor,
    ):
        """
        Run ``moe_expert_ffn`` on the permuted tokens and, when the layer has
        biases, add the down-projection bias selected per token.

        ``expert_idx_per_token`` is only read when ``layer.with_bias`` is set.
        """
        up_gate_weight = getattr(layer, self.added_weight_attrs[0])
        down_weight = getattr(layer, self.added_weight_attrs[1])
        ffn_out = moe_expert_ffn(
            permute_input,
            token_nums_per_expert,
            up_gate_weight,
            down_weight,
            # Optional attributes: pass None when the layer does not define them.
            getattr(layer, "up_gate_proj_bias", None),
            getattr(layer, "up_gate_proj_weight_scale", None),
            getattr(layer, "down_proj_weight_scale", None),
            self.moe_quant_type,
            layer.fd_config.model_config.moe_phase.phase,
        )

        if not layer.with_bias:
            return ffn_out

        # The ffn op is not handed the down-projection bias, so add it here.
        per_token_down_bias = paddle.index_select(layer.down_proj_bias, expert_idx_per_token, axis=0)
        return paddle.add(ffn_out, per_token_down_bias)

    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate: nn.Layer,
        topk_ids_hookfunc: Callable = None,
    ) -> paddle.Tensor:
        """
        Expert-parallel prefill is not implemented for this backend.
        """
        raise NotImplementedError

    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate: nn.Layer,
        topk_ids_hookfunc: Callable = None,
    ) -> paddle.Tensor:
        """
        Expert-parallel decode is not implemented for this backend.
        """
        raise NotImplementedError

    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate: nn.Layer,
        topk_ids_hookfunc: Callable = None,
    ) -> paddle.Tensor:
        """
        Tensor-parallel fused MoE: gate -> dispatch -> expert ffn -> reduce.

        When ``layer.topk_method == "noaux_tc"`` the gate scores are first passed
        through ``get_moe_scores`` and the dispatch op runs in top-k-only mode.
        ``topk_ids_hookfunc``, if given, is called with the selected expert ids.
        """
        gate_out = gate(x).cast("float32")
        uses_noaux_tc = layer.topk_method == "noaux_tc"

        if uses_noaux_tc:
            gate_out, _, _ = get_moe_scores(
                gate_out,
                layer.n_group,
                layer.topk_group,
                layer.top_k,
                layer.routed_scaling_factor,
                layer.gate_correction_bias,
                getattr(layer, "renormalize", True),
            )
            # Use layer.gate_correction_bias in get_moe_scores.
            dispatch_bias = None
        else:
            dispatch_bias = layer.gate_correction_bias

        (
            permute_input,
            token_nums_per_expert,
            permute_indices_per_token,
            topk_weights,
            topk_idx,
            expert_idx_per_token,
        ) = moe_expert_dispatch(
            x,
            gate_out,
            dispatch_bias,
            getattr(layer, "up_gate_proj_in_scale", None),  # if set, permute_input will be int8_t
            layer.top_k,
            False,
            self.moe_quant_type,
            topk_only_mode=uses_noaux_tc,
        )

        if topk_ids_hookfunc is not None:
            topk_ids_hookfunc(topk_ids=topk_idx)

        # Per the original note only w4a8 needs expert_idx_per_token; it is also
        # kept for layers with bias and for w4afp8, and set to None otherwise.
        if layer.with_bias or self.moe_quant_type in ("w4a8", "w4afp8"):
            expert_idx_per_token = expert_idx_per_token.cast("int64")
        else:
            expert_idx_per_token = None

        ffn_out = self.compute_ffn(
            layer,
            permute_input,
            token_nums_per_expert,
            expert_idx_per_token,
        )

        # The reduce op performs the normalization of the top-k weights and
        # applies routed_scaling_factor.
        return moe_expert_reduce(
            ffn_out,
            topk_weights,
            permute_indices_per_token,
            topk_idx,
            None,
            norm_topk_prob=not uses_noaux_tc,
            routed_scaling_factor=1.0,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class IluvatarCutlassWeightOnlyMoEMethod(IluvatarCutlassMoEMethod):
    """
    Weight-only quantized variant of the Iluvatar fused MoE method.
    """

    def __init__(self, quant_config):
        """
        Record the quantization config and take the MoE quant type from its ``algo``.
        """
        super().__init__(quant_config)
        self.quant_config = quant_config
        self.moe_quant_type = self.quant_config.algo
        self.pack_num = 1

    def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False):
        """
        Load already-quantized expert weights and their scales from ``state_dict``,
        stack them along axis 0 and store them on ``layer``.
        """
        key_map = layer.weight_key_map
        up_gate_key = key_map.get("up_gate_proj_expert_weight_key", None)
        down_key = key_map.get("down_proj_expert_weight_key", None)
        up_gate_scale_key = key_map.get("up_gate_proj_expert_weight_scale_key", None)
        down_scale_key = key_map.get("down_proj_expert_weight_scale_key", None)

        up_gate_list, down_list, logical_expert_ids, _ = layer.load_experts_weight(
            state_dict, up_gate_key, down_key, is_rearrange
        )

        # A list state_dict is converted to a dict so the scale entries can be popped by key.
        if isinstance(state_dict, list):
            state_dict = dict(state_dict)

        up_gate_scales, down_scales = [], []
        for expert_idx in logical_expert_ids:
            up_gate_scales.append(get_tensor(state_dict.pop(up_gate_scale_key.format(expert_idx))))
            down_scales.append(get_tensor(state_dict.pop(down_scale_key.format(expert_idx))))

        stacked_by_name = {
            "up_gate_proj_weight": paddle.stack(up_gate_list, axis=0),
            "down_proj_weight": paddle.stack(down_list, axis=0),
            "up_gate_proj_weight_scale": paddle.stack(up_gate_scales, axis=0),
            "down_proj_weight_scale": paddle.stack(down_scales, axis=0),
        }
        for param_name, stacked in stacked_by_name.items():
            getattr(layer, param_name).set_value(stacked)

    def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
        """
        Create the MoE weight parameters on ``layer``.

        For a bf16 checkpoint loaded with the ``default_v1`` loader, parameters are
        created with ``layer.weight_dtype`` and a ``TensorTracker``. Otherwise int8
        weight parameters and their scale parameters are created directly. Bias
        parameters are added when ``layer.with_bias`` is set.
        """
        self.default_dtype = layer._helper.get_default_dtype()

        num_experts = layer.num_local_experts
        inter_size = layer.moe_intermediate_size
        hidden_size = layer.hidden_size

        # Shapes of the quantized weights; the int4 case halves one dimension.
        is_int4 = self.moe_quant_type == "weight_only_int4"
        self.up_gate_proj_weight_shape = [
            num_experts,
            inter_size if is_int4 else inter_size * 2,
            hidden_size,
        ]
        self.down_proj_weight_shape = [
            num_experts,
            hidden_size // 2 if is_int4 else hidden_size,
            inter_size,
        ]
        self.up_gate_proj_scale_shape = [num_experts, inter_size * 2]
        self.down_proj_scale_shape = [num_experts, hidden_size]
        self.model_format = extra_weight_attrs.get("model_format")

        # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
            if self.model_format == "torch":
                up_gate_shape = [num_experts, inter_size * 2, hidden_size]
                down_shape = [num_experts, hidden_size, inter_size]
                up_gate_attrs = {
                    **extra_weight_attrs,
                    "tensor_track": TensorTracker(shape=up_gate_shape, output_dim=False),
                    "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0},
                }
                down_attrs = {
                    **extra_weight_attrs,
                    "tensor_track": TensorTracker(shape=down_shape, output_dim=True),
                    "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0},
                }
            else:
                up_gate_shape = [num_experts, hidden_size, inter_size * 2]
                down_shape = [num_experts, inter_size, hidden_size]
                up_gate_attrs = {
                    **extra_weight_attrs,
                    "tensor_track": TensorTracker(shape=up_gate_shape, output_dim=True),
                }
                down_attrs = {
                    **extra_weight_attrs,
                    "tensor_track": TensorTracker(shape=down_shape, output_dim=False),
                }

            layer.up_gate_proj_weight = layer.create_parameter(
                shape=up_gate_shape,
                dtype=layer.weight_dtype,
                default_initializer=paddle.nn.initializer.Constant(0),
            )
            layer.down_proj_weight = layer.create_parameter(
                shape=down_shape,
                dtype=layer.weight_dtype,
                default_initializer=paddle.nn.initializer.Constant(0),
            )
            set_weight_attrs(layer.up_gate_proj_weight, up_gate_attrs)
            set_weight_attrs(layer.down_proj_weight, down_attrs)
        else:
            self.weight_dtype = "int8"

            # (attribute name, shape, dtype) for the two weights, then the two scales.
            param_specs = (
                (self.added_weight_attrs[0], self.up_gate_proj_weight_shape, self.weight_dtype),
                (self.added_weight_attrs[1], self.down_proj_weight_shape, self.weight_dtype),
                (self.added_scale_attrs[0], self.up_gate_proj_scale_shape, self.default_dtype),
                (self.added_scale_attrs[1], self.down_proj_scale_shape, self.default_dtype),
            )
            for attr_name, param_shape, param_dtype in param_specs:
                setattr(
                    layer,
                    attr_name,
                    layer.create_parameter(
                        shape=param_shape,
                        dtype=param_dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ),
                )

            # The v1 loader currently does not support loading offline quantized weight-only weights.
            moe_extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
            for weight_param in (layer.up_gate_proj_weight, layer.down_proj_weight):
                set_weight_attrs(weight_param, moe_extra_weight_attrs)

            scale_extra_weight_attrs = {
                **extra_weight_attrs,
                "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "up": 0, "down": None},
            }
            for scale_param in (layer.up_gate_proj_weight_scale, layer.down_proj_weight_scale):
                set_weight_attrs(scale_param, scale_extra_weight_attrs)

        # NOTE(review): the bias section is placed at function level (applies to
        # both branches above); the scraped source lost its indentation — confirm.
        if layer.with_bias:
            layer.up_gate_proj_bias = layer.create_parameter(
                shape=[layer.num_experts, layer.moe_intermediate_size * 2],
                dtype=layer.weight_dtype,
                default_initializer=paddle.nn.initializer.Constant(0),
            )
            layer.down_proj_bias = layer.create_parameter(
                shape=[layer.num_experts, layer.hidden_size],
                dtype=layer.weight_dtype,
                default_initializer=paddle.nn.initializer.Constant(0),
            )
            for bias_param in (layer.up_gate_proj_bias, layer.down_proj_bias):
                set_weight_attrs(bias_param, extra_weight_attrs)

    def process_weights_after_loading(self, layer):
        """
        For bf16 checkpoints, quantize one loaded weight (up/gate when it is fully
        copied, otherwise down) expert by expert and replace it, together with its
        scale, by freshly created parameters. Does nothing for other checkpoints.
        """

        def _quantize_and_replace(weight_idx, weight_shape, scale_shape):
            # Parameter names; the unquantized tensor is read from the "weight"
            # spelling of the quantized name.
            weight_name = self.added_weight_attrs[weight_idx]
            unquantized_weight_name = weight_name.replace("quant_weight", "weight")
            scale_name = self.added_scale_attrs[weight_idx]
            weight_dtype = "int8"
            scale_dtype = self.default_dtype

            # Temporary buffers that receive the per-expert quantization results.
            quantized = paddle.empty(weight_shape, dtype=weight_dtype)
            scales = paddle.empty(scale_shape, dtype=scale_dtype)
            for expert_id in range(layer.num_local_experts):
                quantized[expert_id], scales[expert_id] = weight_quantize(
                    getattr(layer, unquantized_weight_name)[expert_id], algo=self.moe_quant_type
                )

            free_tensor(getattr(layer, unquantized_weight_name))

            # Re-create the weight and scale parameters, then copy the results in.
            for param_name, param_shape, param_dtype in (
                (weight_name, weight_shape, weight_dtype),
                (scale_name, scale_shape, scale_dtype),
            ):
                setattr(
                    layer,
                    param_name,
                    layer.create_parameter(
                        shape=param_shape,
                        dtype=param_dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ),
                )
            getattr(layer, weight_name).copy_(quantized, False)
            getattr(layer, scale_name).copy_(scales, False)

        if not self.quant_config.is_checkpoint_bf16:
            return

        if weight_fully_copied(layer.up_gate_proj_weight):
            weight_idx = 0
            weight_shape = self.up_gate_proj_weight_shape
            scale_shape = self.up_gate_proj_scale_shape
        else:
            weight_idx = 1
            weight_shape = self.down_proj_weight_shape
            scale_shape = self.down_proj_scale_shape

        if self.model_format == "torch":
            torch_weight_name = self.added_weight_attrs[weight_idx].replace("quant_weight", "weight")
            process_weight_transpose(layer, torch_weight_name)
        _quantize_and_replace(weight_idx, weight_shape, scale_shape)

    def process_loaded_weights(self, layer: nn.Layer, state_dict):
        """
        Quantize the unquantized expert weights from ``state_dict`` expert by expert
        and store the stacked weights and scales on ``layer``.
        """
        up_gate_list, down_list, _, _ = layer.extract_moe_ffn_weights(state_dict)
        self.check(layer, up_gate_list, down_list)

        for idx, expert_weights in enumerate([up_gate_list, down_list]):
            weight_name = self.added_weight_attrs[idx]
            scale_name = self.added_scale_attrs[idx]

            quantized_list = []
            scale_list = []
            for expert_id in range(layer.num_local_experts):
                quantized, scale = weight_quantize(expert_weights[expert_id], algo=self.moe_quant_type)
                quantized_list.append(quantized)
                scale_list.append(scale)

            getattr(layer, weight_name).set_value(paddle.stack(quantized_list, axis=0))
            getattr(layer, scale_name).set_value(paddle.stack(scale_list, axis=0))
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
iluvatar quantization methods
|
||||||
|
"""
|
||||||
@@ -0,0 +1,183 @@
|
|||||||
|
"""
|
||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import paddle
|
||||||
|
from paddle.nn.quant import weight_only_linear, weight_quantize
|
||||||
|
|
||||||
|
from fastdeploy.model_executor.layers.linear import (
|
||||||
|
MergedColumnParallelLinear,
|
||||||
|
MergedReplicatedLinear,
|
||||||
|
QKVGateParallelLinear,
|
||||||
|
QKVParallelLinear,
|
||||||
|
)
|
||||||
|
from fastdeploy.model_executor.layers.quantization.weight_only import (
|
||||||
|
WeightOnlyConfig,
|
||||||
|
WeightOnlyLinearMethod,
|
||||||
|
)
|
||||||
|
from fastdeploy.model_executor.layers.utils import get_tensor
|
||||||
|
from fastdeploy.model_executor.utils import (
|
||||||
|
TensorTracker,
|
||||||
|
free_tensor,
|
||||||
|
process_weight_transpose,
|
||||||
|
set_weight_attrs,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class IluvatarWeightOnlyLinearMethod(WeightOnlyLinearMethod):
|
||||||
|
"""
|
||||||
|
Weight only quantization method for linear layer
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, quant_config: WeightOnlyConfig) -> None:
    """
    Initialize the base method, then set the config's ``weight_only_linear_arch``
    and this method's ``group_size`` to -1.
    """
    super().__init__(quant_config)
    self.group_size = -1
    self.quant_config.weight_only_linear_arch = -1
|
||||||
|
|
||||||
|
def create_weights(self, layer, **extra_weight_attrs):
    """
    Create the linear layer's weight parameter and, on the quantized path, its scale.

    For a bf16 checkpoint loaded with the ``default_v1`` loader the weight is
    created with ``layer.weight_dtype``. Otherwise an int8 weight with the
    reversed ``layer.weight_shape`` and a per-channel scale are created.
    """
    # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
    self.model_format = extra_weight_attrs.get("model_format")

    if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
        if self.model_format == "torch":
            weight_shape = layer.weight_shape[::-1]
        else:
            weight_shape = layer.weight_shape
        layer.weight = layer.create_parameter(
            shape=weight_shape,
            dtype=layer.weight_dtype,
            is_bias=False,
            default_initializer=paddle.nn.initializer.Constant(0),
        )

        quant_attrs = extra_weight_attrs
        tracked_layer_types = (
            MergedColumnParallelLinear,
            QKVParallelLinear,
            MergedReplicatedLinear,
            QKVGateParallelLinear,
        )
        if isinstance(layer, tracked_layer_types):
            # Only MergedReplicatedLinear uses the default outdim.
            tensor_output_dim = (self.model_format == "torch") ^ quant_attrs.get("output_dim", True)
            quant_attrs = {
                **quant_attrs,
                "tensor_track": TensorTracker(shape=weight_shape, output_dim=tensor_output_dim),
            }

        # The torch format uses the reversed weight shape, so flip output_dim too.
        if self.model_format == "torch" and "output_dim" in quant_attrs:
            quant_attrs["output_dim"] = not quant_attrs["output_dim"]

        set_weight_attrs(layer.weight, quant_attrs)
        return

    # The scale shape should be equal to the output dim of weight using Per-Channel Quantization.
    weight_scale_shape = [layer.weight_shape[1]]
    # Reverses the layer's own shape list in place.
    layer.weight_shape.reverse()
    if self.quant_config.name() == "wint4":
        layer.weight_shape[0] //= 2
    layer.weight_dtype = "int8"

    layer.weight = layer.create_parameter(
        shape=layer.weight_shape,
        dtype=layer.weight_dtype,
        is_bias=False,
        default_initializer=paddle.nn.initializer.Constant(0),
    )

    # The flipped output_dim is applied to both the weight and the scale below.
    if "output_dim" in extra_weight_attrs:
        extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"]
    set_weight_attrs(layer.weight, extra_weight_attrs)

    layer.weight_scale = layer.create_parameter(
        shape=weight_scale_shape,
        dtype=layer._dtype,
        is_bias=False,
    )
    set_weight_attrs(layer.weight_scale, extra_weight_attrs)
|
||||||
|
|
||||||
|
def process_weights_after_loading(self, layer) -> None:
|
||||||
|
def _process_quantize():
|
||||||
|
quanted_weight_tensor, weight_scale_tensor = weight_quantize(
|
||||||
|
layer.weight,
|
||||||
|
algo=self.quant_config.algo,
|
||||||
|
arch=self.quant_config.weight_only_linear_arch,
|
||||||
|
)
|
||||||
|
|
||||||
|
free_tensor(layer.weight)
|
||||||
|
|
||||||
|
layer.weight = layer.create_parameter(
|
||||||
|
shape=quanted_weight_tensor.shape,
|
||||||
|
dtype="int8",
|
||||||
|
is_bias=False,
|
||||||
|
default_initializer=paddle.nn.initializer.Constant(0),
|
||||||
|
)
|
||||||
|
layer.weight_scale = layer.create_parameter(
|
||||||
|
shape=weight_scale_tensor.shape,
|
||||||
|
dtype=layer._dtype,
|
||||||
|
is_bias=False,
|
||||||
|
default_initializer=paddle.nn.initializer.Constant(0),
|
||||||
|
)
|
||||||
|
layer.weight.copy_(quanted_weight_tensor, False)
|
||||||
|
layer.weight_scale.copy_(weight_scale_tensor, False)
|
||||||
|
|
||||||
|
if self.quant_config.is_checkpoint_bf16:
|
||||||
|
if self.model_format == "torch":
|
||||||
|
process_weight_transpose(layer, "weight")
|
||||||
|
_process_quantize()
|
||||||
|
else:
|
||||||
|
return
|
||||||
|
|
||||||
|
def process_loaded_weights(self, layer, weight) -> None:
|
||||||
|
|
||||||
|
quanted_weight_tensor, weight_scale_tensor = weight_quantize(
|
||||||
|
weight,
|
||||||
|
algo=self.quant_config.algo,
|
||||||
|
arch=self.quant_config.weight_only_linear_arch,
|
||||||
|
)
|
||||||
|
layer.weight.set_value(quanted_weight_tensor)
|
||||||
|
layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))
|
||||||
|
|
||||||
|
def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = False) -> None:
|
||||||
|
"""
|
||||||
|
Process pre-quantized weights before applying them to the model
|
||||||
|
Args:
|
||||||
|
layer: The layer that owns the weights
|
||||||
|
quant_weight: The quantized weights
|
||||||
|
weight_scale: The scale of the quantized weights
|
||||||
|
"""
|
||||||
|
quant_weight = get_tensor(state_dict.pop(layer.weight_key))
|
||||||
|
weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key))
|
||||||
|
layer.weight.set_value(quant_weight)
|
||||||
|
layer.weight_scale.set_value(weight_scale.astype(paddle.get_default_dtype()))
|
||||||
|
|
||||||
|
def apply(self, layer, x):
|
||||||
|
linear_out = weight_only_linear(
|
||||||
|
x,
|
||||||
|
weight=layer.weight,
|
||||||
|
bias=layer.bias if layer.with_bias else None,
|
||||||
|
weight_scale=layer.weight_scale,
|
||||||
|
weight_dtype=("int8" if self.quant_config.name() == "wint8" else "int4"),
|
||||||
|
arch=self.quant_config.weight_only_linear_arch,
|
||||||
|
)
|
||||||
|
return linear_out
|
||||||
@@ -37,11 +37,6 @@ if current_platform.is_cuda():
|
|||||||
)
|
)
|
||||||
except:
|
except:
|
||||||
logger.warning("import w4afp8_gemm_scale_permute Failed!")
|
logger.warning("import w4afp8_gemm_scale_permute Failed!")
|
||||||
elif current_platform.is_iluvatar():
|
|
||||||
from fastdeploy.model_executor.ops.iluvatar import (
|
|
||||||
moe_expert_dispatch,
|
|
||||||
moe_expert_reduce,
|
|
||||||
)
|
|
||||||
|
|
||||||
from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
|
from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
|
||||||
from fastdeploy.model_executor.utils import (
|
from fastdeploy.model_executor.utils import (
|
||||||
@@ -91,40 +86,24 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
|
|||||||
"""
|
"""
|
||||||
Paddle Cutlass compute Fused MoE.
|
Paddle Cutlass compute Fused MoE.
|
||||||
"""
|
"""
|
||||||
if current_platform.is_iluvatar():
|
ffn_out_without_down_proj_bias = fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
|
||||||
ffn_out_without_down_proj_bias = fastdeploy.model_executor.ops.iluvatar.moe_expert_ffn(
|
permute_input,
|
||||||
permute_input,
|
token_nums_per_expert,
|
||||||
token_nums_per_expert,
|
getattr(layer, self.added_weight_attrs[0]),
|
||||||
getattr(layer, self.added_weight_attrs[0]),
|
getattr(layer, self.added_weight_attrs[1]),
|
||||||
getattr(layer, self.added_weight_attrs[1]),
|
dequant_scale,
|
||||||
(layer.up_gate_proj_bias if hasattr(layer, "up_gate_proj_bias") else None),
|
(layer.up_gate_proj_bias if hasattr(layer, "up_gate_proj_bias") else None),
|
||||||
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
|
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
|
||||||
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
|
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
|
||||||
(layer.down_proj_in_scale if hasattr(layer, "down_proj_in_scale") else None),
|
(layer.down_proj_in_scale if hasattr(layer, "down_proj_in_scale") else None),
|
||||||
expert_idx_per_token,
|
expert_idx_per_token,
|
||||||
self.moe_quant_type,
|
max_tokens_per_expert,
|
||||||
used_in_ep_low_latency,
|
self.moe_quant_type,
|
||||||
layer.fd_config.model_config.moe_phase.phase,
|
used_in_ep_low_latency,
|
||||||
)
|
estimate_total_token_nums,
|
||||||
else:
|
getattr(layer.moe_quant_config, "hadamard_block_size", 128),
|
||||||
ffn_out_without_down_proj_bias = fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
|
layer.activation,
|
||||||
permute_input,
|
)
|
||||||
token_nums_per_expert,
|
|
||||||
getattr(layer, self.added_weight_attrs[0]),
|
|
||||||
getattr(layer, self.added_weight_attrs[1]),
|
|
||||||
dequant_scale,
|
|
||||||
(layer.up_gate_proj_bias if hasattr(layer, "up_gate_proj_bias") else None),
|
|
||||||
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
|
|
||||||
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
|
|
||||||
(layer.down_proj_in_scale if hasattr(layer, "down_proj_in_scale") else None),
|
|
||||||
expert_idx_per_token,
|
|
||||||
max_tokens_per_expert,
|
|
||||||
self.moe_quant_type,
|
|
||||||
used_in_ep_low_latency,
|
|
||||||
estimate_total_token_nums,
|
|
||||||
getattr(layer.moe_quant_config, "hadamard_block_size", 128),
|
|
||||||
layer.activation,
|
|
||||||
)
|
|
||||||
|
|
||||||
if layer.with_bias:
|
if layer.with_bias:
|
||||||
down_proj_bias_expand = paddle.index_select(layer.down_proj_bias, expert_idx_per_token, axis=0)
|
down_proj_bias_expand = paddle.index_select(layer.down_proj_bias, expert_idx_per_token, axis=0)
|
||||||
@@ -307,91 +286,47 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
|
|||||||
layer.gate_correction_bias,
|
layer.gate_correction_bias,
|
||||||
getattr(layer, "renormalize", True),
|
getattr(layer, "renormalize", True),
|
||||||
)
|
)
|
||||||
if current_platform.is_iluvatar():
|
(
|
||||||
|
permute_input,
|
||||||
|
token_nums_per_expert,
|
||||||
|
permute_indices_per_token,
|
||||||
|
topk_weights,
|
||||||
|
topk_idx,
|
||||||
|
expert_idx_per_token,
|
||||||
|
dequant_scale,
|
||||||
|
max_tokens_per_expert,
|
||||||
|
) = moe_expert_dispatch(
|
||||||
|
x,
|
||||||
|
gate_out,
|
||||||
|
None, # Use layer.gate_correction_bias in get_moe_scores.
|
||||||
(
|
(
|
||||||
permute_input,
|
layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None
|
||||||
token_nums_per_expert,
|
), # if set, permute_input will be int8_t
|
||||||
permute_indices_per_token,
|
layer.top_k,
|
||||||
topk_weights,
|
False,
|
||||||
topk_idx,
|
self.moe_quant_type,
|
||||||
expert_idx_per_token,
|
topk_only_mode=True,
|
||||||
) = moe_expert_dispatch(
|
)
|
||||||
x,
|
|
||||||
gate_out,
|
|
||||||
None, # Use layer.gate_correction_bias in get_moe_scores.
|
|
||||||
(
|
|
||||||
layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None
|
|
||||||
), # if set, permute_input will be int8_t
|
|
||||||
layer.top_k,
|
|
||||||
False,
|
|
||||||
self.moe_quant_type,
|
|
||||||
topk_only_mode=True,
|
|
||||||
)
|
|
||||||
dequant_scale = None
|
|
||||||
max_tokens_per_expert = None
|
|
||||||
else:
|
|
||||||
(
|
|
||||||
permute_input,
|
|
||||||
token_nums_per_expert,
|
|
||||||
permute_indices_per_token,
|
|
||||||
topk_weights,
|
|
||||||
topk_idx,
|
|
||||||
expert_idx_per_token,
|
|
||||||
dequant_scale,
|
|
||||||
max_tokens_per_expert,
|
|
||||||
) = moe_expert_dispatch(
|
|
||||||
x,
|
|
||||||
gate_out,
|
|
||||||
None, # Use layer.gate_correction_bias in get_moe_scores.
|
|
||||||
(
|
|
||||||
layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None
|
|
||||||
), # if set, permute_input will be int8_t
|
|
||||||
layer.top_k,
|
|
||||||
False,
|
|
||||||
self.moe_quant_type,
|
|
||||||
topk_only_mode=True,
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
if current_platform.is_iluvatar():
|
(
|
||||||
(
|
permute_input,
|
||||||
permute_input,
|
token_nums_per_expert,
|
||||||
token_nums_per_expert,
|
permute_indices_per_token,
|
||||||
permute_indices_per_token,
|
topk_weights,
|
||||||
topk_weights,
|
topk_idx,
|
||||||
topk_idx,
|
expert_idx_per_token,
|
||||||
expert_idx_per_token,
|
dequant_scale,
|
||||||
) = moe_expert_dispatch(
|
max_tokens_per_expert,
|
||||||
x,
|
) = moe_expert_dispatch(
|
||||||
gate_out,
|
x,
|
||||||
layer.gate_correction_bias,
|
gate_out,
|
||||||
(layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
|
layer.gate_correction_bias,
|
||||||
layer.top_k,
|
(layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
|
||||||
False,
|
layer.top_k,
|
||||||
self.moe_quant_type,
|
False,
|
||||||
topk_only_mode=False,
|
self.moe_quant_type,
|
||||||
)
|
topk_only_mode=False,
|
||||||
dequant_scale = None
|
)
|
||||||
max_tokens_per_expert = None
|
|
||||||
else:
|
|
||||||
(
|
|
||||||
permute_input,
|
|
||||||
token_nums_per_expert,
|
|
||||||
permute_indices_per_token,
|
|
||||||
topk_weights,
|
|
||||||
topk_idx,
|
|
||||||
expert_idx_per_token,
|
|
||||||
dequant_scale,
|
|
||||||
max_tokens_per_expert,
|
|
||||||
) = moe_expert_dispatch(
|
|
||||||
x,
|
|
||||||
gate_out,
|
|
||||||
layer.gate_correction_bias,
|
|
||||||
(layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
|
|
||||||
layer.top_k,
|
|
||||||
False,
|
|
||||||
self.moe_quant_type,
|
|
||||||
topk_only_mode=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
if hasattr(layer, "up_gate_proj_in_scale"):
|
if hasattr(layer, "up_gate_proj_in_scale"):
|
||||||
dequant_scale = None
|
dequant_scale = None
|
||||||
|
|||||||
@@ -47,10 +47,14 @@ def get_moe_method(layer=None):
|
|||||||
return moe method based on device platform
|
return moe method based on device platform
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if current_platform.is_cuda() or current_platform.is_iluvatar():
|
if current_platform.is_cuda():
|
||||||
from .fused_moe_cutlass_backend import CutlassMoEMethod
|
from .fused_moe_cutlass_backend import CutlassMoEMethod
|
||||||
|
|
||||||
return CutlassMoEMethod(None)
|
return CutlassMoEMethod(None)
|
||||||
|
elif current_platform.is_iluvatar():
|
||||||
|
from fastdeploy.model_executor.layers.backends import IluvatarCutlassMoEMethod
|
||||||
|
|
||||||
|
return IluvatarCutlassMoEMethod(None)
|
||||||
elif current_platform.is_xpu():
|
elif current_platform.is_xpu():
|
||||||
from fastdeploy.model_executor.layers.backends import XPUMoEMethod
|
from fastdeploy.model_executor.layers.backends import XPUMoEMethod
|
||||||
|
|
||||||
|
|||||||
@@ -149,6 +149,22 @@ class WeightOnlyConfig(QuantConfigBase):
|
|||||||
else:
|
else:
|
||||||
|
|
||||||
return GPUWeightOnlyLinearMethod(self)
|
return GPUWeightOnlyLinearMethod(self)
|
||||||
|
elif current_platform.is_iluvatar():
|
||||||
|
if isinstance(layer, FusedMoE):
|
||||||
|
if layer.use_method == "cutlass":
|
||||||
|
from fastdeploy.model_executor.layers.backends import (
|
||||||
|
IluvatarCutlassWeightOnlyMoEMethod,
|
||||||
|
)
|
||||||
|
|
||||||
|
return IluvatarCutlassWeightOnlyMoEMethod(self)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unsupported MOE backend {layer.use_method}")
|
||||||
|
else:
|
||||||
|
from fastdeploy.model_executor.layers.backends import (
|
||||||
|
IluvatarWeightOnlyLinearMethod,
|
||||||
|
)
|
||||||
|
|
||||||
|
return IluvatarWeightOnlyLinearMethod(self)
|
||||||
else:
|
else:
|
||||||
if isinstance(layer, FusedMoE):
|
if isinstance(layer, FusedMoE):
|
||||||
if layer.use_method == "cutlass":
|
if layer.use_method == "cutlass":
|
||||||
|
|||||||
@@ -100,19 +100,13 @@ def iluvatar_moe_expert_ffn(
|
|||||||
up_gate_proj_bias: Optional[paddle.Tensor],
|
up_gate_proj_bias: Optional[paddle.Tensor],
|
||||||
up_gate_proj_scale: Optional[paddle.Tensor],
|
up_gate_proj_scale: Optional[paddle.Tensor],
|
||||||
down_proj_scale: Optional[paddle.Tensor],
|
down_proj_scale: Optional[paddle.Tensor],
|
||||||
down_proj_in_scale: Optional[paddle.Tensor],
|
|
||||||
expert_idx_per_token: Optional[paddle.Tensor],
|
|
||||||
quant_method: str,
|
quant_method: str,
|
||||||
used_in_ep_low_latency: bool,
|
|
||||||
moe_phase: str,
|
moe_phase: str,
|
||||||
):
|
):
|
||||||
assert up_gate_proj_bias is None
|
assert up_gate_proj_bias is None
|
||||||
assert up_gate_proj_scale is not None
|
assert up_gate_proj_scale is not None
|
||||||
assert down_proj_scale is not None
|
assert down_proj_scale is not None
|
||||||
assert down_proj_in_scale is None
|
|
||||||
assert expert_idx_per_token is None
|
|
||||||
assert quant_method in ("weight_only_int8")
|
assert quant_method in ("weight_only_int8")
|
||||||
assert not used_in_ep_low_latency
|
|
||||||
group_gemm_func, tokens_per_expert = _pre_process_expert_ffn(moe_phase, tokens_expert_prefix_sum)
|
group_gemm_func, tokens_per_expert = _pre_process_expert_ffn(moe_phase, tokens_expert_prefix_sum)
|
||||||
ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_per_expert, -1)
|
ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_per_expert, -1)
|
||||||
act_out = swiglu(ffn1_output)
|
act_out = swiglu(ffn1_output)
|
||||||
|
|||||||
@@ -11,7 +11,10 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
from .base import Platform
|
|
||||||
|
from fastdeploy.utils import console_logger as logger
|
||||||
|
|
||||||
|
from .base import Platform, _Backend
|
||||||
|
|
||||||
|
|
||||||
class IluvatarPlatform(Platform):
|
class IluvatarPlatform(Platform):
|
||||||
@@ -22,4 +25,11 @@ class IluvatarPlatform(Platform):
|
|||||||
"""
|
"""
|
||||||
get_attention_backend_cls
|
get_attention_backend_cls
|
||||||
"""
|
"""
|
||||||
return "fastdeploy.model_executor.layers.attention.IluvatarAttnBackend"
|
if selected_backend == _Backend.APPEND_ATTN:
|
||||||
|
logger.info("Using ixinfer MHA backend instead of append attention")
|
||||||
|
return "fastdeploy.model_executor.layers.backends.iluvatar.MhaAttnBackend"
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"Invalid attention backend you specified.\n"
|
||||||
|
"Now only support [NATIVE_ATTN, MLA_ATTN, APPEND_ATTN] in cuda place."
|
||||||
|
)
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ import paddle
|
|||||||
|
|
||||||
from fastdeploy import envs
|
from fastdeploy import envs
|
||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
from fastdeploy.model_executor.layers.attention import IluvatarAttnBackend
|
from fastdeploy.model_executor.layers.attention import get_attention_backend
|
||||||
from fastdeploy.worker.gpu_model_runner import GPUModelRunner
|
from fastdeploy.worker.gpu_model_runner import GPUModelRunner
|
||||||
|
|
||||||
|
|
||||||
@@ -90,7 +90,8 @@ class IluvatarModelRunner(GPUModelRunner):
|
|||||||
1,
|
1,
|
||||||
int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size,
|
int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size,
|
||||||
)
|
)
|
||||||
attn_backend = IluvatarAttnBackend(
|
attn_cls = get_attention_backend()
|
||||||
|
attn_backend = attn_cls(
|
||||||
self.fd_config,
|
self.fd_config,
|
||||||
kv_num_heads=self.model_config.kv_num_heads,
|
kv_num_heads=self.model_config.kv_num_heads,
|
||||||
num_heads=num_heads,
|
num_heads=num_heads,
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ omit =
|
|||||||
*/fastdeploy/model_executor/ops/gpu/fastdeploy_ops.py
|
*/fastdeploy/model_executor/ops/gpu/fastdeploy_ops.py
|
||||||
*/fastdeploy/model_executor/ops/gpu/fastdeploy_ops/__init__.py
|
*/fastdeploy/model_executor/ops/gpu/fastdeploy_ops/__init__.py
|
||||||
*/fastdeploy/model_executor/ops/gpu/deep_gemm/utils.py
|
*/fastdeploy/model_executor/ops/gpu/deep_gemm/utils.py
|
||||||
*/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py
|
|
||||||
*/fastdeploy/model_executor/xpu_pre_and_post_process.py
|
*/fastdeploy/model_executor/xpu_pre_and_post_process.py
|
||||||
*/fastdeploy/**/dcu/*
|
*/fastdeploy/**/dcu/*
|
||||||
*/fastdeploy/worker/dcu*.py
|
*/fastdeploy/worker/dcu*.py
|
||||||
|
|||||||
Reference in New Issue
Block a user