mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-22 16:07:51 +08:00
[Iluvatar] refactor attn and moe code (#6887)
This commit is contained in:
@@ -20,7 +20,6 @@ from .block_multihead_attn_backend import BlockAttentionBackend
|
||||
from .dsa_attention_backend import DSAAttentionBackend
|
||||
from .flash_attn_backend import FlashAttentionBackend
|
||||
from .flash_mask_attn_backend import FlashMaskAttentionBackend
|
||||
from .iluvatar_attn_backend import IluvatarAttnBackend
|
||||
from .mla_attention_backend import MLAAttentionBackend
|
||||
from .moba_attention_backend import PlasAttentionBackend
|
||||
from .native_paddle_backend import PaddleNativeAttnBackend
|
||||
@@ -33,7 +32,6 @@ __all__ = [
|
||||
"MLAAttentionBackend",
|
||||
"DSAAttentionBackend",
|
||||
"FlashAttentionBackend",
|
||||
"IluvatarAttnBackend",
|
||||
"BlockAttentionBackend",
|
||||
"Attention",
|
||||
"PlasAttentionBackend",
|
||||
|
||||
@@ -62,3 +62,10 @@ if current_platform.is_intel_hpu():
|
||||
if hasattr(intel_hpu, "__all__"):
|
||||
globals().update({name: getattr(intel_hpu, name) for name in intel_hpu.__all__})
|
||||
__all__.extend(intel_hpu.__all__)
|
||||
|
||||
if current_platform.is_iluvatar():
    from . import iluvatar

    # Re-export every name the iluvatar sub-package lists in its ``__all__``
    # from this package, and advertise those names in this package's
    # ``__all__`` as well. Nothing is re-exported when ``__all__`` is absent.
    if hasattr(iluvatar, "__all__"):
        globals().update(
            {exported: getattr(iluvatar, exported) for exported in iluvatar.__all__}
        )
        __all__.extend(iluvatar.__all__)
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
iluvatar gpu backend methods
|
||||
"""
|
||||
from .attention.mha_attn_backend import MhaAttnBackend
|
||||
from .moe.fuse_moe_cutlass_iluvatar_backend import (
|
||||
IluvatarCutlassMoEMethod,
|
||||
IluvatarCutlassWeightOnlyMoEMethod,
|
||||
)
|
||||
from .quantization.weight_only import IluvatarWeightOnlyLinearMethod
|
||||
|
||||
__all__ = [
|
||||
"MhaAttnBackend",
|
||||
"IluvatarCutlassMoEMethod",
|
||||
"IluvatarCutlassWeightOnlyMoEMethod",
|
||||
"IluvatarWeightOnlyLinearMethod",
|
||||
]
|
||||
@@ -0,0 +1,17 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
iluvatar gpu backend attention methods
|
||||
"""
|
||||
+4
-4
@@ -39,9 +39,9 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
@dataclass
|
||||
class IluvatarAttentionMetadata(AttentionMetadata):
|
||||
class MhaAttentionMetadata(AttentionMetadata):
|
||||
"""
|
||||
IluvatarAttentionMetadata
|
||||
MhaAttentionMetadata
|
||||
"""
|
||||
|
||||
alibi_slopes: Optional[paddle.Tensor] = None
|
||||
@@ -60,7 +60,7 @@ class IluvatarAttentionMetadata(AttentionMetadata):
|
||||
decode_block_tables: paddle.Tensor = None
|
||||
|
||||
|
||||
class IluvatarAttnBackend(AttentionBackend):
|
||||
class MhaAttnBackend(AttentionBackend):
|
||||
"""
|
||||
The backend class that uses paddle native attention implementation.
|
||||
Which is used only for testing purpose.
|
||||
@@ -76,7 +76,7 @@ class IluvatarAttnBackend(AttentionBackend):
|
||||
decoder_block_shape_q: int = -1,
|
||||
):
|
||||
super().__init__()
|
||||
self.attention_metadata = IluvatarAttentionMetadata()
|
||||
self.attention_metadata = MhaAttentionMetadata()
|
||||
self.block_size = fd_config.cache_config.block_size
|
||||
assert self.block_size == 16, "Iluvatar paged attn requires block_size must be 16."
|
||||
self.max_context_len = fd_config.model_config.max_model_len
|
||||
@@ -0,0 +1,17 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
iluvatar gpu backend moe methods
|
||||
"""
|
||||
+510
@@ -0,0 +1,510 @@
|
||||
"""
|
||||
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
from typing import Callable
|
||||
|
||||
import paddle
|
||||
from paddle import nn
|
||||
from paddle.nn.quant import weight_quantize
|
||||
|
||||
from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import (
|
||||
UnquantizedFusedMoEMethod,
|
||||
)
|
||||
from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
|
||||
from fastdeploy.model_executor.layers.utils import get_tensor
|
||||
from fastdeploy.model_executor.ops.iluvatar import (
|
||||
moe_expert_dispatch,
|
||||
moe_expert_ffn,
|
||||
moe_expert_reduce,
|
||||
)
|
||||
from fastdeploy.model_executor.utils import (
|
||||
TensorTracker,
|
||||
free_tensor,
|
||||
process_weight_transpose,
|
||||
set_weight_attrs,
|
||||
weight_fully_copied,
|
||||
)
|
||||
|
||||
|
||||
class IluvatarCutlassMoEMethod(UnquantizedFusedMoEMethod):
    """
    Use Cutlass Group Gemm to compute Fused MoE.
    This method is the oldest way to compute MoE in Paddle.

    The heavy lifting is delegated to the Iluvatar custom ops
    ``moe_expert_dispatch``, ``moe_expert_ffn`` and ``moe_expert_reduce``.
    """

    def process_loaded_weights(self, layer: nn.Layer, state_dict):
        """
        Stack the per-expert FFN weights found in ``state_dict`` and copy them
        into the layer's parameters.

        Args:
            layer: MoE layer that owns ``up_gate_proj_weight`` / ``down_proj_weight``
                (and the matching ``*_bias`` parameters when ``layer.with_bias``).
            state_dict: Checkpoint container handed to
                ``layer.extract_moe_ffn_weights`` / ``layer.extract_moe_ffn_bias``.
        """
        # The extractor also returns expert-id bookkeeping that is not needed here.
        up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
        stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0)
        stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0)

        layer.up_gate_proj_weight.set_value(stacked_up_gate_proj_weights)
        layer.down_proj_weight.set_value(stacked_down_proj_weights)

        if layer.with_bias:
            up_gate_proj_bias, down_proj_bias = layer.extract_moe_ffn_bias(state_dict)
            stacked_up_gate_proj_bias = paddle.stack(up_gate_proj_bias, axis=0)
            stacked_down_proj_bias = paddle.stack(down_proj_bias, axis=0)

            layer.up_gate_proj_bias.set_value(stacked_up_gate_proj_bias)
            layer.down_proj_bias.set_value(stacked_down_proj_bias)

    def compute_ffn(
        self,
        layer: nn.Layer,
        permute_input: paddle.Tensor,
        token_nums_per_expert: paddle.Tensor,
        expert_idx_per_token: paddle.Tensor,
    ):
        """
        Paddle Cutlass compute Fused MoE.

        Args:
            layer: MoE layer holding the expert weights (and optional bias / scales).
            permute_input: Dispatched input produced by ``moe_expert_dispatch``.
            token_nums_per_expert: Per-expert token counts from ``moe_expert_dispatch``.
            expert_idx_per_token: Index tensor used to select each row's
                ``down_proj_bias``; only read when ``layer.with_bias`` is set.

        Returns:
            The output of ``moe_expert_ffn`` with the down-projection bias added
            when the layer has bias.
        """
        ffn_out = moe_expert_ffn(
            permute_input,
            token_nums_per_expert,
            getattr(layer, self.added_weight_attrs[0]),
            getattr(layer, self.added_weight_attrs[1]),
            # Optional tensors: pass None when the layer does not define them.
            getattr(layer, "up_gate_proj_bias", None),
            getattr(layer, "up_gate_proj_weight_scale", None),
            getattr(layer, "down_proj_weight_scale", None),
            self.moe_quant_type,
            layer.fd_config.model_config.moe_phase.phase,
        )

        if layer.with_bias:
            # moe_expert_ffn is not given down_proj_bias, so add it here.
            down_proj_bias_expand = paddle.index_select(layer.down_proj_bias, expert_idx_per_token, axis=0)
            ffn_out = paddle.add(ffn_out, down_proj_bias_expand)
        return ffn_out

    def apply_ep_prefill(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate: nn.Layer,
        topk_ids_hookfunc: Callable = None,
    ) -> paddle.Tensor:
        """
        Apply the EP prefill method.

        Raises:
            NotImplementedError: always; this backend does not implement it.
        """
        raise NotImplementedError

    def apply_ep_decode(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate: nn.Layer,
        topk_ids_hookfunc: Callable = None,
    ) -> paddle.Tensor:
        """
        Apply the EP decoder method.

        Raises:
            NotImplementedError: always; this backend does not implement it.
        """
        raise NotImplementedError

    def apply_tp(
        self,
        layer: nn.Layer,
        x: paddle.Tensor,
        gate: nn.Layer,
        topk_ids_hookfunc: Callable = None,
    ) -> paddle.Tensor:
        """
        Paddle Cutlass compute Fused MoE.

        Runs gate -> dispatch -> expert FFN -> reduce.

        Args:
            layer: MoE layer providing routing settings and expert weights.
            x: Input hidden states fed to the gate and to the dispatch op.
            gate: Gate layer; its output is cast to float32 before routing.
            topk_ids_hookfunc: Optional callback invoked as
                ``topk_ids_hookfunc(topk_ids=topk_idx)`` after dispatch.

        Returns:
            The tensor returned by ``moe_expert_reduce``.
        """
        gate_out = gate(x).cast("float32")

        use_noaux_tc = layer.topk_method == "noaux_tc"
        if use_noaux_tc:
            # Scores are computed up front with the correction bias, so the
            # dispatch op gets no bias and runs in top-k-only mode. The top-k
            # weights/ids returned here are superseded by the dispatch outputs.
            gate_out, _, _ = get_moe_scores(
                gate_out,
                layer.n_group,
                layer.topk_group,
                layer.top_k,
                layer.routed_scaling_factor,
                layer.gate_correction_bias,
                getattr(layer, "renormalize", True),
            )
            dispatch_correction_bias = None
        else:
            dispatch_correction_bias = layer.gate_correction_bias

        (
            permute_input,
            token_nums_per_expert,
            permute_indices_per_token,
            topk_weights,
            topk_idx,
            expert_idx_per_token,
        ) = moe_expert_dispatch(
            x,
            gate_out,
            dispatch_correction_bias,
            getattr(layer, "up_gate_proj_in_scale", None),  # if set, permute_input will be int8_t
            layer.top_k,
            False,
            self.moe_quant_type,
            topk_only_mode=use_noaux_tc,
        )

        if topk_ids_hookfunc is not None:
            topk_ids_hookfunc(topk_ids=topk_idx)

        # expert_idx_per_token is kept only for layers with bias (compute_ffn
        # indexes down_proj_bias with it) and for the w4a8 / w4afp8 quant types;
        # every other configuration does not need it, so it is dropped.
        if layer.with_bias or self.moe_quant_type in ("w4a8", "w4afp8"):
            expert_idx_per_token = expert_idx_per_token.cast("int64")
        else:
            expert_idx_per_token = None

        ffn_out = self.compute_ffn(
            layer,
            permute_input,
            token_nums_per_expert,
            expert_idx_per_token,
        )

        # The reduce op normalizes the top-k weights and applies routed_scaling_factor.
        fused_moe_out = moe_expert_reduce(
            ffn_out,
            topk_weights,
            permute_indices_per_token,
            topk_idx,
            None,
            norm_topk_prob=not use_noaux_tc,
            routed_scaling_factor=1.0,
        )

        return fused_moe_out
|
||||
|
||||
|
||||
class IluvatarCutlassWeightOnlyMoEMethod(IluvatarCutlassMoEMethod):
    """
    weight only for moe

    Weights are stored as int8 tensors with a separate scale parameter per
    projection; quantization is done with ``paddle.nn.quant.weight_quantize``
    using ``quant_config.algo``.
    """

    def __init__(self, quant_config):
        """
        Args:
            quant_config: Quantization config; its ``algo`` becomes
                ``self.moe_quant_type``.
        """
        super().__init__(quant_config)
        self.quant_config = quant_config
        self.moe_quant_type = self.quant_config.algo
        self.pack_num = 1

    def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False):
        """
        Paddle cutlass process prequanted weights.

        Loads already-quantized expert weights and their scales from
        ``state_dict``, stacks them along a new leading expert axis and copies
        them into the layer's parameters.

        Args:
            layer: MoE layer providing ``weight_key_map`` and ``load_experts_weight``.
            state_dict: Checkpoint mapping (or list of key/value pairs). Scale
                entries are popped from it.
            is_rearrange: Forwarded to ``layer.load_experts_weight``.
        """
        key_map = layer.weight_key_map
        up_gate_proj_expert_weight_key = key_map.get("up_gate_proj_expert_weight_key", None)
        down_proj_expert_weight_key = key_map.get("down_proj_expert_weight_key", None)
        up_gate_proj_expert_weight_scale_key = key_map.get("up_gate_proj_expert_weight_scale_key", None)
        down_proj_expert_weight_scale_key = key_map.get("down_proj_expert_weight_scale_key", None)

        up_gate_proj_weights, down_proj_weights, logical_expert_ids, _ = layer.load_experts_weight(
            state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key, is_rearrange
        )

        # A list-form state dict is converted so the scale entries can be popped by key.
        if isinstance(state_dict, list):
            state_dict = dict(state_dict)

        up_gate_proj_weight_scale = []
        down_proj_weight_scale = []
        for expert_idx in logical_expert_ids:
            up_gate_proj_weight_scale.append(
                get_tensor(state_dict.pop(up_gate_proj_expert_weight_scale_key.format(expert_idx)))
            )
            down_proj_weight_scale.append(
                get_tensor(state_dict.pop(down_proj_expert_weight_scale_key.format(expert_idx)))
            )

        name_tensor_map = {
            "up_gate_proj_weight": paddle.stack(up_gate_proj_weights, axis=0),
            "down_proj_weight": paddle.stack(down_proj_weights, axis=0),
            "up_gate_proj_weight_scale": paddle.stack(up_gate_proj_weight_scale, axis=0),
            "down_proj_weight_scale": paddle.stack(down_proj_weight_scale, axis=0),
        }
        for name, tensor in name_tensor_map.items():
            getattr(layer, name).set_value(tensor)

    def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
        """
        Paddle cutlass create weight process.

        Records the quantized weight/scale shapes on ``self`` and creates the
        layer's parameters. When the checkpoint is bf16 and the ``default_v1``
        loader is selected, unquantized weights are created with a
        ``TensorTracker`` attached (they are quantized later in
        ``process_weights_after_loading``). Otherwise int8 weights and their
        scale parameters are created directly. Bias parameters are added when
        ``layer.with_bias`` is set.

        Args:
            layer: MoE layer on which the parameters are created.
            **extra_weight_attrs: Attributes attached to the created parameters;
                ``model_format`` is read from here.
        """
        self.default_dtype = layer._helper.get_default_dtype()

        # Shapes of the quantized weights; the int4 variant halves one dimension
        # of each projection.
        if self.moe_quant_type == "weight_only_int4":
            self.up_gate_proj_weight_shape = [
                layer.num_local_experts,
                layer.moe_intermediate_size,
                layer.hidden_size,
            ]
            self.down_proj_weight_shape = [
                layer.num_local_experts,
                layer.hidden_size // 2,
                layer.moe_intermediate_size,
            ]
        else:
            self.up_gate_proj_weight_shape = [
                layer.num_local_experts,
                layer.moe_intermediate_size * 2,
                layer.hidden_size,
            ]
            self.down_proj_weight_shape = [
                layer.num_local_experts,
                layer.hidden_size,
                layer.moe_intermediate_size,
            ]
        self.up_gate_proj_scale_shape = [layer.num_local_experts, layer.moe_intermediate_size * 2]
        self.down_proj_scale_shape = [layer.num_local_experts, layer.hidden_size]
        self.model_format = extra_weight_attrs.get("model_format")

        # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
            if self.model_format != "torch":
                up_gate_proj_weight_shape = [
                    layer.num_local_experts,
                    layer.hidden_size,
                    layer.moe_intermediate_size * 2,
                ]
                down_proj_weight_shape = [layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size]
                up_gate_proj_attrs = {
                    **extra_weight_attrs,
                    "tensor_track": TensorTracker(shape=up_gate_proj_weight_shape, output_dim=True),
                }
                down_proj_attrs = {
                    **extra_weight_attrs,
                    "tensor_track": TensorTracker(shape=down_proj_weight_shape, output_dim=False),
                }
            else:
                # torch-format checkpoints use the transposed layout.
                up_gate_proj_weight_shape = [
                    layer.num_local_experts,
                    layer.moe_intermediate_size * 2,
                    layer.hidden_size,
                ]
                down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
                up_gate_proj_attrs = {
                    **extra_weight_attrs,
                    "tensor_track": TensorTracker(shape=up_gate_proj_weight_shape, output_dim=False),
                    "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0},
                }
                down_proj_attrs = {
                    **extra_weight_attrs,
                    "tensor_track": TensorTracker(shape=down_proj_weight_shape, output_dim=True),
                    "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0},
                }

            layer.up_gate_proj_weight = layer.create_parameter(
                shape=up_gate_proj_weight_shape,
                dtype=layer.weight_dtype,
                default_initializer=paddle.nn.initializer.Constant(0),
            )
            layer.down_proj_weight = layer.create_parameter(
                shape=down_proj_weight_shape,
                dtype=layer.weight_dtype,
                default_initializer=paddle.nn.initializer.Constant(0),
            )
            set_weight_attrs(layer.up_gate_proj_weight, up_gate_proj_attrs)
            set_weight_attrs(layer.down_proj_weight, down_proj_attrs)
        else:
            self.weight_dtype = "int8"

            # Created in this order: both quantized weights, then both scales.
            param_specs = (
                (self.added_weight_attrs[0], self.up_gate_proj_weight_shape, self.weight_dtype),
                (self.added_weight_attrs[1], self.down_proj_weight_shape, self.weight_dtype),
                (self.added_scale_attrs[0], self.up_gate_proj_scale_shape, self.default_dtype),
                (self.added_scale_attrs[1], self.down_proj_scale_shape, self.default_dtype),
            )
            for param_name, param_shape, param_dtype in param_specs:
                setattr(
                    layer,
                    param_name,
                    layer.create_parameter(
                        shape=param_shape,
                        dtype=param_dtype,
                        default_initializer=paddle.nn.initializer.Constant(0),
                    ),
                )

            # The v1 loader currently does not support loading offline quantized weight-only weights.
            moe_extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
            set_weight_attrs(layer.up_gate_proj_weight, moe_extra_weight_attrs)
            set_weight_attrs(layer.down_proj_weight, moe_extra_weight_attrs)
            scale_extra_weight_attrs = {
                **extra_weight_attrs,
                "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "up": 0, "down": None},
            }
            set_weight_attrs(layer.up_gate_proj_weight_scale, scale_extra_weight_attrs)
            set_weight_attrs(layer.down_proj_weight_scale, scale_extra_weight_attrs)

        # NOTE(review): the flattened source does not show whether this bias block
        # sits at method level or inside the branch above; method level is assumed
        # here -- confirm against the original file.
        if layer.with_bias:
            layer.up_gate_proj_bias = layer.create_parameter(
                shape=[layer.num_experts, layer.moe_intermediate_size * 2],
                dtype=layer.weight_dtype,
                default_initializer=paddle.nn.initializer.Constant(0),
            )
            layer.down_proj_bias = layer.create_parameter(
                shape=[layer.num_experts, layer.hidden_size],
                dtype=layer.weight_dtype,
                default_initializer=paddle.nn.initializer.Constant(0),
            )
            set_weight_attrs(layer.up_gate_proj_bias, extra_weight_attrs)
            set_weight_attrs(layer.down_proj_bias, extra_weight_attrs)

    def process_weights_after_loading(self, layer):
        """
        Quantize a freshly loaded unquantized (bf16-checkpoint) weight in place.

        Does nothing unless ``quant_config.is_checkpoint_bf16`` is set. Each call
        handles one projection: ``gate_up`` when ``up_gate_proj_weight`` is
        reported as fully copied by ``weight_fully_copied``, otherwise ``down``.

        Args:
            layer: MoE layer whose weight is replaced by its int8 form plus scale.
        """
        if not self.quant_config.is_checkpoint_bf16:
            return

        def _process_quantize(weight_idx, weight_type):
            # 1. init shape and type
            is_gate_up = weight_type == "gate_up"
            # quantized_weight_name
            weight_name = self.added_weight_attrs[weight_idx]
            unquantized_weight_name = weight_name.replace("quant_weight", "weight")
            weight_shape = self.up_gate_proj_weight_shape if is_gate_up else self.down_proj_weight_shape
            weight_dtype = "int8"
            # scale
            scale_name = self.added_scale_attrs[weight_idx]
            scale_shape = self.up_gate_proj_scale_shape if is_gate_up else self.down_proj_scale_shape
            scale_dtype = self.default_dtype

            # 2. create tmp tensors
            weight = paddle.empty(weight_shape, dtype=weight_dtype)
            scale = paddle.empty(scale_shape, dtype=scale_dtype)

            # 3. quantize weight, one expert at a time
            unquantized_weight = getattr(layer, unquantized_weight_name)
            for expert_id in range(layer.num_local_experts):
                weight[expert_id], scale[expert_id] = weight_quantize(
                    unquantized_weight[expert_id], algo=self.moe_quant_type
                )

            free_tensor(getattr(layer, unquantized_weight_name))

            # 4. re-create the weight and scale parameters and fill them
            setattr(
                layer,
                weight_name,
                layer.create_parameter(
                    shape=weight_shape,
                    dtype=weight_dtype,
                    default_initializer=paddle.nn.initializer.Constant(0),
                ),
            )
            setattr(
                layer,
                scale_name,
                layer.create_parameter(
                    shape=scale_shape,
                    dtype=scale_dtype,
                    default_initializer=paddle.nn.initializer.Constant(0),
                ),
            )
            getattr(layer, weight_name).copy_(weight, False)
            getattr(layer, scale_name).copy_(scale, False)

        weight_id_map = {"gate_up": 0, "down": 1}
        weight_type = "gate_up" if weight_fully_copied(layer.up_gate_proj_weight) else "down"
        weight_idx = weight_id_map[weight_type]

        if self.model_format == "torch":
            # torch-format weights are transposed before being quantized.
            unquantized_weight_name = self.added_weight_attrs[weight_idx].replace("quant_weight", "weight")
            process_weight_transpose(layer, unquantized_weight_name)
        _process_quantize(weight_idx, weight_type)

    def process_loaded_weights(self, layer: nn.Layer, state_dict):
        """
        Paddle cutlass load weight process.

        Extracts the unquantized expert weights from ``state_dict``, quantizes
        each expert with ``weight_quantize`` and stores the stacked quantized
        weights and scales into the layer's parameters.

        Args:
            layer: MoE layer owning the quantized weight and scale parameters.
            state_dict: Checkpoint container handed to ``layer.extract_moe_ffn_weights``.
        """
        up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
        self.check(layer, up_gate_proj_weights, down_proj_weights)

        for idx, expert_weights in enumerate([up_gate_proj_weights, down_proj_weights]):
            weight_name = self.added_weight_attrs[idx]
            scale_name = self.added_scale_attrs[idx]

            weight_list = []
            weight_scale_list = []
            for expert_id in range(layer.num_local_experts):
                quant_weight, scale = weight_quantize(expert_weights[expert_id], algo=self.moe_quant_type)
                weight_list.append(quant_weight)
                weight_scale_list.append(scale)

            getattr(layer, weight_name).set_value(paddle.stack(weight_list, axis=0))
            getattr(layer, scale_name).set_value(paddle.stack(weight_scale_list, axis=0))
|
||||
@@ -0,0 +1,16 @@
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
iluvatar quantization methods
|
||||
"""
|
||||
@@ -0,0 +1,183 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import paddle
|
||||
from paddle.nn.quant import weight_only_linear, weight_quantize
|
||||
|
||||
from fastdeploy.model_executor.layers.linear import (
|
||||
MergedColumnParallelLinear,
|
||||
MergedReplicatedLinear,
|
||||
QKVGateParallelLinear,
|
||||
QKVParallelLinear,
|
||||
)
|
||||
from fastdeploy.model_executor.layers.quantization.weight_only import (
|
||||
WeightOnlyConfig,
|
||||
WeightOnlyLinearMethod,
|
||||
)
|
||||
from fastdeploy.model_executor.layers.utils import get_tensor
|
||||
from fastdeploy.model_executor.utils import (
|
||||
TensorTracker,
|
||||
free_tensor,
|
||||
process_weight_transpose,
|
||||
set_weight_attrs,
|
||||
)
|
||||
|
||||
|
||||
class IluvatarWeightOnlyLinearMethod(WeightOnlyLinearMethod):
    """
    Weight only quantization method for linear layer
    """

    def __init__(
        self,
        quant_config: WeightOnlyConfig,
    ) -> None:
        """
        Args:
            quant_config: Weight-only quantization config. Its
                ``weight_only_linear_arch`` is overwritten with ``-1`` here.
        """
        super().__init__(quant_config)
        self.quant_config.weight_only_linear_arch = -1
        self.group_size = -1

    def create_weights(self, layer, **extra_weight_attrs):
        """
        Create the weight (and, for the quantized path, scale) parameters on ``layer``.

        When the checkpoint is bf16 and the ``default_v1`` loader is selected, an
        unquantized weight is created (quantized later in
        ``process_weights_after_loading``). Otherwise an int8 weight with a
        reversed shape and a per-channel ``weight_scale`` are created directly.

        Args:
            layer: Linear layer on which the parameters are created. On the
                quantized path ``layer.weight_shape`` and ``layer.weight_dtype``
                are modified in place.
            **extra_weight_attrs: Attributes attached to the created parameters;
                ``model_format`` and ``output_dim`` are read from here.
        """
        # TODO(bukejiyu): remove v1 loader check when v0 loader is removed
        self.model_format = extra_weight_attrs.get("model_format")
        if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1":
            weight_shape = layer.weight_shape[::-1] if self.model_format == "torch" else layer.weight_shape
            layer.weight = layer.create_parameter(
                shape=weight_shape,
                dtype=layer.weight_dtype,
                is_bias=False,
                default_initializer=paddle.nn.initializer.Constant(0),
            )

            quant_attrs = extra_weight_attrs
            merged_layer_types = (
                MergedColumnParallelLinear,
                QKVParallelLinear,
                MergedReplicatedLinear,
                QKVGateParallelLinear,
            )
            if isinstance(layer, merged_layer_types):
                # Only MergedReplicatedLinear uses the default outdim.
                tensor_output_dim = (self.model_format == "torch") ^ quant_attrs.get("output_dim", True)
                quant_attrs = {
                    **quant_attrs,
                    "tensor_track": TensorTracker(shape=weight_shape, output_dim=tensor_output_dim),
                }

            # torch-format weights use the reversed shape, so output_dim flips too.
            if self.model_format == "torch" and "output_dim" in quant_attrs:
                quant_attrs["output_dim"] = not quant_attrs["output_dim"]

            set_weight_attrs(layer.weight, quant_attrs)
        else:
            # The scale shape should be equal to the output dim of weight using Per-Channel Quantization.
            weight_scale_shape = [layer.weight_shape[1]]
            layer.weight_shape.reverse()
            if self.quant_config.name() == "wint4":
                layer.weight_shape[0] //= 2
            layer.weight_dtype = "int8"

            layer.weight = layer.create_parameter(
                shape=layer.weight_shape,
                dtype=layer.weight_dtype,
                is_bias=False,
                default_initializer=paddle.nn.initializer.Constant(0),
            )

            # The weight shape was reversed above, so output_dim is flipped as well.
            if "output_dim" in extra_weight_attrs:
                extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"]
            set_weight_attrs(layer.weight, extra_weight_attrs)

            layer.weight_scale = layer.create_parameter(
                shape=weight_scale_shape,
                dtype=layer._dtype,
                is_bias=False,
            )
            set_weight_attrs(layer.weight_scale, extra_weight_attrs)

    def process_weights_after_loading(self, layer) -> None:
        """
        Quantize a loaded unquantized (bf16-checkpoint) weight and replace
        ``layer.weight`` / ``layer.weight_scale`` with the quantized results.

        Does nothing unless ``quant_config.is_checkpoint_bf16`` is set.

        Args:
            layer: Linear layer whose ``weight`` is quantized.
        """
        if not self.quant_config.is_checkpoint_bf16:
            return

        if self.model_format == "torch":
            # torch-format weights are transposed before being quantized.
            process_weight_transpose(layer, "weight")

        quanted_weight_tensor, weight_scale_tensor = weight_quantize(
            layer.weight,
            algo=self.quant_config.algo,
            arch=self.quant_config.weight_only_linear_arch,
        )

        # Release the unquantized weight before allocating the quantized parameters.
        free_tensor(layer.weight)

        layer.weight = layer.create_parameter(
            shape=quanted_weight_tensor.shape,
            dtype="int8",
            is_bias=False,
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.weight_scale = layer.create_parameter(
            shape=weight_scale_tensor.shape,
            dtype=layer._dtype,
            is_bias=False,
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.weight.copy_(quanted_weight_tensor, False)
        layer.weight_scale.copy_(weight_scale_tensor, False)

    def process_loaded_weights(self, layer, weight) -> None:
        """
        Quantize ``weight`` and store the result in the layer's parameters.

        Args:
            layer: Linear layer owning ``weight`` and ``weight_scale``.
            weight: Unquantized weight tensor passed to ``weight_quantize``.
        """
        quanted_weight_tensor, weight_scale_tensor = weight_quantize(
            weight,
            algo=self.quant_config.algo,
            arch=self.quant_config.weight_only_linear_arch,
        )
        layer.weight.set_value(quanted_weight_tensor)
        layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

    def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = False) -> None:
        """
        Process pre-quantized weights before applying them to the model

        The quantized weight and its scale are popped from ``state_dict`` under
        ``layer.weight_key`` and ``layer.weight_scale_key``; the scale is cast to
        the Paddle default dtype before being stored.

        Args:
            layer: The layer that owns the weights
            state_dict: Checkpoint mapping holding the quantized weight and its
                scale; both entries are removed from it.
            is_rearrange: Not used by this implementation.
        """
        quant_weight = get_tensor(state_dict.pop(layer.weight_key))
        weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key))
        layer.weight.set_value(quant_weight)
        layer.weight_scale.set_value(weight_scale.astype(paddle.get_default_dtype()))

    def apply(self, layer, x):
        """
        Run the weight-only quantized linear op on ``x``.

        Args:
            layer: Linear layer providing ``weight``, ``weight_scale`` and,
                when ``layer.with_bias`` is set, ``bias``.
            x: Input tensor.

        Returns:
            The output of ``paddle.nn.quant.weight_only_linear``.
        """
        # Any config name other than "wint8" is treated as int4.
        weight_dtype = "int8" if self.quant_config.name() == "wint8" else "int4"
        linear_out = weight_only_linear(
            x,
            weight=layer.weight,
            bias=layer.bias if layer.with_bias else None,
            weight_scale=layer.weight_scale,
            weight_dtype=weight_dtype,
            arch=self.quant_config.weight_only_linear_arch,
        )
        return linear_out
|
||||
@@ -37,11 +37,6 @@ if current_platform.is_cuda():
|
||||
)
|
||||
except:
|
||||
logger.warning("import w4afp8_gemm_scale_permute Failed!")
|
||||
elif current_platform.is_iluvatar():
|
||||
from fastdeploy.model_executor.ops.iluvatar import (
|
||||
moe_expert_dispatch,
|
||||
moe_expert_reduce,
|
||||
)
|
||||
|
||||
from fastdeploy.model_executor.layers.moe.moe import get_moe_scores
|
||||
from fastdeploy.model_executor.utils import (
|
||||
@@ -91,40 +86,24 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
|
||||
"""
|
||||
Paddle Cutlass compute Fused MoE.
|
||||
"""
|
||||
if current_platform.is_iluvatar():
|
||||
ffn_out_without_down_proj_bias = fastdeploy.model_executor.ops.iluvatar.moe_expert_ffn(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
getattr(layer, self.added_weight_attrs[0]),
|
||||
getattr(layer, self.added_weight_attrs[1]),
|
||||
(layer.up_gate_proj_bias if hasattr(layer, "up_gate_proj_bias") else None),
|
||||
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
|
||||
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
|
||||
(layer.down_proj_in_scale if hasattr(layer, "down_proj_in_scale") else None),
|
||||
expert_idx_per_token,
|
||||
self.moe_quant_type,
|
||||
used_in_ep_low_latency,
|
||||
layer.fd_config.model_config.moe_phase.phase,
|
||||
)
|
||||
else:
|
||||
ffn_out_without_down_proj_bias = fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
getattr(layer, self.added_weight_attrs[0]),
|
||||
getattr(layer, self.added_weight_attrs[1]),
|
||||
dequant_scale,
|
||||
(layer.up_gate_proj_bias if hasattr(layer, "up_gate_proj_bias") else None),
|
||||
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
|
||||
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
|
||||
(layer.down_proj_in_scale if hasattr(layer, "down_proj_in_scale") else None),
|
||||
expert_idx_per_token,
|
||||
max_tokens_per_expert,
|
||||
self.moe_quant_type,
|
||||
used_in_ep_low_latency,
|
||||
estimate_total_token_nums,
|
||||
getattr(layer.moe_quant_config, "hadamard_block_size", 128),
|
||||
layer.activation,
|
||||
)
|
||||
ffn_out_without_down_proj_bias = fastdeploy.model_executor.ops.gpu.moe_expert_ffn(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
getattr(layer, self.added_weight_attrs[0]),
|
||||
getattr(layer, self.added_weight_attrs[1]),
|
||||
dequant_scale,
|
||||
(layer.up_gate_proj_bias if hasattr(layer, "up_gate_proj_bias") else None),
|
||||
(layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
|
||||
(layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
|
||||
(layer.down_proj_in_scale if hasattr(layer, "down_proj_in_scale") else None),
|
||||
expert_idx_per_token,
|
||||
max_tokens_per_expert,
|
||||
self.moe_quant_type,
|
||||
used_in_ep_low_latency,
|
||||
estimate_total_token_nums,
|
||||
getattr(layer.moe_quant_config, "hadamard_block_size", 128),
|
||||
layer.activation,
|
||||
)
|
||||
|
||||
if layer.with_bias:
|
||||
down_proj_bias_expand = paddle.index_select(layer.down_proj_bias, expert_idx_per_token, axis=0)
|
||||
@@ -307,91 +286,47 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod):
|
||||
layer.gate_correction_bias,
|
||||
getattr(layer, "renormalize", True),
|
||||
)
|
||||
if current_platform.is_iluvatar():
|
||||
(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
permute_indices_per_token,
|
||||
topk_weights,
|
||||
topk_idx,
|
||||
expert_idx_per_token,
|
||||
dequant_scale,
|
||||
max_tokens_per_expert,
|
||||
) = moe_expert_dispatch(
|
||||
x,
|
||||
gate_out,
|
||||
None, # Use layer.gate_correction_bias in get_moe_scores.
|
||||
(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
permute_indices_per_token,
|
||||
topk_weights,
|
||||
topk_idx,
|
||||
expert_idx_per_token,
|
||||
) = moe_expert_dispatch(
|
||||
x,
|
||||
gate_out,
|
||||
None, # Use layer.gate_correction_bias in get_moe_scores.
|
||||
(
|
||||
layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None
|
||||
), # if set, permute_input will be int8_t
|
||||
layer.top_k,
|
||||
False,
|
||||
self.moe_quant_type,
|
||||
topk_only_mode=True,
|
||||
)
|
||||
dequant_scale = None
|
||||
max_tokens_per_expert = None
|
||||
else:
|
||||
(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
permute_indices_per_token,
|
||||
topk_weights,
|
||||
topk_idx,
|
||||
expert_idx_per_token,
|
||||
dequant_scale,
|
||||
max_tokens_per_expert,
|
||||
) = moe_expert_dispatch(
|
||||
x,
|
||||
gate_out,
|
||||
None, # Use layer.gate_correction_bias in get_moe_scores.
|
||||
(
|
||||
layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None
|
||||
), # if set, permute_input will be int8_t
|
||||
layer.top_k,
|
||||
False,
|
||||
self.moe_quant_type,
|
||||
topk_only_mode=True,
|
||||
)
|
||||
layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None
|
||||
), # if set, permute_input will be int8_t
|
||||
layer.top_k,
|
||||
False,
|
||||
self.moe_quant_type,
|
||||
topk_only_mode=True,
|
||||
)
|
||||
else:
|
||||
if current_platform.is_iluvatar():
|
||||
(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
permute_indices_per_token,
|
||||
topk_weights,
|
||||
topk_idx,
|
||||
expert_idx_per_token,
|
||||
) = moe_expert_dispatch(
|
||||
x,
|
||||
gate_out,
|
||||
layer.gate_correction_bias,
|
||||
(layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
|
||||
layer.top_k,
|
||||
False,
|
||||
self.moe_quant_type,
|
||||
topk_only_mode=False,
|
||||
)
|
||||
dequant_scale = None
|
||||
max_tokens_per_expert = None
|
||||
else:
|
||||
(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
permute_indices_per_token,
|
||||
topk_weights,
|
||||
topk_idx,
|
||||
expert_idx_per_token,
|
||||
dequant_scale,
|
||||
max_tokens_per_expert,
|
||||
) = moe_expert_dispatch(
|
||||
x,
|
||||
gate_out,
|
||||
layer.gate_correction_bias,
|
||||
(layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
|
||||
layer.top_k,
|
||||
False,
|
||||
self.moe_quant_type,
|
||||
topk_only_mode=False,
|
||||
)
|
||||
(
|
||||
permute_input,
|
||||
token_nums_per_expert,
|
||||
permute_indices_per_token,
|
||||
topk_weights,
|
||||
topk_idx,
|
||||
expert_idx_per_token,
|
||||
dequant_scale,
|
||||
max_tokens_per_expert,
|
||||
) = moe_expert_dispatch(
|
||||
x,
|
||||
gate_out,
|
||||
layer.gate_correction_bias,
|
||||
(layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None),
|
||||
layer.top_k,
|
||||
False,
|
||||
self.moe_quant_type,
|
||||
topk_only_mode=False,
|
||||
)
|
||||
|
||||
if hasattr(layer, "up_gate_proj_in_scale"):
|
||||
dequant_scale = None
|
||||
|
||||
@@ -47,10 +47,14 @@ def get_moe_method(layer=None):
|
||||
return moe method based on device platform
|
||||
"""
|
||||
|
||||
if current_platform.is_cuda() or current_platform.is_iluvatar():
|
||||
if current_platform.is_cuda():
|
||||
from .fused_moe_cutlass_backend import CutlassMoEMethod
|
||||
|
||||
return CutlassMoEMethod(None)
|
||||
elif current_platform.is_iluvatar():
|
||||
from fastdeploy.model_executor.layers.backends import IluvatarCutlassMoEMethod
|
||||
|
||||
return IluvatarCutlassMoEMethod(None)
|
||||
elif current_platform.is_xpu():
|
||||
from fastdeploy.model_executor.layers.backends import XPUMoEMethod
|
||||
|
||||
|
||||
@@ -149,6 +149,22 @@ class WeightOnlyConfig(QuantConfigBase):
|
||||
else:
|
||||
|
||||
return GPUWeightOnlyLinearMethod(self)
|
||||
elif current_platform.is_iluvatar():
|
||||
if isinstance(layer, FusedMoE):
|
||||
if layer.use_method == "cutlass":
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
IluvatarCutlassWeightOnlyMoEMethod,
|
||||
)
|
||||
|
||||
return IluvatarCutlassWeightOnlyMoEMethod(self)
|
||||
else:
|
||||
raise ValueError(f"Unsupported MOE backend {layer.use_method}")
|
||||
else:
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
IluvatarWeightOnlyLinearMethod,
|
||||
)
|
||||
|
||||
return IluvatarWeightOnlyLinearMethod(self)
|
||||
else:
|
||||
if isinstance(layer, FusedMoE):
|
||||
if layer.use_method == "cutlass":
|
||||
|
||||
@@ -100,19 +100,13 @@ def iluvatar_moe_expert_ffn(
|
||||
up_gate_proj_bias: Optional[paddle.Tensor],
|
||||
up_gate_proj_scale: Optional[paddle.Tensor],
|
||||
down_proj_scale: Optional[paddle.Tensor],
|
||||
down_proj_in_scale: Optional[paddle.Tensor],
|
||||
expert_idx_per_token: Optional[paddle.Tensor],
|
||||
quant_method: str,
|
||||
used_in_ep_low_latency: bool,
|
||||
moe_phase: str,
|
||||
):
|
||||
assert up_gate_proj_bias is None
|
||||
assert up_gate_proj_scale is not None
|
||||
assert down_proj_scale is not None
|
||||
assert down_proj_in_scale is None
|
||||
assert expert_idx_per_token is None
|
||||
assert quant_method in ("weight_only_int8")
|
||||
assert not used_in_ep_low_latency
|
||||
group_gemm_func, tokens_per_expert = _pre_process_expert_ffn(moe_phase, tokens_expert_prefix_sum)
|
||||
ffn1_output = group_gemm_func(permute_input, up_gate_proj_weight, up_gate_proj_scale, tokens_per_expert, -1)
|
||||
act_out = swiglu(ffn1_output)
|
||||
|
||||
@@ -11,7 +11,10 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from .base import Platform
|
||||
|
||||
from fastdeploy.utils import console_logger as logger
|
||||
|
||||
from .base import Platform, _Backend
|
||||
|
||||
|
||||
class IluvatarPlatform(Platform):
|
||||
@@ -22,4 +25,11 @@ class IluvatarPlatform(Platform):
|
||||
"""
|
||||
get_attention_backend_cls
|
||||
"""
|
||||
return "fastdeploy.model_executor.layers.attention.IluvatarAttnBackend"
|
||||
if selected_backend == _Backend.APPEND_ATTN:
|
||||
logger.info("Using ixinfer MHA backend instead of append attention")
|
||||
return "fastdeploy.model_executor.layers.backends.iluvatar.MhaAttnBackend"
|
||||
else:
|
||||
raise ValueError(
|
||||
"Invalid attention backend you specified.\n"
|
||||
"Now only support [NATIVE_ATTN, MLA_ATTN, APPEND_ATTN] in cuda place."
|
||||
)
|
||||
|
||||
@@ -20,7 +20,7 @@ import paddle
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.config import FDConfig
|
||||
from fastdeploy.model_executor.layers.attention import IluvatarAttnBackend
|
||||
from fastdeploy.model_executor.layers.attention import get_attention_backend
|
||||
from fastdeploy.worker.gpu_model_runner import GPUModelRunner
|
||||
|
||||
|
||||
@@ -90,7 +90,8 @@ class IluvatarModelRunner(GPUModelRunner):
|
||||
1,
|
||||
int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size,
|
||||
)
|
||||
attn_backend = IluvatarAttnBackend(
|
||||
attn_cls = get_attention_backend()
|
||||
attn_backend = attn_cls(
|
||||
self.fd_config,
|
||||
kv_num_heads=self.model_config.kv_num_heads,
|
||||
num_heads=num_heads,
|
||||
|
||||
@@ -33,7 +33,6 @@ omit =
|
||||
*/fastdeploy/model_executor/ops/gpu/fastdeploy_ops.py
|
||||
*/fastdeploy/model_executor/ops/gpu/fastdeploy_ops/__init__.py
|
||||
*/fastdeploy/model_executor/ops/gpu/deep_gemm/utils.py
|
||||
*/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py
|
||||
*/fastdeploy/model_executor/xpu_pre_and_post_process.py
|
||||
*/fastdeploy/**/dcu/*
|
||||
*/fastdeploy/worker/dcu*.py
|
||||
|
||||
Reference in New Issue
Block a user