"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from typing import Optional

import paddle
from paddle.nn.functional import swiglu

try:
    from fastdeploy.model_executor.ops.iluvatar import (
        restore_tokens_per_expert,
        w8a16_group_gemm,
        w8a16_group_gemv,
        wi4a16_group_gemm,
    )
except ImportError:
    # Fall back to None so this module stays importable on machines
    # without the Iluvatar custom ops; callers must check availability.
    w8a16_group_gemm = None
    w8a16_group_gemv = None
    wi4a16_group_gemm = None
    restore_tokens_per_expert = None


def _pre_process_expert_ffn(
    moe_phase: str,
    quant_method: str,
    tokens_expert_prefix_sum: paddle.Tensor,
):
    """Select the grouped-GEMM kernel and the per-expert token layout it expects."""
    if quant_method == "weight_only_int8":
        if moe_phase == "decode":
            # Decode uses the GEMV kernel, which takes per-expert token
            # counts rather than the prefix sum over experts.
            group_gemm_func = w8a16_group_gemv
            tokens_per_expert = restore_tokens_per_expert(tokens_expert_prefix_sum).to("int32")
        else:
            group_gemm_func = w8a16_group_gemm
            tokens_per_expert = tokens_expert_prefix_sum
    else:
        group_gemm_func = wi4a16_group_gemm
        tokens_per_expert = tokens_expert_prefix_sum
    return group_gemm_func, tokens_per_expert
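
# A hedged, pure-Paddle sketch of what restore_tokens_per_expert appears to
# compute (inverting the prefix sum into per-expert counts; the exact kernel
# semantics are an assumption here):
#
#   counts = paddle.diff(
#       prefix_sum, prepend=paddle.zeros([1], dtype=prefix_sum.dtype)
#   )  # e.g. [4, 8, 12, 16] -> [4, 4, 4, 4]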


def iluvatar_moe_expert_ffn(
    permute_input: paddle.Tensor,
    tokens_expert_prefix_sum: paddle.Tensor,
    up_gate_proj_weight: paddle.Tensor,
    down_proj_weight: paddle.Tensor,
    up_gate_proj_bias: Optional[paddle.Tensor],
    up_gate_proj_scale: Optional[paddle.Tensor],
    up_gate_proj_zeros: Optional[paddle.Tensor],
    down_proj_scale: Optional[paddle.Tensor],
    down_proj_zeros: Optional[paddle.Tensor],
    quant_method: str,
    group_size: int,
    moe_phase: str,
):
    """Expert FFN over tokens permuted into contiguous per-expert groups:
    fused up/gate projection -> SwiGLU -> down projection."""
    assert up_gate_proj_bias is None
    assert up_gate_proj_scale is not None
    assert down_proj_scale is not None
    if quant_method == "weight_only_int4":
        # int4 weights carry zero points; both projections must provide them.
        assert up_gate_proj_zeros is not None
        assert down_proj_zeros is not None
    group_gemm_func, tokens_per_expert = _pre_process_expert_ffn(moe_phase, quant_method, tokens_expert_prefix_sum)
    # First grouped GEMM: fused up/gate projection for every expert.
    ffn1_output = group_gemm_func(
        permute_input, up_gate_proj_weight, up_gate_proj_scale, up_gate_proj_zeros, tokens_per_expert, group_size
    )
    act_out = swiglu(ffn1_output)
    # Second grouped GEMM: down projection back to the model hidden size.
    output = group_gemm_func(
        act_out, down_proj_weight, down_proj_scale, down_proj_zeros, tokens_per_expert, group_size
    )
    return output
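

# A minimal usage sketch, assuming the Iluvatar custom ops are installed and
# the weights/scales were quantized offline. Every name, shape, and value
# below is an illustrative assumption, not part of this module:
#
#   x = paddle.randn([16, hidden_size], dtype="float16")  # tokens grouped per expert
#   prefix_sum = paddle.to_tensor([4, 8, 12, 16], dtype="int64")  # 4 experts
#   out = iluvatar_moe_expert_ffn(
#       x, prefix_sum,
#       up_gate_proj_weight, down_proj_weight,
#       None,                      # bias must be None
#       up_gate_proj_scale, None,  # zero points only needed for weight_only_int4
#       down_proj_scale, None,
#       quant_method="weight_only_int8",
#       group_size=-1,             # assumed unused for per-channel int8
#       moe_phase="prefill",
#   )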