mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
This reverts commit eb80724b71.
This commit is contained in:
@@ -25,33 +25,16 @@ from fastdeploy.model_executor.layers.linear import (
|
||||
QKVParallelLinear,
|
||||
)
|
||||
from fastdeploy.model_executor.layers.moe import FusedMoE
|
||||
from fastdeploy.model_executor.layers.quantization.fp8_utils import (
|
||||
quant_weight_ue8m0,
|
||||
transform_scale_ue8m0,
|
||||
)
|
||||
from fastdeploy.model_executor.utils import (
|
||||
TensorTracker,
|
||||
process_weight_transpose,
|
||||
set_weight_attrs,
|
||||
)
|
||||
from fastdeploy.platforms import current_platform
|
||||
from fastdeploy.utils import register_custom_python_op
|
||||
|
||||
from ..utils import get_sm_version, get_tensor, per_block_cast_to_fp8
|
||||
from ..utils import get_tensor, per_block_cast_to_fp8
|
||||
from .quant_base import QuantConfigBase, QuantMethodBase
|
||||
|
||||
if current_platform.is_cuda():
|
||||
if get_sm_version() == 100:
|
||||
# SM100 should use PFCC DeepGemm
|
||||
paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
|
||||
from deep_gemm import fp8_gemm_nt
|
||||
else:
|
||||
from fastdeploy.model_executor.ops.gpu.deep_gemm import (
|
||||
gemm_fp8_fp8_bf16_nt as fp8_gemm_nt,
|
||||
)
|
||||
else:
|
||||
fp8_gemm_nt = None
|
||||
|
||||
|
||||
class BlockWiseFP8Config(QuantConfigBase):
|
||||
"""
|
||||
@@ -68,7 +51,6 @@ class BlockWiseFP8Config(QuantConfigBase):
|
||||
self.quant_round_type = 1
|
||||
self.use_deep_gemm = bool(envs.FD_USE_DEEP_GEMM)
|
||||
self.is_checkpoint_bf16 = is_checkpoint_bf16
|
||||
self.deepgemm_scale_ue8m0 = True if get_sm_version() == 100 else False
|
||||
|
||||
def name(self) -> str:
|
||||
return "block_wise_fp8"
|
||||
@@ -99,7 +81,7 @@ class BlockWiseFP8Config(QuantConfigBase):
|
||||
return BlockWiseFP8LinearMethod(self)
|
||||
|
||||
|
||||
def deep_gemm_fp8_gemm_nt_infer_meta(
|
||||
def deep_gemm_fp8_fp8_bf16_nt_infer_meta(
|
||||
x_meta: "paddle.static.MetaTensor",
|
||||
x_scale_tensor_meta: "paddle.static.MetaTensor",
|
||||
layer_weight_meta: "paddle.static.MetaTensor",
|
||||
@@ -111,13 +93,13 @@ def deep_gemm_fp8_gemm_nt_infer_meta(
|
||||
|
||||
|
||||
@register_custom_python_op(
|
||||
name="deep_gemm_fp8_gemm_nt",
|
||||
infer_meta=deep_gemm_fp8_gemm_nt_infer_meta,
|
||||
name="deep_gemm_fp8_fp8_bf16_nt",
|
||||
infer_meta=deep_gemm_fp8_fp8_bf16_nt_infer_meta,
|
||||
input_names=["x", "x_scale_tensor", "layer_weight", "layer_weight_scale_inv", "linear_out_empty"],
|
||||
output_names=["linear_out"],
|
||||
inplace_map={},
|
||||
)
|
||||
def deep_gemm_fp8_gemm_nt(
|
||||
def deep_gemm_fp8_fp8_bf16_nt(
|
||||
x: paddle.Tensor,
|
||||
x_scale_tensor: paddle.Tensor,
|
||||
layer_weight: paddle.Tensor,
|
||||
@@ -125,12 +107,14 @@ def deep_gemm_fp8_gemm_nt(
|
||||
linear_out: paddle.Tensor,
|
||||
layer_output_size: int,
|
||||
):
|
||||
# disable_ue8m0_cast is default False for SM100
|
||||
fp8_gemm_nt(
|
||||
from fastdeploy.model_executor.ops.gpu import deep_gemm
|
||||
|
||||
deep_gemm.gemm_fp8_fp8_bf16_nt(
|
||||
(x, x_scale_tensor),
|
||||
(layer_weight, layer_weight_scale_inv),
|
||||
linear_out,
|
||||
)
|
||||
|
||||
return linear_out
|
||||
|
||||
|
||||
@@ -225,16 +209,8 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
|
||||
def process_weights_after_loading(self, layer) -> None:
|
||||
def _process_quantize():
|
||||
weight_tensor = layer.weight.transpose([1, 0])
|
||||
quanted_weight_tensor, weight_block_scale_tensor = per_block_cast_to_fp8(weight_tensor)
|
||||
|
||||
if not self.quant_config.deepgemm_scale_ue8m0:
|
||||
quanted_weight_tensor, weight_block_scale_tensor = per_block_cast_to_fp8(weight_tensor)
|
||||
else:
|
||||
quanted_weight_tensor, weight_block_scale_tensor = quant_weight_ue8m0(weight_tensor, [128, 128])
|
||||
weight_block_scale_tensor = transform_scale_ue8m0(
|
||||
weight_block_scale_tensor,
|
||||
mn=quanted_weight_tensor.shape[-2],
|
||||
weight_block_size=[128, 128],
|
||||
)
|
||||
if hasattr(layer.weight, "tensor_track"):
|
||||
layer.weight.tensor_track = None
|
||||
layer.weight.value().get_tensor()._clear()
|
||||
@@ -248,12 +224,13 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
|
||||
)
|
||||
layer.weight_scale_inv = layer.create_parameter(
|
||||
shape=weight_block_scale_tensor.shape,
|
||||
dtype=weight_block_scale_tensor.dtype,
|
||||
dtype="float32",
|
||||
is_bias=False,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
)
|
||||
|
||||
layer.weight.copy_(quanted_weight_tensor, False)
|
||||
layer.weight_scale_inv.data = weight_block_scale_tensor
|
||||
layer.weight_scale_inv.copy_(weight_block_scale_tensor, False)
|
||||
|
||||
if self.quant_config.is_checkpoint_bf16:
|
||||
if self.model_format == "torch":
|
||||
@@ -286,24 +263,13 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
|
||||
layer.weight_scale_inv.set_value(weight_scale)
|
||||
|
||||
def apply(self, layer, x):
|
||||
linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16)
|
||||
if x.shape[0] == 0:
|
||||
return linear_out
|
||||
|
||||
x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
|
||||
x,
|
||||
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
|
||||
output_scale_transpose=True,
|
||||
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
|
||||
x, using_pow2_scale=False, output_scale_transpose=True
|
||||
)
|
||||
x_scale_tensor = x_scale_tensor.T[: x.shape[0], ...]
|
||||
deep_gemm_fp8_gemm_nt(
|
||||
x,
|
||||
x_scale_tensor,
|
||||
layer.weight,
|
||||
layer.weight_scale_inv,
|
||||
linear_out,
|
||||
layer_output_size=layer.output_size,
|
||||
x_scale_tensor = x_scale_tensor.T
|
||||
linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16)
|
||||
linear_out = deep_gemm_fp8_fp8_bf16_nt(
|
||||
x, x_scale_tensor, layer.weight, layer.weight_scale_inv, linear_out, layer.output_size
|
||||
)
|
||||
if layer.with_bias:
|
||||
linear_out = paddle.add(linear_out, layer.bias)
|
||||
|
||||
@@ -1,106 +0,0 @@
|
||||
"""
|
||||
# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import paddle
|
||||
|
||||
from fastdeploy.platforms import current_platform
|
||||
|
||||
from ..utils import get_sm_version
|
||||
|
||||
if current_platform.is_cuda():
|
||||
if get_sm_version() == 100:
|
||||
# SM100 should use PFCC DeepGemm
|
||||
paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
|
||||
import deep_gemm
|
||||
else:
|
||||
from fastdeploy.model_executor.ops.gpu import deep_gemm
|
||||
else:
|
||||
deep_gemm = None
|
||||
|
||||
|
||||
def ceil_div(x: int, y: int) -> int:
    """Return the ceiling of ``x / y`` using integer arithmetic only.

    Args:
        x: Dividend.
        y: Divisor; must be non-zero.

    Returns:
        The smallest integer ``q`` such that ``q >= x / y``.

    Raises:
        ZeroDivisionError: If ``y`` is zero.
    """
    # Negated floor division rounds toward +inf for either sign of ``y``;
    # the ``(x + y - 1) // y`` form is only correct when ``y`` is positive.
    return -(-x // y)
|
||||
|
||||
|
||||
def _get_mn_major_tma_aligned_packed_ue8m0_tensor_torch_impl(
    x: paddle.Tensor,
):
    """Convert an FP32 tensor into a TMA-aligned, packed UE8M0 tensor.

    Args:
        x: 2-D ``(mn, k)`` or 3-D ``(b, mn, k)`` tensor whose dtype is
            ``paddle.float``.

    Returns:
        A ``paddle.int`` tensor of shape ``(b, mn, aligned_k // 4)`` (without
        the leading batch dimension when ``x`` was 2-D), sliced out of a
        transposed ``(b, aligned_k // 4, aligned_mn)`` buffer.

    Raises:
        AssertionError: If ``x`` is not a 2-D/3-D ``paddle.float`` tensor.
    """
    from deep_gemm.utils import align, get_tma_aligned_size

    # Input validation: only 2-D or 3-D FP32 tensors are accepted.
    assert x.dtype == paddle.float and x.dim() in (2, 3)

    # Step 1: turn FP32 into a uint8 UE8M0 tensor. Shifting the raw 32-bit
    # pattern right by 23 bits extracts the exponent field, which is then
    # narrowed to an unsigned 8-bit integer.
    exponent_bytes = (x.view(paddle.int) >> 23).to(paddle.uint8)

    # Step 2: pad and pack.
    # Sizes of the last two dimensions of the input.
    mn, k = x.shape[-2], x.shape[-1]
    # A 2-D input gets a leading batch dimension so both ranks share one path.
    is_2d = x.dim() == 2
    batched = x.unsqueeze(0) if is_2d else x
    batch = batched.shape[0]
    device = batched.device

    # TMA-aligned sizes (NOTE(review): the literal 4 is presumably the element
    # size in bytes for ``get_tma_aligned_size`` — confirm against DeepGEMM).
    aligned_mn = get_tma_aligned_size(mn, 4)
    aligned_k = align(k, 4)
    packed_k = aligned_k // 4

    # Zero-filled padded buffer; only the valid region receives data.
    padded_bytes = paddle.zeros((batch, aligned_mn, aligned_k), device=device, dtype=paddle.uint8)
    padded_bytes[:, :mn, :k] = exponent_bytes
    # Pack every 4 uint8 values into one int by reinterpreting the flat buffer.
    packed = padded_bytes.view(-1).view(dtype=paddle.int).view(batch, aligned_mn, packed_k)

    # Step 3: transpose to satisfy the TMA memory-access pattern. Allocating
    # as (batch, packed_k, aligned_mn) and taking ``.mT`` keeps the logical
    # shape (batch, aligned_mn, packed_k) on MN-major storage.
    mn_major = paddle.zeros((batch, packed_k, aligned_mn), device=device, dtype=paddle.int).mT
    mn_major[:, :, :] = packed
    # Keep only the original, non-padded rows.
    result = mn_major[:, :mn, :]

    # Drop the batch dimension again when the input was 2-D.
    if is_2d:
        return result.squeeze(0)
    return result
|
||||
|
||||
|
||||
def transform_scale_ue8m0(sf, mn, weight_block_size=None):
    """Expand block scales per row if requested, then pack them as UE8M0.

    Args:
        sf: Scale-factor tensor.
        mn: Number of rows to expand the scales to along dimension ``-2``;
            only used when ``weight_block_size`` is given.
        weight_block_size: Optional block size; when truthy it must equal
            ``[128, 128]``.

    Returns:
        The result of
        ``_get_mn_major_tma_aligned_packed_ue8m0_tensor_torch_impl`` applied
        to the (possibly row-expanded) scales.

    Raises:
        AssertionError: If ``weight_block_size`` is truthy and not
            ``[128, 128]``.
    """
    if weight_block_size:
        assert weight_block_size == [128, 128]
        # Row i takes the scale of block i // 128, so every row in a 128-row
        # block ends up with a copy of that block's scale.
        row_to_block = paddle.arange(mn, device=sf.device) // 128
        sf = sf.index_select(-2, row_to_block)
    return _get_mn_major_tma_aligned_packed_ue8m0_tensor_torch_impl(sf)
|
||||
|
||||
|
||||
def quant_weight_ue8m0(weight_dequant, weight_block_size):
    """Quantize a bfloat16 weight per block via DeepGEMM with ``use_ue8m0=True``.

    Args:
        weight_dequant: bfloat16 weight of shape ``(*batch_dims, n, k)``.
        weight_block_size: Block size; must equal ``[128, 128]``.

    Returns:
        A tuple ``(out_w, out_s)``: the quantized weight viewed back to
        ``(*batch_dims, n, k)`` and the scales viewed as
        ``(*batch_dims, ceil_div(n, 128), ceil_div(k, 128))``.

    Raises:
        AssertionError: If the block size is not ``[128, 128]`` or the weight
            is not bfloat16.
    """
    assert weight_block_size == [128, 128]
    assert weight_dequant.dtype == paddle.bfloat16, f"{weight_dequant.dtype=} {weight_dequant.shape=}"

    *leading_dims, n, k = weight_dequant.shape
    block_n, block_k = weight_block_size[0], weight_block_size[1]

    # Collapse all leading dimensions so the cast helper receives a 2-D input.
    flat_weight = weight_dequant.view((-1, k))
    flat_quant, flat_scale = deep_gemm.utils.math.per_block_cast_to_fp8(flat_weight, use_ue8m0=True)

    # Restore the original leading dimensions on both outputs.
    weight_shape = (*leading_dims, n, k)
    scale_shape = (*leading_dims, ceil_div(n, block_n), ceil_div(k, block_k))
    return flat_quant.view(weight_shape), flat_scale.view(scale_shape)
|
||||
@@ -17,9 +17,15 @@ from typing import Optional
|
||||
import numpy as np
|
||||
import paddle
|
||||
|
||||
from fastdeploy.model_executor.layers.utils import get_sm_version
|
||||
from fastdeploy.platforms import current_platform
|
||||
|
||||
|
||||
def get_sm_version():
    """Return the CUDA compute capability encoded as ``major * 10 + minor``.

    The properties come from ``paddle.device.cuda.get_device_properties()``
    called without arguments, so they describe whichever device Paddle
    selects by default.
    """
    props = paddle.device.cuda.get_device_properties()
    return props.major * 10 + props.minor
|
||||
|
||||
|
||||
_ENABLE_MACHETE = False
|
||||
if current_platform.is_cuda() and get_sm_version() == 90:
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user