mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 08:21:53 +08:00
[Quantization] Support to load static quant ue8m0 scale of DeepGEMM via v0_loader (#6433)
* support to load static quant ue8m0 scale of deepgemm via v0_loader * [Fix] Fix ue8m0 scale pack dimension calculation and block size validation 1. Fix pack dimension calculation in fused_moe_triton_backend.py: - Changed from `ceil_div(...) // 4` to `(num_scales + 3) // 4` for correct ceiling division - This ensures sufficient pack allocation when num_scales is not a multiple of 4 2. Fix block size hardcoding in block_wise_fp8.py: - Use `self.quant_config.weight_block_size` instead of hardcoded `[128, 128]` - Add assertion to ensure weight_block_size is `[128, 128]` for ue8m0 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -176,12 +176,22 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
|
||||
)
|
||||
else:
|
||||
layer.weight_shape.reverse()
|
||||
weight_scale_inv_shape = [
|
||||
(layer.weight_shape[0] + self.quant_config.weight_block_size[0] - 1)
|
||||
// self.quant_config.weight_block_size[0],
|
||||
(layer.weight_shape[1] + self.quant_config.weight_block_size[1] - 1)
|
||||
// self.quant_config.weight_block_size[1],
|
||||
]
|
||||
if not self.quant_config.deepgemm_scale_ue8m0:
|
||||
weight_scale_inv_shape = [
|
||||
(layer.weight_shape[0] + self.quant_config.weight_block_size[0] - 1)
|
||||
// self.quant_config.weight_block_size[0],
|
||||
(layer.weight_shape[1] + self.quant_config.weight_block_size[1] - 1)
|
||||
// self.quant_config.weight_block_size[1],
|
||||
]
|
||||
else:
|
||||
num_scales = (
|
||||
layer.weight_shape[1] + self.quant_config.weight_block_size[1] - 1
|
||||
) // self.quant_config.weight_block_size[1]
|
||||
num_scale_packs = (num_scales + 3) // 4
|
||||
weight_scale_inv_shape = [
|
||||
layer.weight_shape[0],
|
||||
num_scale_packs,
|
||||
]
|
||||
|
||||
if self.model_format != "torch" and layer.fd_config.load_config.load_choices == "default_v1":
|
||||
weight_shape = layer.weight_shape[::-1]
|
||||
@@ -204,11 +214,19 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
)
|
||||
|
||||
layer.weight_scale_inv = layer.create_parameter(
|
||||
shape=weight_scale_inv_shape,
|
||||
dtype="float32",
|
||||
is_bias=False,
|
||||
)
|
||||
if not self.quant_config.deepgemm_scale_ue8m0:
|
||||
layer.weight_scale_inv = layer.create_parameter(
|
||||
shape=weight_scale_inv_shape,
|
||||
dtype="float32",
|
||||
is_bias=False,
|
||||
)
|
||||
else:
|
||||
layer.weight_scale_inv = layer.create_parameter(
|
||||
shape=weight_scale_inv_shape,
|
||||
dtype="int32",
|
||||
is_bias=False,
|
||||
default_initializer=paddle.nn.initializer.Constant(0),
|
||||
)
|
||||
|
||||
set_weight_attrs(
|
||||
layer.weight,
|
||||
@@ -268,9 +286,22 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
|
||||
|
||||
def process_loaded_weights(self, layer, weights) -> None:
|
||||
weight_tensor = weights.transpose([1, 0])
|
||||
quanted_weight_tensor, weight_block_scale_tensor = per_block_cast_to_fp8(weight_tensor)
|
||||
if not self.quant_config.deepgemm_scale_ue8m0:
|
||||
quanted_weight_tensor, weight_block_scale_tensor = per_block_cast_to_fp8(weight_tensor)
|
||||
else:
|
||||
weight_block_size = self.quant_config.weight_block_size
|
||||
assert weight_block_size == [
|
||||
128,
|
||||
128,
|
||||
], f"weight_block_size must be [128, 128] for ue8m0, but got {weight_block_size}"
|
||||
quanted_weight_tensor, weight_block_scale_tensor = quant_weight_ue8m0(weight_tensor, weight_block_size)
|
||||
weight_block_scale_tensor = transform_scale_ue8m0(
|
||||
weight_block_scale_tensor,
|
||||
mn=quanted_weight_tensor.shape[-2],
|
||||
weight_block_size=weight_block_size,
|
||||
)
|
||||
layer.weight.copy_(quanted_weight_tensor, False)
|
||||
layer.weight_scale_inv.set_value(weight_block_scale_tensor)
|
||||
layer.weight_scale_inv.data = weight_block_scale_tensor
|
||||
|
||||
def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = False):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user