Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2026-04-23 00:17:25 +08:00.
[Feature] Unify fp8 block_wise quant ops (#5991)

* quant stash
* blockwise_quant
* precommit
* rm tensor.cut
* tp ok
* add swiglu
* rm outdate code
* fix activate ut
* change baseline
* fix baseline error
This commit is contained in:
@@ -18,7 +18,6 @@ from typing import Optional

 import paddle

-import fastdeploy
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.linear import (
     MergedColumnParallelLinear,
@@ -264,9 +263,10 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
         layer.weight_scale_inv.set_value(weight_scale)

     def apply(self, layer, x):
-        x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding(
-            x, self.quant_config.weight_block_size[0]
-        )
+        x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
+            x, using_pow2_scale=False, output_scale_transpose=True
+        )
+        x_scale_tensor = x_scale_tensor.T
         linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16)
         linear_out = deep_gemm_fp8_fp8_bf16_nt(
             x, x_scale_tensor, layer.weight, layer.weight_scale_inv, linear_out, layer.output_size
||||
Reference in New Issue
Block a user