[Feature] Unify fp8 block_wise quant ops (#5991)

* quant stash

* blockwise_quant

* precommit

* rm tensor.cut

* tp ok

* add swiglu

* rm outdated code

* fix activate ut

* change baseline

* fix baseline error
This commit is contained in:
fxyfxy777
2026-01-15 21:50:37 +08:00
committed by GitHub
parent d38cd8b40b
commit 4c92035f2d
17 changed files with 55 additions and 571 deletions
@@ -18,7 +18,6 @@ from typing import Optional
import paddle
import fastdeploy
from fastdeploy import envs
from fastdeploy.model_executor.layers.linear import (
MergedColumnParallelLinear,
@@ -264,9 +263,10 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
layer.weight_scale_inv.set_value(weight_scale)
def apply(self, layer, x):
x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding(
x, self.quant_config.weight_block_size[0]
x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
x, using_pow2_scale=False, output_scale_transpose=True
)
x_scale_tensor = x_scale_tensor.T
linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16)
linear_out = deep_gemm_fp8_fp8_bf16_nt(
x, x_scale_tensor, layer.weight, layer.weight_scale_inv, linear_out, layer.output_size