[Feature] Unify fp8 block_wise quant ops (#5991)

* quant stash

* blockwise_quant

* precommit

* rm tensor.cut

* tp ok

* add swiglu

* rm outdated code

* fix activate ut

* change baseline

* fix baseline error
This commit is contained in:
fxyfxy777
2026-01-15 21:50:37 +08:00
committed by GitHub
parent d38cd8b40b
commit 4c92035f2d
17 changed files with 55 additions and 571 deletions
@@ -18,7 +18,6 @@ from typing import Optional
import paddle
import fastdeploy
from fastdeploy import envs
from fastdeploy.model_executor.layers.linear import (
MergedColumnParallelLinear,
@@ -264,9 +263,10 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
layer.weight_scale_inv.set_value(weight_scale)
def apply(self, layer, x):
x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding(
x, self.quant_config.weight_block_size[0]
x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
x, using_pow2_scale=False, output_scale_transpose=True
)
x_scale_tensor = x_scale_tensor.T
linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16)
linear_out = deep_gemm_fp8_fp8_bf16_nt(
x, x_scale_tensor, layer.weight, layer.weight_scale_inv, linear_out, layer.output_size