[Feature] FD_USE_PHI_FP8_QUANT (#6320)

* add ut

* add use_fd_quant env

* rm mask_per_token_quant

* add make ops list

* USE_FD_FP8_QUANT -> FD_USE_PHI_FP8_QUANT, defaults to true

* modify comments

* use bool type

* Add function declaration
This commit is contained in:
fxyfxy777
2026-02-04 14:33:03 +08:00
committed by GitHub
parent 2ffcb3d9ed
commit 36547cfdb3
8 changed files with 634 additions and 51 deletions
@@ -18,6 +18,7 @@ from typing import Optional
import paddle
import fastdeploy
from fastdeploy import envs
from fastdeploy.model_executor.layers.linear import (
MergedColumnParallelLinear,
@@ -289,14 +290,18 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16)
if x.shape[0] == 0:
return linear_out
x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
x,
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
output_scale_transpose=True,
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
)
x_scale_tensor = x_scale_tensor.T[: x.shape[0], ...]
if not fastdeploy.envs.FD_USE_PHI_FP8_QUANT:
x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding(
x, self.quant_config.weight_block_size[0]
)
else:
x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
x,
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
output_scale_transpose=True,
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
)
x_scale_tensor = x_scale_tensor.T[: x.shape[0], ...]
deep_gemm_fp8_gemm_nt(
x,
x_scale_tensor,