mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Feature] FD_USE_PHI_FP8_QUANT (#6320)
* add ut * add use_fd_quant env * rm mask_per_token_quant * add make ops list * USE_FD_FP8_QUANT -> FD_USE_PHI_FP8_QUANT 默认是true * modify comments * use bool type * Add function declaration
This commit is contained in:
@@ -18,6 +18,7 @@ from typing import Optional
|
||||
|
||||
import paddle
|
||||
|
||||
import fastdeploy
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.model_executor.layers.linear import (
|
||||
MergedColumnParallelLinear,
|
||||
@@ -289,14 +290,18 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
|
||||
linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16)
|
||||
if x.shape[0] == 0:
|
||||
return linear_out
|
||||
|
||||
x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
|
||||
x,
|
||||
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
|
||||
output_scale_transpose=True,
|
||||
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
|
||||
)
|
||||
x_scale_tensor = x_scale_tensor.T[: x.shape[0], ...]
|
||||
if not fastdeploy.envs.FD_USE_PHI_FP8_QUANT:
|
||||
x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding(
|
||||
x, self.quant_config.weight_block_size[0]
|
||||
)
|
||||
else:
|
||||
x, x_scale_tensor = paddle.incubate.nn.functional.fp8_quant_blockwise(
|
||||
x,
|
||||
using_pow2_scale=self.quant_config.deepgemm_scale_ue8m0,
|
||||
output_scale_transpose=True,
|
||||
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
|
||||
)
|
||||
x_scale_tensor = x_scale_tensor.T[: x.shape[0], ...]
|
||||
deep_gemm_fp8_gemm_nt(
|
||||
x,
|
||||
x_scale_tensor,
|
||||
|
||||
Reference in New Issue
Block a user