mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
Support MXFP4 for GPT-OSS (#5435)
* Support MXFP4 in GPT-OSS
* Add scope for FlashInfer
* Remove torch code
* Update `envs.FD_MXFP4_BACKEND`
* Update `process_weights_after_loading`
* Update env name
* Support TP in GPT-OSS; add e2e test
* Add `flashinfer-python-paddle` to requirements
* Fix import error
* Add tests
This commit is contained in:
@@ -35,7 +35,7 @@ from fastdeploy.model_executor.utils import (
 )
 from fastdeploy.platforms import current_platform

-from .utils import _set_var_distributed, divide, get_tensor
+from .utils import _set_var_distributed, divide, get_tensor, modules_to_convert


 class UnquantizedLinearMethod(QuantMethodBase):
@@ -168,7 +168,12 @@ class LinearBase(nn.Layer):
             self.output_size,
         ]

-        if fd_config.quant_config and not skip_quant and fd_config.quant_config.get_quant_method(self):
+        if (
+            fd_config.quant_config
+            and not skip_quant
+            and modules_to_convert(prefix, self.fd_config)
+            and fd_config.quant_config.get_quant_method(self)
+        ):
             self.quant_method = fd_config.quant_config.get_quant_method(self)
         else:
             self.quant_method: Optional[QuantMethodBase] = UnquantizedLinearMethod()
Reference in New Issue
Block a user