Support MXFP4 for GPT-OSS (#5435)

* support mxfp4 in gpt-oss

* support mxfp4 in gpt-oss

* add scope for flashinfer

* remove torch code

* update envs.FD_MXFP4_BACKEND

* update process_weights_after_loading

* update env name

* support tp in gpt-oss, add e2e test

* add flashinfer-python-paddle in requirements

* fix import error

* add test

* add test

* add test

* add test
This commit is contained in:
Haonan Luo
2026-01-22 14:21:01 +08:00
committed by GitHub
parent 309c7d9764
commit 82057cb71f
13 changed files with 670 additions and 25 deletions
+7 -2
View File
@@ -35,7 +35,7 @@ from fastdeploy.model_executor.utils import (
)
from fastdeploy.platforms import current_platform
-from .utils import _set_var_distributed, divide, get_tensor
+from .utils import _set_var_distributed, divide, get_tensor, modules_to_convert
class UnquantizedLinearMethod(QuantMethodBase):
@@ -168,7 +168,12 @@ class LinearBase(nn.Layer):
self.output_size,
]
-        if fd_config.quant_config and not skip_quant and fd_config.quant_config.get_quant_method(self):
+        if (
+            fd_config.quant_config
+            and not skip_quant
+            and modules_to_convert(prefix, self.fd_config)
+            and fd_config.quant_config.get_quant_method(self)
+        ):
self.quant_method = fd_config.quant_config.get_quant_method(self)
else:
self.quant_method: Optional[QuantMethodBase] = UnquantizedLinearMethod()