Support MXFP4 for GPT-OSS (#5435)

* support mxfp4 in gpt-oss

* support mxfp4 in gpt-oss

* add scope for flashinfer

* remove torch code

* update envs.FD_MXFP4_BACKEND

* update process_weights_after_loading

* update env name

* support tp in gpt-oss, add e2e test

* add flashinfer-python-paddle in requirements

* fix import error

* add test

* add test

* add test

* add test
This commit is contained in:
Haonan Luo
2026-01-22 14:21:01 +08:00
committed by GitHub
parent 309c7d9764
commit 82057cb71f
13 changed files with 670 additions and 25 deletions
+7 -2
View File
@@ -35,7 +35,7 @@ from fastdeploy.model_executor.utils import (
)
from fastdeploy.platforms import current_platform
-from .utils import _set_var_distributed, divide, get_tensor
+from .utils import _set_var_distributed, divide, get_tensor, modules_to_convert
class UnquantizedLinearMethod(QuantMethodBase):
@@ -168,7 +168,12 @@ class LinearBase(nn.Layer):
self.output_size,
]
-        if fd_config.quant_config and not skip_quant and fd_config.quant_config.get_quant_method(self):
+        if (
+            fd_config.quant_config
+            and not skip_quant
+            and modules_to_convert(prefix, self.fd_config)
+            and fd_config.quant_config.get_quant_method(self)
+        ):
self.quant_method = fd_config.quant_config.get_quant_method(self)
else:
self.quant_method: Optional[QuantMethodBase] = UnquantizedLinearMethod()