[Feature] Support NVFP4 MoE on SM100 (#6003)

* fp4 dense

* [WIP] support nvfp4, dense part

* [WIP] develop Qwen model loading

* loading

* update

* dense fp4 OK, cudagraph error

* [WIP] moe forward part

* with flashinfer-backend

* qwen3_moe_fp4

* update

* support flashinfer-cutlass moe, qwen3-moe-fp4 OK

* support ernie4.5-fp4

* fix load error

* add some ut

* add docs

* fix CLA, test

* fix the apply() in ModelOptNvFp4FusedMoE

* fix CodeStyle

* del the PADDLE_COMPATIBLE_API

* fix broken url: nvidia_gpu.md

* fix docs

* fix token_ids

* fix CI in Hopper

* move flashinfer imports inside the function

* fix model_runner

Removed the logic for generating random padding IDs.

* Remove skip condition for CUDA version in nvfp4 test

* add test for nvfp4

* fix according to review

* Add Chinese translation link to NVFP4 documentation

* del flashinfer.py

* fix unittest

---------

Co-authored-by: zoooo0820 <zoooo0820@qq.com>
Co-authored-by: bukejiyu <395822456@qq.com>
Author: yuxuan
Date: 2026-01-29 14:16:07 +08:00
Committed by: GitHub
Parent: eb80724b71
Commit: 44b52701f6

8 changed files with 1369 additions and 5 deletions
@@ -34,6 +34,7 @@ QUANTIZATION_METHODS: List[str] = [
     "mix_quant",
     "tensor_wise_fp8",
     "kvcache",
+    "modelopt_fp4",
     "mxfp4",
 ]
@@ -133,6 +134,11 @@ def _get_offline_quant_config_name(quantization_config, is_torch_weight, is_v1_l
     has_block_size = "weight_block_size" in quantization_config
     if quant_method == "fp8" and has_block_size:
         quant_config_name = "block_wise_fp8"
+    elif quant_method == "modelopt":
+        if quantization_config.get("quant_algo", "") == "NVFP4":
+            quant_config_name = "modelopt_fp4"
+        else:
+            raise ValueError("modelopt only supports NVFP4 quantization.")
     elif quant_method == "mxfp4":
         quant_config_name = "mxfp4"
     else:
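The branch added above can be read in isolation as a small name-resolution function: given the checkpoint's `quantization_config` dict, it maps a `modelopt` checkpoint with `quant_algo == "NVFP4"` to the `modelopt_fp4` config name and rejects any other modelopt algorithm. The sketch below is a hedged, self-contained restatement of just that dispatch (the real `_get_offline_quant_config_name` in FastDeploy takes extra arguments and has more branches; `resolve_quant_config_name` is a hypothetical name for illustration):

```python
# Minimal sketch of the quant-config name resolution this PR adds.
# Mirrors the diff hunk above; not the full FastDeploy function.
def resolve_quant_config_name(quantization_config: dict) -> str:
    quant_method = quantization_config.get("quant_method", "")
    has_block_size = "weight_block_size" in quantization_config
    if quant_method == "fp8" and has_block_size:
        return "block_wise_fp8"
    elif quant_method == "modelopt":
        # Only the NVFP4 algorithm is supported for modelopt checkpoints.
        if quantization_config.get("quant_algo", "") == "NVFP4":
            return "modelopt_fp4"
        raise ValueError("modelopt only supports NVFP4 quantization.")
    elif quant_method == "mxfp4":
        return "mxfp4"
    raise ValueError(f"Unsupported quant_method: {quant_method!r}")


if __name__ == "__main__":
    cfg = {"quant_method": "modelopt", "quant_algo": "NVFP4"}
    print(resolve_quant_config_name(cfg))  # modelopt_fp4
```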
@@ -152,6 +158,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]:
     from .block_wise_fp8 import BlockWiseFP8Config
     from .kv_cache import KvCacheQuantConfig
     from .mix_quant import MixQuantConfig
+    from .nvfp4 import ModelOptNvFp4Config
     from .tensor_wise_fp8 import TensorWiseFP8Config
     from .w4a8 import W4A8Config
     from .w4afp8 import W4AFP8Config
@@ -176,6 +183,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]:
         "tensor_wise_fp8": TensorWiseFP8Config,
         "kvcache": KvCacheQuantConfig,
         "mix_quant": MixQuantConfig,
+        "modelopt_fp4": ModelOptNvFp4Config,
     }
     if envs.FD_MOE_MXFP4_BACKEND is not None:
         method_to_config["mxfp4"] = MXFP4Config
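The last two hunks follow a simple registry pattern: `get_quantization_config` keeps a name-to-class mapping and this PR registers `ModelOptNvFp4Config` under `"modelopt_fp4"`. A hedged, self-contained sketch of that pattern (the config classes here are empty stand-ins, not the real FastDeploy implementations):

```python
# Sketch of the name -> config-class registry used by get_quantization_config().
# Classes are placeholders standing in for the real quant-config types.
class QuantConfigBase:
    pass


class ModelOptNvFp4Config(QuantConfigBase):
    pass


class MXFP4Config(QuantConfigBase):
    pass


def get_quantization_config(quantization: str) -> type:
    # Registering a new method is a one-line dict entry, as in the diff.
    method_to_config = {
        "modelopt_fp4": ModelOptNvFp4Config,
        "mxfp4": MXFP4Config,
    }
    try:
        return method_to_config[quantization]
    except KeyError:
        raise ValueError(f"Unknown quantization method: {quantization!r}")


if __name__ == "__main__":
    print(get_quantization_config("modelopt_fp4").__name__)  # ModelOptNvFp4Config
```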