mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 08:21:53 +08:00
[Feature] Support NVFP4 MoE on SM100 (#6003)
* fp4 dense
* [WIP] support nvfp4, dense part
* [wip] developing loading qwen model
* loading
* update
* dense fp4 OK, cudagraph error
* [WIP] moe forward part
* with flashinfer-backend
* qwen3_moe_fp4
* update
* support flashinfer-cutlass moe, qwen3-moe-fp4 OK
* support ernie4.5-fp4
* fix load error
* add some ut
* add docs
* fix CLA, test
* fix the apply() in ModelOptNvFp4FusedMoE
* fix CodeStyle
* del the PADDLE_COMPATIBLE_API
* fix broken url: nvidia_gpu.md
* fix docs
* fix token_ids
* fix CI in Hopper
* move flashinfer imports inside the function
* fix model_runner Removed the logic for generating random padding IDs.
* Remove skip condition for CUDA version in nvfp4 test
* add test for nvfp4
* fix according to review
* Add Chinese translation link to NVFP4 documentation
* del flashinfer.py
* fix unittest
---------
Co-authored-by: zoooo0820 <zoooo0820@qq.com>
Co-authored-by: bukejiyu <395822456@qq.com>
This commit is contained in:
@@ -34,6 +34,7 @@ QUANTIZATION_METHODS: List[str] = [
|
||||
"mix_quant",
|
||||
"tensor_wise_fp8",
|
||||
"kvcache",
|
||||
"modelopt_fp4",
|
||||
"mxfp4",
|
||||
]
|
||||
|
||||
@@ -133,6 +134,11 @@ def _get_offline_quant_config_name(quantization_config, is_torch_weight, is_v1_l
|
||||
has_block_size = "weight_block_size" in quantization_config
|
||||
if quant_method == "fp8" and has_block_size:
|
||||
quant_config_name = "block_wise_fp8"
|
||||
elif quant_method == "modelopt":
|
||||
if quantization_config.get("quant_algo", "") == "NVFP4":
|
||||
quant_config_name = "modelopt_fp4"
|
||||
else:
|
||||
raise ValueError("modelopt only supports NVFP4 quantization.")
|
||||
elif quant_method == "mxfp4":
|
||||
quant_config_name = "mxfp4"
|
||||
else:
|
||||
@@ -152,6 +158,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]:
|
||||
from .block_wise_fp8 import BlockWiseFP8Config
|
||||
from .kv_cache import KvCacheQuantConfig
|
||||
from .mix_quant import MixQuantConfig
|
||||
from .nvfp4 import ModelOptNvFp4Config
|
||||
from .tensor_wise_fp8 import TensorWiseFP8Config
|
||||
from .w4a8 import W4A8Config
|
||||
from .w4afp8 import W4AFP8Config
|
||||
@@ -176,6 +183,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]:
|
||||
"tensor_wise_fp8": TensorWiseFP8Config,
|
||||
"kvcache": KvCacheQuantConfig,
|
||||
"mix_quant": MixQuantConfig,
|
||||
"modelopt_fp4": ModelOptNvFp4Config,
|
||||
}
|
||||
if envs.FD_MOE_MXFP4_BACKEND is not None:
|
||||
method_to_config["mxfp4"] = MXFP4Config
|
||||
|
||||
Reference in New Issue
Block a user