From 5f612a348d5485c229ff4d4a2b3750cc66456d0c Mon Sep 17 00:00:00 2001 From: lizexu123 <39205361+lizexu123@users.noreply.github.com> Date: Fri, 3 Apr 2026 15:43:19 +0800 Subject: [PATCH] [BugFix] fix flashinfer-cutedsl moe nvfp4 (#7120) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix nvfp4 * fix * add document * fix nvfp4 * support eb5 * support bka * support eb5 * support xpu * fix * fix * add import cutedsl * fix * fix * fix test * fix H卡 * update document * fix * update document * update document * fix --- docs/quantization/nvfp4.md | 64 ++++++- docs/zh/quantization/nvfp4.md | 67 ++++++++ fastdeploy/envs.py | 2 +- .../layers/moe/flashinfer_cutedsl_moe.py | 20 ++- .../layers/quantization/__init__.py | 9 +- .../layers/quantization/nvfp4.py | 161 ++++++++++-------- .../layers/quantization/quant_base.py | 16 ++ tests/quantization/test_modelopt_nvfp4.py | 68 +++++++- 8 files changed, 317 insertions(+), 90 deletions(-) diff --git a/docs/quantization/nvfp4.md b/docs/quantization/nvfp4.md index 929bdf594b..954517b45d 100644 --- a/docs/quantization/nvfp4.md +++ b/docs/quantization/nvfp4.md @@ -18,7 +18,38 @@ Based on [FlashInfer](https://github.com/flashinfer-ai/flashinfer), Fastdeploy s Please ensure that FastDeploy is installed with NVIDIA GPU support. Follow the official guide to set up the base environment: [Fastdeploy NVIDIA GPU Environment Installation Guide](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/). -### Running Inference Service +### FlashInfer-cutedsl backend + +#### PaddlePaddle Compatibility Patches for FlashInfer + +Due to compatibility issues between FlashInfer and PaddlePaddle, you need to apply the following patches in `miniconda/envs//lib/python3.10/site-packages/`: + +1. **nvidia_cutlass_dsl/python_packages/cutlass/torch.py** + + Replace `torch.device` with `"torch.device"` (as a string to avoid conflicts). + +2. **flashinfer/utils.py** + + Modify the `get_compute_capability` function: + ```bash + @functools.cache + def get_compute_capability(device: torch.device) -> Tuple[int, int]: + return torch.cuda.get_device_capability(device) + if device.type != "cuda": + raise ValueError("device must be a cuda device") + return torch.cuda.get_device_capability(device.index) + ``` + +3. **flashinfer/cute_dsl/blockscaled_gemm.py** + + Replace `cutlass_torch.current_stream()` with: + ```bash + cuda.CUstream(torch.cuda.current_stream().stream_base.raw_stream) + ``` + +#### Running Inference Service + +flashinfer-cutlass backend: ```bash python -m fastdeploy.entrypoints.openai.api_server \ --model nv-community/Qwen3-30B-A3B-FP4 \ @@ -31,6 +62,26 @@ python -m fastdeploy.entrypoints.openai.api_server \ --max-num-seqs 128 ``` +flashinfer-cutedsl backend: +```bash +python -m fastdeploy.entrypoints.openai.multi_api_server \ + --ports "9811,9812,9813,9814" \ + --num-servers 4 \ + --model ERNIE-4.5-21B-A3B-FP4 \ + --disable-custom-all-reduce \ + --tensor-parallel-size 1 \ + --data-parallel-size 4 \ + --no-enable-prefix-caching \ + --max-model-len 65536 \ + --enable-expert-parallel \ + --num-gpu-blocks-override 8192 \ + --max-num-seqs 4 \ + --gpu-memory-utilization 0.9 \ + --max-num-batched-tokens 512 \ + --ep-prefill-use-worst-num-tokens \ + --graph-optimization-config '{"use_cudagraph":false}' +``` + ### API Access Make service requests using the following command @@ -43,6 +94,15 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ ] }' ``` +```shell +curl -X POST "http://0.0.0.0:9811/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "把李白的静夜思改写为现代诗"} + ] +}' +``` FastDeploy service interface is compatible with OpenAI protocol. You can make service requests using the following Python code. @@ -64,4 +124,4 @@ for chunk in response: if chunk.choices[0].delta: print(chunk.choices[0].delta.content, end='') print('\n') -```. +``` diff --git a/docs/zh/quantization/nvfp4.md b/docs/zh/quantization/nvfp4.md index 656cc267af..f9cd645171 100644 --- a/docs/zh/quantization/nvfp4.md +++ b/docs/zh/quantization/nvfp4.md @@ -18,6 +18,8 @@ NVFP4 是 NVIDIA 引入的创新 4 位浮点格式,详细介绍请参考[Intro FastDeploy 需以 NVIDIA GPU 模式安装,具体安装方式请参考官方文档:[Fastdeploy NVIDIA GPU 环境安装指南](https://paddlepaddle.github.io/FastDeploy/zh/get_started/installation/nvidia_gpu/)。 ### 运行推理服务 + +flashinfer-cutlass后端: ```bash python -m fastdeploy.entrypoints.openai.api_server \ --model nv-community/Qwen3-30B-A3B-FP4 \ @@ -30,6 +32,62 @@ python -m fastdeploy.entrypoints.openai.api_server \ --max-num-seqs 128 ``` +### flashinfer-cutedsl后端: + +#### PaddlePaddle 兼容性补丁 + +由于 FlashInfer 与 PaddlePaddle 之间存在兼容性问题,需要在 `miniconda/envs//lib/python3.10/site-packages/` 中应用以下补丁: + +1. **nvidia_cutlass_dsl/python_packages/cutlass/torch.py** + + 将 `torch.device` 替换为 `"torch.device"`(作为字符串以避免冲突)。 + +2. **flashinfer/utils.py** + + 修改 `get_compute_capability` 函数: + ```bash + @functools.cache + def get_compute_capability(device: torch.device) -> Tuple[int, int]: + return torch.cuda.get_device_capability(device) + if device.type != "cuda": + raise ValueError("device must be a cuda device") + return torch.cuda.get_device_capability(device.index) + ``` + +3. **flashinfer/cute_dsl/blockscaled_gemm.py** + + 将 `cutlass_torch.current_stream()` 替换为: + ```bash + cuda.CUstream(torch.cuda.current_stream().stream_base.raw_stream) + ``` + +### 运行推理服务 + +```bash +export FD_MOE_BACKEND="flashinfer-cutedsl" +export FD_USE_PFCC_DEEP_EP=1 +export CUDA_VISIBLE_DEVICES=4,5,6,7 + + + +python -m fastdeploy.entrypoints.openai.multi_api_server \ + --ports "9811,9812,9813,9814" \ + --num-servers 4 \ + --model ERNIE-4.5-21B-A3B-FP4 \ + --disable-custom-all-reduce \ + --tensor-parallel-size 1 \ + --data-parallel-size 4 \ + --no-enable-prefix-caching \ + --max-model-len 65536 \ + --enable-expert-parallel \ + --num-gpu-blocks-override 8192 \ + --max-num-seqs 4 \ + --gpu-memory-utilization 0.9 \ + --max-num-batched-tokens 512 \ + --ep-prefill-use-worst-num-tokens \ + --graph-optimization-config '{"use_cudagraph":false}' +``` + ### 接口访问 通过如下命令发起服务请求 @@ -42,6 +100,15 @@ curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ ] }' ``` +```shell +curl -X POST "http://0.0.0.0:9811/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "把李白的静夜思改写为现代诗"} + ] +}' +``` FastDeploy服务接口兼容OpenAI协议,可以通过如下Python代码发起服务请求。 diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 0c7ac3e22b..40cf43679e 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -75,7 +75,7 @@ environment_variables: dict[str, Callable[[], Any]] = { # Set moe backend."cutlass","marlin", "triton", "flashinfer-cutlass", "flashinfer-cutedsl" and "flashinfer-trtllm" can be set currently. "FD_MOE_BACKEND": lambda: os.getenv("FD_MOE_BACKEND", "cutlass"), # Set nvfp4 load interleaved weight scale. - "FD_NVFP4_LOAD_BLOCKSCALE_LEAVE": lambda: os.getenv("FD_NVFP4_LOAD_BLOCKSCALE_LEAVE", "0"), + "FD_NVFP4_LOAD_BLOCKSCALE_LEAVE": lambda: bool(int(os.getenv("FD_NVFP4_LOAD_BLOCKSCALE_LEAVE", "0"))), # Set mxfp4 backend."flashinfer" can be set currently. "FD_MOE_MXFP4_BACKEND": lambda: os.getenv("FD_MOE_MXFP4_BACKEND", "flashinfer"), # Whether to use Machete for wint4 dense gemm. diff --git a/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py b/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py index b449c246cc..654b9090ec 100644 --- a/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py +++ b/fastdeploy/model_executor/layers/moe/flashinfer_cutedsl_moe.py @@ -18,7 +18,20 @@ from typing import Any, Optional import paddle -paddle.compat.enable_torch_proxy(scope={"flashinfer"}) +from fastdeploy.model_executor.layers.quantization.quant_base import is_nvfp4_supported + +# Only import flashinfer on supported GPUs (B卡) +if is_nvfp4_supported(): + from flashinfer import ( + scaled_fp4_grouped_quantize, + silu_and_mul_scaled_nvfp4_experts_quantize, + ) + from flashinfer.cute_dsl.blockscaled_gemm import grouped_gemm_nt_masked +else: + # Not B卡, skip flashinfer imports + scaled_fp4_grouped_quantize = None + silu_and_mul_scaled_nvfp4_experts_quantize = None + grouped_gemm_nt_masked = None def _dtype_str(dtype) -> str: @@ -87,11 +100,6 @@ def flashinfer_cutedsl_moe_masked( Returns: paddle.Tensor: [num_experts, m, k] bf16 """ - from flashinfer import ( - scaled_fp4_grouped_quantize, - silu_and_mul_scaled_nvfp4_experts_quantize, - ) - from flashinfer.cute_dsl.blockscaled_gemm import grouped_gemm_nt_masked # === Dtype assertions === # Use string-based dtype check to be compatible with both paddle and torch proxy tensors diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index 3e9e34c54a..85fb3c4e86 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -88,6 +88,7 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader): quant_config_name = _get_offline_quant_config_name( quantization_config, model_config.model_format == "torch", is_v1_loader ) + elif args.quantization is not None: quantization_config = {} try: @@ -161,7 +162,10 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: from .block_wise_fp8 import BlockWiseFP8Config from .kv_cache import KvCacheQuantConfig from .mix_quant import MixQuantConfig - from .nvfp4 import ModelOptNvFp4Config + + if quantization == "modelopt_fp4": + from .nvfp4 import ModelOptNvFp4Config + from .tensor_wise_fp8 import TensorWiseFP8Config from .w4a8 import W4A8Config from .w4afp8 import W4AFP8Config @@ -186,9 +190,10 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: "tensor_wise_fp8": TensorWiseFP8Config, "kvcache": KvCacheQuantConfig, "mix_quant": MixQuantConfig, - "modelopt_fp4": ModelOptNvFp4Config, } if envs.FD_MOE_MXFP4_BACKEND is not None: method_to_config["mxfp4"] = MXFP4Config + if quantization == "modelopt_fp4": + method_to_config["modelopt_fp4"] = ModelOptNvFp4Config return method_to_config[quantization] diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 196f6af675..8e5ee3bbe1 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -28,75 +28,105 @@ from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import MoEMetho from fastdeploy.model_executor.utils import ( create_parameter_and_copy, free_tensor, + get_sm_version, set_weight_attrs, ) -from .quant_base import QuantConfigBase, QuantMethodBase +from .quant_base import QuantConfigBase, QuantMethodBase, is_nvfp4_supported -paddle.compat.enable_torch_proxy(scope={"flashinfer"}) +# Only import flashinfer on supported GPUs (B卡) +if is_nvfp4_supported(): + paddle.compat.enable_torch_proxy(scope={"flashinfer"}) -from fastdeploy.platforms import current_platform + from flashinfer import fp4_quantize, mm_fp4 + from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe -if current_platform.is_cuda(): + from fastdeploy.model_executor.layers.moe.ep import deep_ep from fastdeploy.model_executor.ops.gpu import ( depermute_prefill_combine, prefill_permute_to_masked_gemm, ) - def call_prefill_permute_to_masked_gemm( - x: paddle.Tensor, - scale: paddle.Tensor, - topk_ids: paddle.Tensor, - num_local_experts: int, - max_token_num: int, - ): - """ - Permute input tokens and scales from token-major to expert-major layout - for MoE masked GEMM operations. + if envs.FD_MOE_BACKEND == "flashinfer-cutedsl": + logger.info( + "FlashInfer cutedsl is slow to import because it triggers JIT compilation of " + "CUDA kernels via TVM/CODEGEN, and cuBLASLt initializes lookup tables and " + "compiles GEMM kernels during first load. This may take several minutes. " + "The wait is expected and only happens once per process." + ) + from fastdeploy.model_executor.layers.moe.flashinfer_cutedsl_moe import ( + flashinfer_cutedsl_moe_masked, + ) +else: + # Not B卡, skip flashinfer imports + deep_ep = None + depermute_prefill_combine = None + prefill_permute_to_masked_gemm = None + flashinfer_cutedsl_moe_masked = None + fp4_quantize = None + mm_fp4 = None + flashinfer_cutlass_fused_moe = None + logger.warning( + f"NVFP4 requires Blackwell GPU (SM >= 100), " + f"current GPU has SM {get_sm_version()}. Skipping flashinfer imports." + ) - Args: - x: Input hidden states [num_tokens, hidden]. - scale: Input scales [num_tokens, hidden_scale]. - topk_ids: Expert routing indices [num_tokens, topk] (int64 or int32). - num_local_experts: Number of local experts on this device. - max_token_num: Maximum tokens per expert buffer. - Returns: - tuple: (permute_x, permute_scale, permuted_indice_map, token_nums_per_expert) - """ - if topk_ids.dtype != paddle.int64: - topk_ids = topk_ids.cast(paddle.int64) +def call_prefill_permute_to_masked_gemm( + x: paddle.Tensor, + scale: paddle.Tensor, + topk_ids: paddle.Tensor, + num_local_experts: int, + max_token_num: int, +): + """ + Permute input tokens and scales from token-major to expert-major layout + for MoE masked GEMM operations. - # NVFP4 dispatch returns plain BF16 (no fp8 scale); pass empty tensor so the - # C++ op can detect the no-scale path via tensor.numel() == 0. - if scale is None: - scale = paddle.empty([0], dtype=paddle.float32) + Args: + x: Input hidden states [num_tokens, hidden]. + scale: Input scales [num_tokens, hidden_scale]. + topk_ids: Expert routing indices [num_tokens, topk] (int64 or int32). + num_local_experts: Number of local experts on this device. + max_token_num: Maximum tokens per expert buffer. - results = prefill_permute_to_masked_gemm(x, scale, topk_ids, num_local_experts, max_token_num) + Returns: + tuple: (permute_x, permute_scale, permuted_indice_map, token_nums_per_expert) + """ + if topk_ids.dtype != paddle.int64: + topk_ids = topk_ids.cast(paddle.int64) - return results[0], results[1], results[2], results[3] + # NVFP4 dispatch returns plain BF16 (no fp8 scale); pass empty tensor so the + # C++ op can detect the no-scale path via tensor.numel() == 0. + if scale is None: + scale = paddle.empty([0], dtype=paddle.float32) - def call_depermute_prefill_combine( - x: paddle.Tensor, - indice_map: paddle.Tensor, - topk_weights: paddle.Tensor, - num_worst_tokens: int, - ): - """ - Depermute and combine expert outputs back to token-major layout. + results = prefill_permute_to_masked_gemm(x, scale, topk_ids, num_local_experts, max_token_num) - Args: - x: Expert outputs [num_local_experts, max_tokens_per_expert, hidden]. - indice_map: Flat index tensor [num_worst_tokens, topk] (int32). - topk_weights: Combination weights [num_worst_tokens, topk] (float32). - num_worst_tokens: Number of output tokens to produce. + return results[0], results[1], results[2], results[3] - Returns: - depermuted_x: Combined output [num_worst_tokens, hidden]. - """ - results = depermute_prefill_combine(x, indice_map, topk_weights, num_worst_tokens) - return results +def call_depermute_prefill_combine( + x: paddle.Tensor, + indice_map: paddle.Tensor, + topk_weights: paddle.Tensor, + num_worst_tokens: int, +): + """ + Depermute and combine expert outputs back to token-major layout. + + Args: + x: Expert outputs [num_local_experts, max_tokens_per_expert, hidden]. + indice_map: Flat index tensor [num_worst_tokens, topk] (int32). + topk_weights: Combination weights [num_worst_tokens, topk] (float32). + num_worst_tokens: Number of output tokens to produce. + + Returns: + depermuted_x: Combined output [num_worst_tokens, hidden]. + """ + results = depermute_prefill_combine(x, indice_map, topk_weights, num_worst_tokens) + + return results def next_power_of_2(n: int): @@ -389,8 +419,6 @@ class ModelOptNvFp4LinearMethod(QuantMethodBase): output_dtype = x.dtype # Quantize BF16 or FP16 to (FP4 and interleaved block scale) - from flashinfer import fp4_quantize - x_fp4, x_scale_interleaved = fp4_quantize(x, layer.input_scale_inv) assert x_fp4.dtype == paddle.uint8 @@ -409,9 +437,8 @@ class ModelOptNvFp4LinearMethod(QuantMethodBase): if backend == "cutlass": x_scale_interleaved = x_scale_interleaved.view(paddle.uint8) w_scale_interleaved = w_scale_interleaved.view(paddle.uint8) - from flashinfer import mm_fp4 as fp4_gemm - out = fp4_gemm(x_fp4, w, x_scale_interleaved, w_scale_interleaved, layer.alpha, output_dtype, backend=backend) + out = mm_fp4(x_fp4, w, x_scale_interleaved, w_scale_interleaved, layer.alpha, output_dtype, backend=backend) if layer.with_bias: out = paddle.add(out, layer.bias) assert out.shape == output_shape @@ -564,9 +591,14 @@ class ModelOptNvFp4FusedMoE(MoEMethodBase): set_weight_attrs(layer.up_gate_proj_input_scale, {**extra_weight_attrs, "weight_type": "input_scale"}) set_weight_attrs(layer.down_proj_input_scale, {**extra_weight_attrs, "weight_type": "input_scale"}) + @property + def load_up_proj_weight_first(self) -> bool: + # FlashInfer CUTLASS kernel assumes [Up, Gate] Proj as W13 + if self.backend == "flashinfer-cutlass": + return True + def process_weights_after_loading(self, layer): """ """ - # FlashInfer CUTLASS kernel assumes [Up, Gate] Proj as W13 if self.backend == "flashinfer-cutlass": @@ -606,18 +638,20 @@ class ModelOptNvFp4FusedMoE(MoEMethodBase): up_gate_proj_blockscale_swizzled = layer.up_gate_proj_weight_scale else: up_gate_proj_blockscale_swizzled = _process_scale_interleaved(layer.up_gate_proj_weight_scale) - free_tensor(layer.up_gate_proj_weight_scale) - layer.up_gate_proj_weight_scale = None create_parameter_and_copy( layer, name="up_gate_proj_blockscale_swizzled", weight=up_gate_proj_blockscale_swizzled ) + + free_tensor(layer.up_gate_proj_weight_scale) + layer.up_gate_proj_weight_scale = None + if envs.FD_NVFP4_LOAD_BLOCKSCALE_LEAVE: down_proj_blockscale_swizzled = layer.down_proj_weight_scale else: down_proj_blockscale_swizzled = _process_scale_interleaved(layer.down_proj_weight_scale) + create_parameter_and_copy(layer, name="down_proj_blockscale_swizzled", weight=down_proj_blockscale_swizzled) free_tensor(layer.down_proj_weight_scale) layer.down_proj_weight_scale = None - create_parameter_and_copy(layer, name="down_proj_blockscale_swizzled", weight=down_proj_blockscale_swizzled) def apply_ep_prefill( self, @@ -628,11 +662,6 @@ class ModelOptNvFp4FusedMoE(MoEMethodBase): shared_experts: nn.Layer = None, ) -> paddle.Tensor: - from fastdeploy.model_executor.layers.moe.ep import deep_ep - from fastdeploy.model_executor.layers.moe.flashinfer_cutedsl_moe import ( - flashinfer_cutedsl_moe_masked, - ) - # 1. top experts and weights gate_out = gate(x.cast("float32")) topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) @@ -741,10 +770,6 @@ class ModelOptNvFp4FusedMoE(MoEMethodBase): shared_experts: nn.Layer = None, ) -> paddle.Tensor: - from fastdeploy.model_executor.layers.moe.flashinfer_cutedsl_moe import ( - flashinfer_cutedsl_moe_masked, - ) - gate_out = gate(x.cast("float32")) topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out) @@ -803,10 +828,6 @@ class ModelOptNvFp4FusedMoE(MoEMethodBase): output = paddle.empty_like(x) # flashinfer cutlass - from flashinfer.fused_moe import ( - cutlass_fused_moe as flashinfer_cutlass_fused_moe, - ) - _ = flashinfer_cutlass_fused_moe( input=x, token_selected_experts=topk_ids.to(paddle.int), diff --git a/fastdeploy/model_executor/layers/quantization/quant_base.py b/fastdeploy/model_executor/layers/quantization/quant_base.py index aa7e065f48..b40ae30dfc 100644 --- a/fastdeploy/model_executor/layers/quantization/quant_base.py +++ b/fastdeploy/model_executor/layers/quantization/quant_base.py @@ -17,6 +17,22 @@ from abc import ABC, abstractmethod from typing import Any, Optional +# NVFP4 requires SM >= 100 (Blackwell architecture) +NVFP4_MIN_SM_VERSION = 100 + +from fastdeploy.platforms import current_platform + + +def is_nvfp4_supported() -> bool: + if current_platform.is_cuda(): + """Check if current GPU supports NVFP4 (requires SM >= 100, Blackwell).""" + from fastdeploy.model_executor.utils import get_sm_version + + sm_version = get_sm_version() + return sm_version >= NVFP4_MIN_SM_VERSION + else: + return False + class QuantMethodBase(ABC): """Base class for different quantized methods.""" diff --git a/tests/quantization/test_modelopt_nvfp4.py b/tests/quantization/test_modelopt_nvfp4.py index 3bf4653c72..ebf3b1de06 100644 --- a/tests/quantization/test_modelopt_nvfp4.py +++ b/tests/quantization/test_modelopt_nvfp4.py @@ -124,17 +124,52 @@ class TestModelOptNvFp4ModuleInit(unittest.TestCase): """Unit tests for nvfp4 module initialization under different environments.""" def test_module_import_without_flashinfer(self): - """Test module reloading when flashinfer is not available.""" - with mock.patch.dict(sys.modules, {"flashinfer": None}): + """Test module reloading when flashinfer is not available (non-Blackwell GPU).""" + # Mock is_nvfp4_supported at the source (quant_base) to return False + # This simulates H-card or non-CUDA platform + with mock.patch( + "fastdeploy.model_executor.layers.quantization.quant_base.is_nvfp4_supported", + return_value=False, + ): with mock.patch("paddleformers.utils.log.logger.warning"): + # Clear the module's flashinfer-related attributes before reload + # to simulate a fresh import on non-supported GPU + if hasattr(nvfp4_module, "fp4_quantize"): + delattr(nvfp4_module, "fp4_quantize") + if hasattr(nvfp4_module, "mm_fp4"): + delattr(nvfp4_module, "mm_fp4") + if hasattr(nvfp4_module, "flashinfer_cutlass_fused_moe"): + delattr(nvfp4_module, "flashinfer_cutlass_fused_moe") importlib.reload(nvfp4_module) + # Verify that flashinfer imports were skipped + self.assertIsNone(nvfp4_module.fp4_quantize) + self.assertIsNone(nvfp4_module.mm_fp4) def test_module_import_with_flashinfer(self): - """Test module reloading when flashinfer is available.""" + """Test module reloading when flashinfer is available (Blackwell GPU).""" + # Create mock flashinfer module with required functions mock_flashinfer = types.ModuleType("flashinfer") - with mock.patch.dict(sys.modules, {"flashinfer": mock_flashinfer}): - with mock.patch("paddle.compat.enable_torch_proxy"): - importlib.reload(nvfp4_module) + mock_flashinfer.fp4_quantize = mock.Mock() + mock_flashinfer.mm_fp4 = mock.Mock() + + mock_fused_moe = types.ModuleType("flashinfer.fused_moe") + mock_fused_moe.cutlass_fused_moe = mock.Mock() + mock_flashinfer.fused_moe = mock_fused_moe + + # Mock is_nvfp4_supported at the source (quant_base) to return True (simulating B-card) + with ( + mock.patch( + "fastdeploy.model_executor.layers.quantization.quant_base.is_nvfp4_supported", + return_value=True, + ), + mock.patch.dict(sys.modules, {"flashinfer": mock_flashinfer, "flashinfer.fused_moe": mock_fused_moe}), + mock.patch("paddle.compat.enable_torch_proxy"), + ): + importlib.reload(nvfp4_module) + + # Verify that flashinfer imports succeeded + self.assertIsNotNone(nvfp4_module.fp4_quantize) + self.assertIsNotNone(nvfp4_module.mm_fp4) class TestModelOptNvFp4ConfigValidation(unittest.TestCase): @@ -328,11 +363,15 @@ class TestModelOptNvFp4LinearMethod(unittest.TestCase): """Test the apply() method with flashinfer-cutlass backend for Linear layers.""" def fake_fp4_quantize(x, input_scale_inv): + # NVFP4 packs two 4-bit values into one uint8, so shape stays the same + # but the actual packed dimension is K//2 in terms of elements x_fp4 = paddle.zeros(x.shape, dtype=paddle.uint8) - x_scale_interleaved = paddle.zeros(x.shape, dtype=paddle.uint8) + # Scale shape should match the packed K dimension + x_scale_interleaved = paddle.zeros([x.shape[0], x.shape[1]], dtype=paddle.uint8) return x_fp4, x_scale_interleaved def fake_fp4_gemm(x_fp4, w, x_scale_interleaved, w_scale_interleaved, alpha, output_dtype, backend=None): + # Simply return zeros with correct output shape return paddle.zeros([x_fp4.shape[0], w.shape[1]], dtype=output_dtype) prev_flashinfer, prev_fused = _install_fake_flashinfer(fp4_quantize=fake_fp4_quantize, mm_fp4=fake_fp4_gemm) @@ -341,6 +380,9 @@ class TestModelOptNvFp4LinearMethod(unittest.TestCase): mock.patch.dict(os.environ, {"FD_MOE_BACKEND": "flashinfer-cutlass"}), mock.patch.object(nvfp4_module.paddle, "float8_e4m3fn", paddle.uint8), mock.patch.object(nvfp4_module, "free_tensor", side_effect=lambda _: None), + # Patch the module-level imports to use our fake functions + mock.patch.object(nvfp4_module, "fp4_quantize", fake_fp4_quantize), + mock.patch.object(nvfp4_module, "mm_fp4", fake_fp4_gemm), ): method = ModelOptNvFp4LinearMethod( ModelOptNvFp4Config(True, kv_cache_quant_algo=None, exclude_modules=[], group_size=16) @@ -352,7 +394,9 @@ class TestModelOptNvFp4LinearMethod(unittest.TestCase): layer.weight_scale_2.set_value(paddle.ones([1], dtype=paddle.float32)) layer.weight_scale.set_value(paddle.ones(layer.weight_scale.shape, dtype=paddle.uint8)) method.process_weights_after_loading(layer) - x = paddle.ones([2, layer.weight.shape[1]], dtype=paddle.float16) + # Input dimension should be K (original, not packed) + # layer.weight_shape[0] = K = 32 + x = paddle.ones([2, layer.weight_shape[0]], dtype=paddle.float16) out = method.apply(layer, x) self.assertEqual(list(out.shape), [2, layer.weight.shape[0]]) finally: @@ -380,6 +424,8 @@ class TestModelOptNvFp4LinearMethod(unittest.TestCase): mock.patch.dict(os.environ, {"FD_MOE_BACKEND": "flashinfer-cutlass"}), mock.patch.object(nvfp4_module.paddle, "float8_e4m3fn", paddle.float16), mock.patch.object(nvfp4_module, "free_tensor", side_effect=lambda _: None), + # Patch the module-level fp4_quantize for H-card (SM 90) where it's None + mock.patch.object(nvfp4_module, "fp4_quantize", fake_fp4_quantize), ): method = ModelOptNvFp4LinearMethod( ModelOptNvFp4Config(True, kv_cache_quant_algo=None, exclude_modules=[], group_size=16) @@ -392,7 +438,9 @@ class TestModelOptNvFp4LinearMethod(unittest.TestCase): method.process_weights_after_loading(layer) method.backend = "unsupported" with self.assertRaises(ValueError): - method.apply(layer, paddle.ones([2, layer.weight.shape[1]], dtype=paddle.float16)) + # Input dimension should be K (original, not packed) + x = paddle.ones([2, layer.weight_shape[0]], dtype=paddle.float16) + method.apply(layer, x) finally: # Restore original modules to avoid affecting other tests if prev_flashinfer is None: @@ -479,6 +527,8 @@ class TestModelOptNvFp4FusedMoE(unittest.TestCase): mock.patch.dict(os.environ, {"FD_MOE_BACKEND": "flashinfer-cutlass"}), mock.patch.object(nvfp4_module.paddle, "float8_e4m3fn", paddle.float16), mock.patch.object(nvfp4_module, "free_tensor", side_effect=lambda _: None), + # Patch the module-level import to use our fake function + mock.patch.object(nvfp4_module, "flashinfer_cutlass_fused_moe", fake_cutlass_fused_moe), ): method = ModelOptNvFp4FusedMoE( ModelOptNvFp4Config(True, kv_cache_quant_algo=None, exclude_modules=[], group_size=16)