From acdf0cd1d98495e6f1124c5ddf7c0e5e6e23f8d3 Mon Sep 17 00:00:00 2001
From: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Date: Tue, 6 Jan 2026 14:12:14 +0800
Subject: [PATCH] fix hadamard_block_size (#5888)

---
 .../utils/auto_gen_w4afp8_gemm_kernel.py      |  2 ++
 .../layers/quantization/__init__.py           | 20 ++++++++++++++++++-
 .../test_moe_w4afp8_online_quant.py}          |  6 ++++++
 3 files changed, 27 insertions(+), 1 deletion(-)
 rename tests/ci_use/{EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py => w4afp8/test_moe_w4afp8_online_quant.py} (98%)

diff --git a/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py b/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
index 194da2bdde..d325cdc5c8 100644
--- a/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
+++ b/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
@@ -94,6 +94,8 @@ gemm_case = [
     [2560, 1536, 64, 0, 128],
     [1536, 2560, 64, 0, 128],
     [2560, 768, 64, 0, 128],
+    [768, 2048, 128, 0, 128],
+    [2048, 384, 128, 0, 128],
 ]
 
 dtype = ["BF16"]
diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py
index 2f2421b3be..5d8ede0789 100644
--- a/fastdeploy/model_executor/layers/quantization/__init__.py
+++ b/fastdeploy/model_executor/layers/quantization/__init__.py
@@ -36,6 +36,19 @@ QUANTIZATION_METHODS: List[str] = [
 ]
 
 
+def _compute_hadamard_block_size(moe_intermediate_size: int, tp_size: int) -> int:
+    if moe_intermediate_size % tp_size != 0:
+        raise ValueError(
+            f"moe_intermediate_size ({moe_intermediate_size}) must be divisible by " f"tp_size ({tp_size})"
+        )
+
+    shard_size = moe_intermediate_size // tp_size
+    block_size = shard_size & (-shard_size)
+    block_size = min(block_size, 512)
+
+    return block_size
+
+
 def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
     if args.quantization is not None and isinstance(args.quantization, str):
         args.quantization = parse_quantization(args.quantization)
@@ -89,7 +102,12 @@ parse_quant_config(args, model_config, is_ernie, is_v1_loader):
             quantization_config["dense_quant_type"] = "block_wise_fp8"
             quantization_config["moe_quant_type"] = "w4afp8"
             tp_size = getattr(args, "tensor_parallel_size", 1)
-            quantization_config["hadamard_block_size"] = 512 // tp_size
+            moe_intermediate_size = getattr(model_config, "moe_intermediate_size", None)
+            if moe_intermediate_size is not None:
+                hadamard_block_size = _compute_hadamard_block_size(moe_intermediate_size, tp_size)
+                quantization_config["hadamard_block_size"] = hadamard_block_size
+            else:
+                quantization_config["hadamard_block_size"] = 512
             quantization_config["quantization"] = "mix_quant"
             quant_config_name = "mix_quant"
         else:
diff --git a/tests/ci_use/EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py b/tests/ci_use/w4afp8/test_moe_w4afp8_online_quant.py
similarity index 98%
rename from tests/ci_use/EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py
rename to tests/ci_use/w4afp8/test_moe_w4afp8_online_quant.py
index 5f3f5b693c..2c25970db3 100644
--- a/tests/ci_use/EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py
+++ b/tests/ci_use/w4afp8/test_moe_w4afp8_online_quant.py
@@ -49,6 +49,12 @@ W4AFP8_CONFIGS = [
         "model_name": "ERNIE-4.5-21B-A3B-PT",
         "model_subdir": "torch",
     },
+    {
+        "id": "w4afp8_default_v1",
+        "load_choices": "default_v1",
+        "model_name": "Qwen3-30B-A3B",
+        "model_subdir": "torch",
+    },
 ]