From acdf0cd1d98495e6f1124c5ddf7c0e5e6e23f8d3 Mon Sep 17 00:00:00 2001
From: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Date: Tue, 6 Jan 2026 14:12:14 +0800
Subject: [PATCH] fix hadamard_block_size (#5888)

---
 .../utils/auto_gen_w4afp8_gemm_kernel.py      |  2 ++
 .../layers/quantization/__init__.py           | 20 ++++++++++++++++++-
 .../test_moe_w4afp8_online_quant.py}          |  6 ++++++
 3 files changed, 27 insertions(+), 1 deletion(-)
 rename tests/ci_use/{EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py => w4afp8/test_moe_w4afp8_online_quant.py} (98%)

diff --git a/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py b/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
index 194da2bdde..d325cdc5c8 100644
--- a/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
+++ b/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
@@ -94,6 +94,8 @@ gemm_case = [
     [2560, 1536, 64, 0, 128],
     [1536, 2560, 64, 0, 128],
     [2560, 768, 64, 0, 128],
+    [768, 2048, 128, 0, 128],
+    [2048, 384, 128, 0, 128],
 ]
 
 dtype = ["BF16"]
diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py
index 2f2421b3be..5d8ede0789 100644
--- a/fastdeploy/model_executor/layers/quantization/__init__.py
+++ b/fastdeploy/model_executor/layers/quantization/__init__.py
@@ -36,6 +36,19 @@ QUANTIZATION_METHODS: List[str] = [
 ]
 
 
+def _compute_hadamard_block_size(moe_intermediate_size: int, tp_size: int) -> int:
+    if moe_intermediate_size % tp_size != 0:
+        raise ValueError(
+            f"moe_intermediate_size ({moe_intermediate_size}) must be divisible by " f"tp_size ({tp_size})"
+        )
+
+    shard_size = moe_intermediate_size // tp_size
+    block_size = shard_size & (-shard_size)
+    block_size = min(block_size, 512)
+
+    return block_size
+
+
 def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
     if args.quantization is not None and isinstance(args.quantization, str):
         args.quantization = parse_quantization(args.quantization)
@@ -89,7 +102,12 @@ parse_quant_config(args, model_config, is_ernie, is_v1_loader):
             quantization_config["dense_quant_type"] = "block_wise_fp8"
             quantization_config["moe_quant_type"] = "w4afp8"
             tp_size = getattr(args, "tensor_parallel_size", 1)
-            quantization_config["hadamard_block_size"] = 512 // tp_size
+            moe_intermediate_size = getattr(model_config, "moe_intermediate_size", None)
+            if moe_intermediate_size is not None:
+                hadamard_block_size = _compute_hadamard_block_size(moe_intermediate_size, tp_size)
+                quantization_config["hadamard_block_size"] = hadamard_block_size
+            else:
+                quantization_config["hadamard_block_size"] = 512
             quantization_config["quantization"] = "mix_quant"
             quant_config_name = "mix_quant"
         else:
diff --git a/tests/ci_use/EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py b/tests/ci_use/w4afp8/test_moe_w4afp8_online_quant.py
similarity index 98%
rename from tests/ci_use/EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py
rename to tests/ci_use/w4afp8/test_moe_w4afp8_online_quant.py
index 5f3f5b693c..2c25970db3 100644
--- a/tests/ci_use/EB_Lite_with_w4afp8/test_ernie_4_5_w4afp8.py
+++ b/tests/ci_use/w4afp8/test_moe_w4afp8_online_quant.py
@@ -49,6 +49,12 @@ W4AFP8_CONFIGS = [
         "model_name": "ERNIE-4.5-21B-A3B-PT",
         "model_subdir": "torch",
     },
+    {
+        "id": "w4afp8_default_v1",
+        "load_choices": "default_v1",
+        "model_name": "Qwen3-30B-A3B",
+        "model_subdir": "torch",
+    },
 ]