WINT4/WINT8 dense GEMM defaults to using Machete (#4451)

This commit is contained in:
Sunny-bot1
2025-10-23 17:57:59 +08:00
committed by GitHub
parent a240425db9
commit 4ffe41a747
12 changed files with 310 additions and 15 deletions
@@ -167,7 +167,7 @@ def machete_quantize_and_pack(
atype,
quant_type,
scale_type,
)[0]
)
return w_q_prepack, w_s
@@ -194,5 +194,5 @@ def machete_wint_mm(
out_dtype, # out_dtype
group_size, # group_size
scheduler, # scheduler
)[0]
)
return out
@@ -38,10 +38,18 @@ if current_platform.is_xpu():
else:
from paddle.nn.quant import weight_only_linear
from fastdeploy.model_executor.layers.quantization.ops.machete_mm import _ENABLE_MACHETE
from ..moe import FusedMoE
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
if _ENABLE_MACHETE:
from fastdeploy.model_executor.layers.quantization.ops import (
machete_quantize_and_pack,
machete_wint_mm,
)
class WeightOnlyConfig(QuantConfigBase):
"""
@@ -154,14 +162,11 @@ class WeightOnlyConfig(QuantConfigBase):
else:
raise ValueError(f"Unsupported MOE backend {layer.use_method}")
else:
from fastdeploy.model_executor.layers.quantization.ops.machete_mm import (
_ENABLE_MACHETE,
)
if (
_ENABLE_MACHETE
and envs.FD_USE_MACHETE == "1"
and not layer.is_quantized
and not layer.fd_config.load_config.dynamic_load_weight
and layer.weight_shape[1]
and layer.weight_shape[1] % 128 == 0
):
@@ -406,9 +411,6 @@ class MacheteWeightOnlyLinearMethod(WeightOnlyLinearMethod):
raise NotImplementedError("Machete kernel doesn't support prequant. Please set FD_USE_MACHETE to 0.")
def process_loaded_weights(self, layer, weight) -> None:
from fastdeploy.model_executor.layers.quantization.ops import (
machete_quantize_and_pack,
)
# Using group scale for machete, group size is 128
quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
@@ -421,7 +423,6 @@ class MacheteWeightOnlyLinearMethod(WeightOnlyLinearMethod):
layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))
def apply(self, layer, x):
from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm
# Using group scale for machete, group size is 128
linear_out = machete_wint_mm(