[XPU] Support W4A8C8-TP4-300B Model (#4068)

* support w4a8

* delete ep block attn

* delete moe_topk_select

* update note

* update

* delete useless info

* update

* add some note

* fix some format

* update scale info

* add answer baseline

---------

Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
yinwei
2025-10-10 15:41:32 +08:00
committed by GitHub
parent c46d5e48f8
commit 20c7b741f4
21 changed files with 2029 additions and 714 deletions
@@ -16,7 +16,8 @@
from typing import Optional
from ..moe import FusedMoE
from fastdeploy.platforms import current_platform
from .quant_base import QuantConfigBase, QuantMethodBase
@@ -40,11 +41,17 @@ class W4A8Config(QuantConfigBase):
return cls(is_permuted, hadamard_block_size)
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
    """Return the platform-specific W4A8 quantization method for ``layer``.

    Args:
        layer: The layer to be quantized. Only ``FusedMoE`` layers are
            supported by the w4a8 scheme.

    Returns:
        A ``QuantMethodBase`` instance bound to this config: the CUTLASS
        backend on CUDA, or the XPU backend on XPU.

    Raises:
        ValueError: If ``layer`` is not a ``FusedMoE``, or if the current
            platform has no w4a8 MoE backend.
    """
    if isinstance(layer, FusedMoE):
        # Backend imports are deferred so that importing this module does
        # not pull in platform-specific extensions that may be absent.
        if current_platform.is_cuda():
            from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import (
                CutlassW4A8MoEMethod,
            )

            return CutlassW4A8MoEMethod(self)
        elif current_platform.is_xpu():
            from fastdeploy.model_executor.layers.backends.xpu.moe.fused_moe import (
                XPUW4A8MoEMethod,
            )

            return XPUW4A8MoEMethod(self)
        # Fail loudly here: previously an unsupported platform fell through
        # and implicitly returned None, which crashes later at an unrelated
        # call site with a confusing error.
        raise ValueError("w4a8 FusedMoE is only supported on CUDA and XPU platforms")
    else:
        raise ValueError(f"Unsupported layer type {type(layer)} for w4a8")