mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-24 01:29:57 +08:00
[XPU] Support W4A8C8-TP4-300B Model (#4068)
* support w4a8 * delete ep block attn * delete moe_topk_select * update note * update * delete useless info * update * add some note * fix some format * update scale info * add ans baseline --------- Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
@@ -16,7 +16,8 @@
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..moe import FusedMoE
|
||||
from fastdeploy.platforms import current_platform
|
||||
|
||||
from .quant_base import QuantConfigBase, QuantMethodBase
|
||||
|
||||
|
||||
@@ -40,11 +41,17 @@ class W4A8Config(QuantConfigBase):
|
||||
return cls(is_permuted, hadamard_block_size)
|
||||
|
||||
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
    """Resolve the W4A8 quantization method implementation for ``layer``.

    Only ``FusedMoE`` layers are supported; the concrete backend is chosen
    from the current platform (CUDA or XPU). Backend modules are imported
    lazily so that importing this config never pulls in platform-specific
    kernels. Returns ``None`` when the platform is neither CUDA nor XPU
    (matching the ``Optional`` contract).

    Raises:
        ValueError: if ``layer`` is not a ``FusedMoE`` instance.
    """
    # Guard clause: anything other than a fused-MoE layer is unsupported.
    if not isinstance(layer, FusedMoE):
        raise ValueError(f"Unsupported layer type {type(layer)} for w4a8")

    if current_platform.is_cuda():
        # Deferred import: CUDA backend is only needed on CUDA platforms.
        from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import (
            CutlassW4A8MoEMethod,
        )

        return CutlassW4A8MoEMethod(self)

    if current_platform.is_xpu():
        # Deferred import: XPU backend is only needed on XPU platforms.
        from fastdeploy.model_executor.layers.backends.xpu.moe.fused_moe import (
            XPUW4A8MoEMethod,
        )

        return XPUW4A8MoEMethod(self)
|
||||
|
||||
Reference in New Issue
Block a user