mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
fix hadamard_block_size (#5888)
This commit is contained in:
@@ -94,6 +94,8 @@ gemm_case = [
|
|||||||
[2560, 1536, 64, 0, 128],
|
[2560, 1536, 64, 0, 128],
|
||||||
[1536, 2560, 64, 0, 128],
|
[1536, 2560, 64, 0, 128],
|
||||||
[2560, 768, 64, 0, 128],
|
[2560, 768, 64, 0, 128],
|
||||||
|
[768, 2048, 128, 0, 128],
|
||||||
|
[2048, 384, 128, 0, 128],
|
||||||
]
|
]
|
||||||
|
|
||||||
dtype = ["BF16"]
|
dtype = ["BF16"]
|
||||||
|
|||||||
@@ -36,6 +36,19 @@ QUANTIZATION_METHODS: List[str] = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_hadamard_block_size(moe_intermediate_size: int, tp_size: int) -> int:
|
||||||
|
if moe_intermediate_size % tp_size != 0:
|
||||||
|
raise ValueError(
|
||||||
|
f"moe_intermediate_size ({moe_intermediate_size}) must be divisible by " f"tp_size ({tp_size})"
|
||||||
|
)
|
||||||
|
|
||||||
|
shard_size = moe_intermediate_size // tp_size
|
||||||
|
block_size = shard_size & (-shard_size)
|
||||||
|
block_size = min(block_size, 512)
|
||||||
|
|
||||||
|
return block_size
|
||||||
|
|
||||||
|
|
||||||
def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
|
def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
|
||||||
if args.quantization is not None and isinstance(args.quantization, str):
|
if args.quantization is not None and isinstance(args.quantization, str):
|
||||||
args.quantization = parse_quantization(args.quantization)
|
args.quantization = parse_quantization(args.quantization)
|
||||||
@@ -89,7 +102,12 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader):
|
|||||||
quantization_config["dense_quant_type"] = "block_wise_fp8"
|
quantization_config["dense_quant_type"] = "block_wise_fp8"
|
||||||
quantization_config["moe_quant_type"] = "w4afp8"
|
quantization_config["moe_quant_type"] = "w4afp8"
|
||||||
tp_size = getattr(args, "tensor_parallel_size", 1)
|
tp_size = getattr(args, "tensor_parallel_size", 1)
|
||||||
quantization_config["hadamard_block_size"] = 512 // tp_size
|
moe_intermediate_size = getattr(model_config, "moe_intermediate_size", None)
|
||||||
|
if moe_intermediate_size is not None:
|
||||||
|
hadamard_block_size = _compute_hadamard_block_size(moe_intermediate_size, tp_size)
|
||||||
|
quantization_config["hadamard_block_size"] = hadamard_block_size
|
||||||
|
else:
|
||||||
|
quantization_config["hadamard_block_size"] = 512
|
||||||
quantization_config["quantization"] = "mix_quant"
|
quantization_config["quantization"] = "mix_quant"
|
||||||
quant_config_name = "mix_quant"
|
quant_config_name = "mix_quant"
|
||||||
else:
|
else:
|
||||||
|
|||||||
+6
@@ -49,6 +49,12 @@ W4AFP8_CONFIGS = [
|
|||||||
"model_name": "ERNIE-4.5-21B-A3B-PT",
|
"model_name": "ERNIE-4.5-21B-A3B-PT",
|
||||||
"model_subdir": "torch",
|
"model_subdir": "torch",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"id": "w4afp8_default_v1",
|
||||||
|
"load_choices": "default_v1",
|
||||||
|
"model_name": "Qwen3-30B-A3B",
|
||||||
|
"model_subdir": "torch",
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user