[Intel HPU] enable MoE EP for hpu (#5855)

* enable HPU MoE EP

* MoE intermediate_scale stack

* enable loader_v1, especially for tensor_wise_fp8 TP or EP

* modify activation_scale name
This commit is contained in:
Cheng Yanfei
2026-01-15 13:08:00 +08:00
committed by GitHub
parent 7c56041272
commit fbcccaa750
9 changed files with 177 additions and 11 deletions
+10 -1
View File
@@ -468,7 +468,10 @@ def multi_switch_config_context(*changes):
def rename_offline_ckpt_suffix_to_fd_suffix(
fd_config, ckpt_weight_suffix: str = "quant_weight", ckpt_scale_suffix="weight_scale"
fd_config,
ckpt_weight_suffix: str = "quant_weight",
ckpt_scale_suffix="weight_scale",
ckpt_act_suffix="activation_scale",
):
"""
Create a function to rename checkpoint key suffixes for FastDeploy.
@@ -489,6 +492,10 @@ def rename_offline_ckpt_suffix_to_fd_suffix(
ckpt_weight_suffix: "weight",
ckpt_scale_suffix: "weight_scale_inv",
}
tensor_wise_fp8_suffix_map = {
ckpt_weight_suffix: "weight",
ckpt_act_suffix: "in_scale",
}
moe_quant_type = ""
dense_quant_type = ""
if fd_config.quant_config is not None:
@@ -505,6 +512,8 @@ def rename_offline_ckpt_suffix_to_fd_suffix(
# Can be extended to other offline quantization suffixes if needed.
if (is_moe and moe_quant_type == "block_wise_fp8") or (not is_moe and dense_quant_type == "block_wise_fp8"):
fd_suffix_map = fp8_suffix_map
if (is_moe and moe_quant_type == "tensor_wise_fp8") or (not is_moe and dense_quant_type == "tensor_wise_fp8"):
fd_suffix_map = tensor_wise_fp8_suffix_map
for ckpt_suffix, fd_suffix in fd_suffix_map.items():
if re.search(rf"{ckpt_suffix}$", loaded_weight_name):
loaded_weight_name = loaded_weight_name.replace(ckpt_suffix, fd_suffix)