[Intel HPU] enable tensor_wise_fp8 (#5324)

* [Intel HPU] enable tensor_wise_fp8

* update code based on comments

* fix code style issue

* fix bug about PR 5138

* mv kv_cache modifications to HPU backend

* fix FP8 Precision Issues

* fix FP8 Precision Issues

* Add quantization UT

---------

Co-authored-by: yanfeich <yanfei.cheng@intel.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
fmiao2372
2025-12-17 16:45:03 +08:00
committed by GitHub
parent 15f5112ecb
commit 404cf0ece4
17 changed files with 824 additions and 116 deletions
@@ -18,9 +18,11 @@ intel_hpu backend methods
from .attention.hpu_attn_backend import HPUAttentionBackend
from .moe.fused_moe_hpu_backend import HpuMoEMethod, HpuTensorWiseFP8MoEMethod
from .quantization.tensor_wise_fp8 import HpuTensorWiseFP8LinearMethod
__all__ = [
"HPUAttentionBackend",
"HpuMoEMethod",
"HpuTensorWiseFP8MoEMethod",
"HpuTensorWiseFP8LinearMethod",
]
@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .hpu_attn_backend import HPUAttentionBackend
__all__ = [
"HPUAttentionBackend",
]
"""
intel_hpu attention
"""
@@ -236,6 +236,8 @@ class HPUAttentionBackend(AttentionBackend_HPU):
# pd_disaggregation
self.use_pd_disaggregation = int(os.getenv("FLAGS_use_pd_disaggregation", 0))
self.start_layer_index = llm_config.model_config.start_layer_index
if llm_config.quant_config:
self.quant_method = llm_config.quant_config.get_quant_method(self)
def init_attention_metadata(self, forward_meta):
"""Initialize attntion metadata hence all layers in the forward pass can reuse it."""
@@ -287,29 +289,47 @@ class HPUAttentionBackend(AttentionBackend_HPU):
from fastdeploy.model_executor.ops.intel_hpu import (
fused_qkv_rope,
fused_sdpa_proj_t,
fused_qkv_rope_ref,
fused_sdpa_proj,
fused_sdpa_proj_ref,
index_copy_,
)
query_states, key_value_states = fused_qkv_rope(
src,
qkv_proj.weight,
qkv_proj.bias,
forward_meta.rotary_embs,
getattr(qkv_proj, "act_scale", None),
getattr(qkv_proj, "weight_scale", None),
getattr(layer, "q_scale", None),
getattr(layer, "cache_k_scale", None),
getattr(layer, "cache_v_scale", None),
q_norm.weight if q_norm is not None else None,
k_norm.weight if k_norm is not None else None,
self.head_dim,
self.num_heads,
forward_meta.total_batch,
transpose=False,
use_neox_style=layer.use_neox_rotary_style,
epsilon=1e-6,
)
if forward_meta.measurement_mode:
qkv_proj_act_scale_key = qkv_proj.weight_key.replace("weight", "activation_scale")
query_states, key_value_states = fused_qkv_rope_ref(
src,
qkv_proj.weight,
qkv_proj.bias,
forward_meta.rotary_embs,
self.head_dim,
self.num_heads,
forward_meta.total_batch,
transpose=False,
use_neox_style=layer.use_neox_rotary_style,
measurement_mode=True,
qkv_act_scale_key=qkv_proj_act_scale_key,
)
else:
query_states, key_value_states = fused_qkv_rope(
src,
qkv_proj.weight,
qkv_proj.bias,
forward_meta.rotary_embs,
getattr(qkv_proj, "act_scale", None),
getattr(qkv_proj, "weight_scale", None),
getattr(layer, "q_scale", None),
getattr(layer, "cache_k_scale", None),
getattr(layer, "cache_v_scale", None),
q_norm.weight if q_norm is not None else None,
k_norm.weight if k_norm is not None else None,
self.head_dim,
self.num_heads,
forward_meta.total_batch,
transpose=False,
use_neox_style=layer.use_neox_rotary_style,
epsilon=1e-6,
)
kv, B, BP_BS, M, H = key_value_states.shape
key_value_states_reshape = key_value_states.reshape([kv, -1, forward_meta.block_size, M, H])
@@ -321,16 +341,38 @@ class HPUAttentionBackend(AttentionBackend_HPU):
index_copy_(v_cache, forward_meta.block_indices, value_states, 0)
if forward_meta.block_list.shape == forward_meta.block_indices.shape:
out_linear_out = fused_sdpa_proj_t(
query_states,
key_value_states,
forward_meta.attn_mask,
None,
o_proj.weight,
scaling_factor=self.head_dim**-0.5,
causal=True,
softmax_mode=0,
)
if forward_meta.measurement_mode:
o_proj_act_scale_key = o_proj.weight_key.replace("weight", "activation_scale")
out_linear_out = fused_sdpa_proj_ref(
query_states,
key_value_states,
forward_meta.attn_mask,
o_proj.weight,
scaling_factor=self.head_dim**-0.5,
causal=True,
softmax_mode=0,
measurement_mode=True,
o_act_scale_key=o_proj_act_scale_key,
)
else:
out_linear_out = fused_sdpa_proj(
query_states,
key_value_states,
forward_meta.attn_mask,
None,
o_proj.weight,
getattr(layer, "q_out_scale", None),
getattr(layer, "cache_k_out_scale", None),
getattr(layer, "cache_v_out_scale", None),
getattr(layer, "s_scale", None),
getattr(o_proj, "act_scale", None),
getattr(layer, "s_out_scale", None),
getattr(o_proj, "act_scale_inv", None),
getattr(o_proj, "weight_scale", None),
scaling_factor=self.head_dim**-0.5,
causal=True,
softmax_mode=0,
)
else:
key_states_with_context = k_cache.index_select(forward_meta.block_list)
val_states_with_context = v_cache.index_select(forward_meta.block_list)
@@ -344,12 +386,20 @@ class HPUAttentionBackend(AttentionBackend_HPU):
query_states.shape[0],
query_states.shape[1],
)
out_linear_out = fused_sdpa_proj_t(
out_linear_out = fused_sdpa_proj(
query_states,
key_value_states_with_context,
forward_meta.attn_mask,
None,
o_proj.weight,
getattr(layer, "q_out_scale", None),
getattr(layer, "cache_k_out_scale", None),
getattr(layer, "cache_v_out_scale", None),
getattr(layer, "s_scale", None),
getattr(o_proj, "act_scale", None),
getattr(layer, "s_out_scale", None),
getattr(o_proj, "act_scale_inv", None),
getattr(o_proj, "weight_scale", None),
scaling_factor=self.head_dim**-0.5,
causal=False,
softmax_mode=0,
@@ -378,45 +428,75 @@ class HPUAttentionBackend(AttentionBackend_HPU):
forward_decode
"""
# metadata = self.attention_metadata
from fastdeploy.model_executor.ops.intel_hpu import fused_block_attention
res = fused_block_attention(
src,
forward_meta.rotary_embs,
forward_meta.caches[2 * layer.layer_id],
forward_meta.caches[2 * layer.layer_id + 1],
forward_meta.block_groups,
forward_meta.block_list,
forward_meta.block_mapping,
forward_meta.attention_mask,
forward_meta.block_indices,
forward_meta.block_offsets,
qkv_proj.weight,
qkv_proj.bias,
o_proj.weight,
q_norm.weight if q_norm is not None else None,
k_norm.weight if k_norm is not None else None,
getattr(qkv_proj, "act_scale", None),
getattr(qkv_proj, "weight_scale", None),
getattr(layer, "q_scaling_scale", None),
getattr(layer, "cache_k_scale", None),
getattr(layer, "s_scale", None),
getattr(layer, "cache_v_scale", None),
getattr(o_proj, "act_scale", None),
getattr(o_proj, "weight_scale", None),
self.head_dim,
self.num_heads,
scaling_factor=self.head_dim**-0.5,
transpose=False,
use_neox_style=layer.use_neox_rotary_style,
epsilon=1e-6,
from fastdeploy.model_executor.ops.intel_hpu import (
fused_block_attention,
fused_block_attention_ref,
)
if forward_meta.measurement_mode:
qkv_proj_act_scale_key = qkv_proj.weight_key.replace("weight", "activation_scale")
o_proj_act_scale_key = o_proj.weight_key.replace("weight", "activation_scale")
out_linear_out = fused_block_attention_ref(
src,
forward_meta.rotary_embs,
forward_meta.caches[2 * layer.layer_id],
forward_meta.caches[2 * layer.layer_id + 1],
forward_meta.block_groups,
forward_meta.block_list,
forward_meta.block_mapping,
forward_meta.attention_mask,
forward_meta.block_indices,
forward_meta.block_offsets,
qkv_proj.weight,
qkv_proj.bias,
o_proj.weight,
self.head_dim,
self.num_heads,
scaling_factor=self.head_dim**-0.5,
transpose=False,
use_neox_style=layer.use_neox_rotary_style,
measurement_mode=True,
qkv_act_scale_key=qkv_proj_act_scale_key,
o_act_scale_key=o_proj_act_scale_key,
)
else:
out_linear_out = fused_block_attention(
src,
forward_meta.rotary_embs,
forward_meta.caches[2 * layer.layer_id],
forward_meta.caches[2 * layer.layer_id + 1],
forward_meta.block_groups,
forward_meta.block_list,
forward_meta.block_mapping,
forward_meta.attention_mask,
forward_meta.block_indices,
forward_meta.block_offsets,
qkv_proj.weight,
qkv_proj.bias,
o_proj.weight,
q_norm.weight if q_norm is not None else None,
k_norm.weight if k_norm is not None else None,
getattr(qkv_proj, "act_scale", None),
getattr(qkv_proj, "weight_scale", None),
getattr(layer, "q_scaling_scale", None),
getattr(layer, "cache_k_scale", None),
getattr(layer, "s_scale", None),
getattr(layer, "cache_v_scale", None),
getattr(o_proj, "act_scale", None),
getattr(o_proj, "weight_scale", None),
self.head_dim,
self.num_heads,
scaling_factor=self.head_dim**-0.5,
transpose=False,
use_neox_style=layer.use_neox_rotary_style,
epsilon=1e-6,
)
# all_reduce
if self.tp_size > 1:
from fastdeploy.distributed.communication import (
tensor_model_parallel_all_reduce_custom,
)
tensor_model_parallel_all_reduce_custom(res)
return res
tensor_model_parallel_all_reduce_custom(out_linear_out)
return out_linear_out
@@ -11,6 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" "
"""
intel_hpu moe
"""
@@ -23,6 +23,7 @@ from fastdeploy import envs
from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import (
UnquantizedFusedMoEMethod,
)
from fastdeploy.model_executor.layers.utils import get_tensor
class HpuMoEMethod(UnquantizedFusedMoEMethod):
@@ -41,10 +42,17 @@ class HpuMoEMethod(UnquantizedFusedMoEMethod):
stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0)
stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0)
layer.up_gate_proj_weight.set_value(stacked_up_gate_proj_weights)
layer.down_proj_weight.set_value(stacked_down_proj_weights)
# for measurement mode
up_gate_proj_expert_weight_key = layer.weight_key_map.get("up_gate_proj_expert_weight_key", None)
down_proj_expert_weight_key = layer.weight_key_map.get("down_proj_expert_weight_key", None)
self.up_gate_proj_act_scale_key = up_gate_proj_expert_weight_key.replace("{}.", "").replace(
"weight", "activation_scale"
)
self.down_proj_expert_act_scale_key = down_proj_expert_weight_key.replace("weight", "activation_scale")
def apply_ep_prefill(
self,
layer: nn.Layer,
@@ -84,22 +92,44 @@ class HpuMoEMethod(UnquantizedFusedMoEMethod):
# norm_topk_prob = False if layer.topk_method == "noaux_tc" else True
chunk_size = envs.FD_HPU_CHUNK_SIZE
from fastdeploy.model_executor.ops.intel_hpu import fused_gate_moe
measurement_mode = getattr(layer, "measurement_mode", False)
if measurement_mode:
from fastdeploy.model_executor.ops.intel_hpu import fused_gate_moe_ref
fused_moe_out = fused_gate_moe(
x,
gate.weight,
layer.gate_correction_bias,
layer.up_gate_proj_weight,
layer.down_proj_weight,
layer.top_k,
norm_topk_prob=True,
permuted_weights=False,
activation="silu",
experts_min=layer.expert_id_offset,
experts_max=layer.expert_id_offset + layer.num_local_experts - 1,
chunk_size=chunk_size,
)
fused_moe_out = fused_gate_moe_ref(
x,
gate.weight,
layer.gate_correction_bias,
layer.up_gate_proj_weight,
layer.down_proj_weight,
layer.top_k,
norm_topk_prob=True,
permuted_weights=False,
activation="silu",
experts_min=layer.expert_id_offset,
experts_max=layer.expert_id_offset + layer.num_local_experts - 1,
chunk_size=chunk_size,
measurement_mode=True,
up_gate_act_scale_key=self.up_gate_proj_act_scale_key,
down_act_scale_key=self.down_proj_expert_act_scale_key,
)
else:
from fastdeploy.model_executor.ops.intel_hpu import fused_gate_moe
fused_moe_out = fused_gate_moe(
x,
gate.weight,
layer.gate_correction_bias,
layer.up_gate_proj_weight,
layer.down_proj_weight,
layer.top_k,
norm_topk_prob=True,
permuted_weights=False,
activation="silu",
experts_min=layer.expert_id_offset,
experts_max=layer.expert_id_offset + layer.num_local_experts - 1,
chunk_size=chunk_size,
)
return fused_moe_out
@@ -110,9 +140,133 @@ class HpuTensorWiseFP8MoEMethod(HpuMoEMethod):
This method is the oldest way to compute MoE in Paddle.
"""
def process_prequanted_weights(self, layer: nn.Layer, state_dict, is_rearrange: bool = False):
    """
    Paddle HPU process prequanted weights.

    Pops the pre-quantized FP8 expert weights and their scales out of
    ``state_dict`` and installs them on ``layer``.  Weight scales stay
    per-expert (stacked on axis 0); the up_gate input scale is collapsed
    to a single shared descale tensor (see ``_extract_descale_tensor``).
    """

    def _extract_scale_tensor(key_template, logical_expert_ids):
        # One scale tensor per logical expert, in the given order.
        # NOTE: entries are popped (consumed) from state_dict.
        result = []
        for i in logical_expert_ids:
            result.append(get_tensor(state_dict.pop(key_template.format(i))))
        return result  # bf16 tensor list

    def _extract_descale_tensor(key_template, logical_expert_ids):
        # Returns the reciprocal ("descale") of the stored scale(s),
        # cast to the default dtype.
        if key_template.format(0) in state_dict:
            # Per-expert scales present in the checkpoint.
            # Extract scale tensors for all logical_expert_ids
            scale_tensors = []
            for i in logical_expert_ids:
                scale_tensor = get_tensor(state_dict.pop(key_template.format(i)))
                scale_tensors.append(scale_tensor)
            # Stack all scale tensors into one tensor
            stacked = paddle.stack(scale_tensors)
            reciprocal = 1.0 / stacked
            # Take min over all logical_expert_ids (axis=0)
            # (one shared descale that is conservative for every expert)
            min_tensor = paddle.min(reciprocal, axis=0)
            return min_tensor.cast(paddle.get_default_dtype())
        else:
            # Single shared scale: the "{}." expert placeholder is not in
            # the checkpoint key, so strip it and pop the plain key.
            key = key_template.replace("{}.", "")
            scale_tensor = get_tensor(state_dict.pop(key))
            reciprocal = 1.0 / scale_tensor
            return reciprocal.cast(paddle.get_default_dtype())

    up_gate_proj_weight, down_proj_weight, logical_expert_ids, _ = layer.extract_moe_ffn_weights(state_dict)
    # Reinterpret the raw checkpoint tensors as FP8 without copying data.
    up_gate_proj_weights = [t.view(paddle.float8_e4m3fn) for t in up_gate_proj_weight]
    down_proj_weights = [t.view(paddle.float8_e4m3fn) for t in down_proj_weight]

    weight_key_map = layer.weight_key_map
    up_gate_proj_expert_weight_scale_key = weight_key_map.get("up_gate_proj_expert_weight_scale_key", None)
    down_proj_expert_weight_scale_key = weight_key_map.get("down_proj_expert_weight_scale_key", None)
    up_gate_proj_expert_in_scale_key = weight_key_map.get("up_gate_proj_expert_in_scale_key", None)
    down_proj_expert_in_scale_key = weight_key_map.get("down_proj_expert_in_scale_key", None)

    up_gate_proj_weight_scale = _extract_scale_tensor(up_gate_proj_expert_weight_scale_key, logical_expert_ids)
    down_proj_weight_scale = _extract_scale_tensor(down_proj_expert_weight_scale_key, logical_expert_ids)
    up_gate_proj_in_scale = _extract_descale_tensor(up_gate_proj_expert_in_scale_key, logical_expert_ids)
    # NOTE(review): down_proj_in_scale uses _extract_scale_tensor (a
    # per-expert list, no reciprocal) while up_gate uses
    # _extract_descale_tensor — confirm this asymmetry is intentional.
    down_proj_in_scale = _extract_scale_tensor(down_proj_expert_in_scale_key, logical_expert_ids)

    up_gate_proj_weight = paddle.stack(up_gate_proj_weights, axis=0)
    down_proj_weight = paddle.stack(down_proj_weights, axis=0)
    up_gate_proj_weight_scale = paddle.stack(up_gate_proj_weight_scale, axis=0)
    down_proj_weight_scale = paddle.stack(down_proj_weight_scale, axis=0)

    name_tensor_map = {
        "up_gate_proj_weight": up_gate_proj_weight,
        "down_proj_weight": down_proj_weight,
        "up_gate_proj_weight_scale": up_gate_proj_weight_scale,
        "down_proj_weight_scale": down_proj_weight_scale,
        "up_gate_proj_in_scale": up_gate_proj_in_scale,
    }
    for name, tensor in name_tensor_map.items():
        # Parameters were created beforehand; copy values in place.
        getattr(layer, name).set_value(tensor)
    # No pre-created parameter for this one; attach the raw list directly.
    setattr(layer, "down_proj_in_scale", down_proj_in_scale)
def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
    # TODO: split create_parameter from process_loaded_weights
    # NOTE(review): the early return below deliberately short-circuits this
    # method — everything after it is unreachable until the TODO above is
    # resolved.  Returning the ``NotImplemented`` constant (rather than
    # raising NotImplementedError) means callers silently get no parameters;
    # confirm they do not depend on the weights existing.
    return NotImplemented
    """
    Paddle HPU create weight process.
    """
    # --- unreachable below this point (kept for the planned split) ---
    self.weight_dtype = "float8_e4m3fn"
    self.up_gate_proj_weight_shape = [
        layer.num_local_experts,
        layer.hidden_size,
        layer.moe_intermediate_size * 2,
    ]
    self.down_proj_weight_shape = [
        layer.num_local_experts,
        layer.moe_intermediate_size,
        layer.hidden_size,
    ]
    setattr(
        layer,
        self.added_weight_attrs[0],
        layer.create_parameter(
            shape=self.up_gate_proj_weight_shape,
            dtype=self.weight_dtype,
            default_initializer=paddle.nn.initializer.Constant(0),
        ),
    )
    setattr(
        layer,
        self.added_weight_attrs[1],
        layer.create_parameter(
            shape=self.down_proj_weight_shape,
            dtype=self.weight_dtype,
            default_initializer=paddle.nn.initializer.Constant(0),
        ),
    )
    self.default_dtype = layer._helper.get_default_dtype()
    # in_scales
    setattr(
        layer,
        "up_gate_proj_in_scale",
        layer.create_parameter(
            shape=[1],
            dtype=self.default_dtype,
            default_initializer=paddle.nn.initializer.Constant(0),
        ),
    )
    # weight_scales
    setattr(
        layer,
        "up_gate_proj_weight_scale",
        layer.create_parameter(
            shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
            dtype=self.default_dtype,
            default_initializer=paddle.nn.initializer.Constant(0),
        ),
    )
    setattr(
        layer,
        "down_proj_weight_scale",
        layer.create_parameter(
            shape=[layer.num_local_experts, layer.hidden_size],
            dtype=self.default_dtype,
            default_initializer=paddle.nn.initializer.Constant(0),
        ),
    )
def process_loaded_weights(self, layer: nn.Layer, state_dict):
"""
@@ -0,0 +1,16 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
hpu quantization methods
"""
@@ -0,0 +1,237 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from enum import Enum
from typing import Optional
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.quantization.quant_base import (
QuantConfigBase,
QuantMethodBase,
)
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.model_executor.utils import set_weight_attrs
class KvCacheQuantzationTypes(str, Enum):
    """
    KvCacheQuantzationTypes

    FP8 KV-cache formats supported on HPU.  (The "Quantzation" spelling is
    kept for consistency with the GPU-side enum of the same name.)
    """

    # Full-range e4m3fn; max_bound 448.0 (see HPUKvCacheQuantConfig).
    FP8 = "float8_e4m3fn"
    # Reduced-range e4m3; max_bound 240.0 (see HPUKvCacheQuantConfig).
    FP8_E4M3 = "float8_e4m3"
class HPUKvCacheQuantConfig(QuantConfigBase):
    """
    Quantization config for the HPU FP8 KV cache.

    Args:
        kv_cache_quant_type: one of the ``KvCacheQuantzationTypes`` values
            (``"float8_e4m3fn"`` or ``"float8_e4m3"``).
        is_channel_wise: whether scales are per-channel.
        has_zero_point: whether an asymmetric zero point is used.

    Raises:
        ValueError: if ``kv_cache_quant_type`` is not a supported type.
    """

    def __init__(self, kv_cache_quant_type: str, is_channel_wise: bool, has_zero_point: bool) -> None:
        """
        __init__
        """
        super().__init__()
        self.kv_cache_quant_type = kv_cache_quant_type
        # Keep these knobs accessible; previously they were accepted but
        # silently dropped.
        self.is_channel_wise = is_channel_wise
        self.has_zero_point = has_zero_point
        try:
            self.quant_type = KvCacheQuantzationTypes(kv_cache_quant_type)
        except ValueError as e:
            # Chain the original enum lookup error for easier debugging.
            raise ValueError(f"Invalid Kvcache type: {kv_cache_quant_type}") from e

        # max_bound is the largest magnitude representable in the FP8 format.
        if self.quant_type == KvCacheQuantzationTypes.FP8_E4M3:
            self.max_bound = 240.0
        elif self.quant_type == KvCacheQuantzationTypes.FP8:
            self.max_bound = 448.0
        else:
            # Defensive guard: unreachable while the enum has exactly two members.
            raise ValueError(f"Invalid Kvcache type: {kv_cache_quant_type}")

    def name(self) -> str:
        """
        get_name
        """
        return "kvcache"

    @classmethod
    def from_config(
        cls, kv_cache_quant_type: str, is_channel_wise: bool, has_zero_point: bool
    ) -> "HPUKvCacheQuantConfig":
        """
        from_config: factory hook mirroring ``__init__``.
        """
        return cls(kv_cache_quant_type, is_channel_wise, has_zero_point)

    def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
        """
        get_quant_method: return the KV-cache method bound to this config.
        """
        return HPUKVCacheMethodBase(self)
class HPUKVCacheMethodBase(QuantMethodBase):
    """
    HPUKVCacheMethodBase: HPU need scale in fp32 format but GPU define all scale in bf16 format

    Creates the per-layer FP8 scale parameters (``create_weights``) and fills
    them from measured activation scales (``load_scale``).  For each measured
    amax ``s``, the forward factor is ``max_bound / s`` (quantize into FP8
    range) and the ``*_out_scale`` factor is its reciprocal ``s / max_bound``.
    """

    def __init__(
        self,
        quant_config: HPUKvCacheQuantConfig,
    ) -> None:
        """
        HPUKVCacheMethodBase __init__

        Args:
            quant_config: supplies the FP8 format and its ``max_bound``.
        """
        super().__init__()
        self.cache_quant_config = quant_config

    def load_scale(self, layer: nn.Layer, state_dict):
        """
        load_scale

        Pops the measured activation scales for q/k/v/softmax from
        ``state_dict`` and writes derived quant/dequant factors into the
        parameters created by ``create_weights``.  Requires the
        ``*_scale_name`` attributes set by ``process_loaded_weights``.
        """
        # Measured per-tensor amax values, flattened to 1-D fp32.
        cache_k_scale_tensor = get_tensor(state_dict.pop(self.cache_k_scale_name)).cast("float32").reshape_([-1])
        cache_v_scale_tensor = get_tensor(state_dict.pop(self.cache_v_scale_name)).cast("float32").reshape_([-1])
        q_scale_tensor = get_tensor(state_dict.pop(self.q_scale_name)).cast("float32").reshape_([-1])
        s_scale_tensor = get_tensor(state_dict.pop(self.s_scale_name)).cast("float32").reshape_([-1])

        # Forward (quant) factors and their reciprocals (dequant factors).
        cache_k_scale = self.cache_quant_config.max_bound / cache_k_scale_tensor
        cache_v_scale = self.cache_quant_config.max_bound / cache_v_scale_tensor
        cache_k_out_scale = cache_k_scale_tensor / self.cache_quant_config.max_bound
        cache_v_out_scale = cache_v_scale_tensor / self.cache_quant_config.max_bound
        q_scale = self.cache_quant_config.max_bound / q_scale_tensor
        q_out_scale = q_scale_tensor / self.cache_quant_config.max_bound
        s_scale = self.cache_quant_config.max_bound / s_scale_tensor
        s_out_scale = s_scale_tensor / self.cache_quant_config.max_bound

        # Fold the attention scaling factor (head_dim ** -0.5) into q's scale
        # so the kernel can skip a separate multiply.
        scaling_factor = layer.head_dim**-0.5
        q_scaling_scale = self.cache_quant_config.max_bound / (q_scale_tensor * scaling_factor)
        q_scaling_out_scale = (q_scale_tensor * scaling_factor) / self.cache_quant_config.max_bound

        layer.cache_k_scale.set_value(cache_k_scale)
        layer.cache_v_scale.set_value(cache_v_scale)
        layer.cache_k_out_scale.set_value(cache_k_out_scale)
        layer.cache_v_out_scale.set_value(cache_v_out_scale)
        layer.q_scale.set_value(q_scale)
        layer.q_out_scale.set_value(q_out_scale)
        layer.q_scaling_scale.set_value(q_scaling_scale)
        layer.q_scaling_out_scale.set_value(q_scaling_out_scale)
        layer.s_scale.set_value(s_scale)
        layer.s_out_scale.set_value(s_out_scale)

    def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
        """
        create_weights

        Creates one fp32 scalar parameter per scale on ``layer`` (initialized
        to 0; real values are filled in by ``load_scale`` or by
        ``process_weights_after_loading``).

        Raises:
            NotImplementedError: for any quant type other than FP8_E4M3.
        """
        if self.cache_quant_config.quant_type == KvCacheQuantzationTypes.FP8_E4M3:
            layer.cache_quant_type_str = "cache_fp8_sdpa_fp8"
            layer.quant_max_bound = 240.0
            layer.quant_min_bound = -240.0
        else:
            raise NotImplementedError(f"{self.cache_quant_config.quant_type} is not implemented")
        # All scales are per-tensor scalars.
        scale_shape = [1]
        layer.cache_k_scale = layer.create_parameter(
            shape=scale_shape,
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.cache_v_scale = layer.create_parameter(
            shape=scale_shape,
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        # Only the k/v cache scales receive the loader weight attributes.
        set_weight_attrs(
            layer.cache_k_scale,
            {
                **extra_weight_attrs,
            },
        )
        set_weight_attrs(
            layer.cache_v_scale,
            {
                **extra_weight_attrs,
            },
        )
        layer.cache_k_out_scale = layer.create_parameter(
            shape=scale_shape,
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.cache_v_out_scale = layer.create_parameter(
            shape=scale_shape,
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.q_scale = layer.create_parameter(
            shape=scale_shape,
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.q_out_scale = layer.create_parameter(
            shape=scale_shape,
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.q_scaling_scale = layer.create_parameter(
            shape=scale_shape,
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.q_scaling_out_scale = layer.create_parameter(
            shape=scale_shape,
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.s_scale = layer.create_parameter(
            shape=scale_shape,
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.s_out_scale = layer.create_parameter(
            shape=scale_shape,
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )

    def process_loaded_weights(self, layer: nn.Layer, state_dict):
        """
        use for loader v0

        Derives the checkpoint key names from ``layer.prefix`` and delegates
        to ``load_scale``.
        """
        self.prefix = layer.prefix
        self.cache_k_scale_name = layer.prefix + ".cachek_matmul.activation_scale"
        self.cache_v_scale_name = layer.prefix + ".cachev_matmul.activation_scale"
        self.q_scale_name = layer.prefix + ".q_matmul.activation_scale"
        self.s_scale_name = layer.prefix + ".s_matmul.activation_scale"
        self.load_scale(layer, state_dict)

    def process_weights_after_loading(self, layer: nn.Layer):
        """
        use for loader v1

        Only derives the k/v out scales here; the q/s scales are expected to
        have been populated by the loader itself.
        """
        # cache_k_out_scale is the reciprocal of cache_k_scale
        if layer.cache_k_scale._is_initialized():
            layer.cache_k_out_scale.set_value(1 / layer.cache_k_scale)  # cache_k_out_scale
        if layer.cache_v_scale._is_initialized():
            layer.cache_v_out_scale.set_value(1 / layer.cache_v_scale)

    def apply(self, layer):
        """
        apply

        This method only manages scales; the fused attention kernels consume
        them directly, so calling ``apply`` is a programming error.
        """
        raise RuntimeError(f"{self.__class__.__name__}.apply should not be called.")
@@ -0,0 +1,101 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
from paddle import nn
from fastdeploy.model_executor.layers.quantization.tensor_wise_fp8 import (
TensorWiseFP8Config,
TensorWiseFP8LinearMethod,
)
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.model_executor.ops.intel_hpu import fused_quant
class HpuTensorWiseFP8LinearMethod(TensorWiseFP8LinearMethod):
    """
    Tensor wise fp8 quantization method for linear layer on HPU
    """

    def __init__(
        self,
        quant_config: TensorWiseFP8Config,
    ) -> None:
        super().__init__(quant_config)
        # HPU FP8 (e4m3) max representable magnitude used for activation scaling.
        self.max_bound = 240.0

    def process_prequanted_weights(self, layer, state_dict, is_rearrange: bool = False) -> None:
        """
        Process pre-quantized weights before applying them to the model.

        Args:
            layer: The layer that owns the weights; its weight / weight_scale /
                act_scale / act_scale_inv parameters are filled in place.
            state_dict: Checkpoint tensors; the weight, weight-scale and
                activation-scale entries for this layer are popped from it.
            is_rearrange: Unused here; kept for interface compatibility.
        """
        quant_weight = get_tensor(state_dict.pop(layer.weight_key))
        weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key))
        act_scale = get_tensor(state_dict.pop(layer.act_scale_key))
        # these activation_scale will fall in, but only quant for self_attn
        # mlp.shared_experts.up_gate_proj / down_proj
        # self_attn.qkv_proj / o_proj
        if "self_attn" in layer.act_scale_key:
            # Attention projections: fold max_bound into the activation scale.
            act_scale_inv = act_scale / self.max_bound
            act_scale = self.max_bound / act_scale
        else:
            # MLP projections: plain reciprocal, no max_bound folding.
            act_scale_inv = act_scale
            act_scale = 1.0 / act_scale

        # Reinterpret the raw tensor as FP8 and copy in place (no blocking).
        layer.weight.copy_(quant_weight.view("float8_e4m3fn"), False)
        layer.weight_scale.set_value(weight_scale.astype(paddle.get_default_dtype()))
        layer.act_scale.set_value(act_scale.astype(paddle.get_default_dtype()))
        layer.act_scale_inv.set_value(act_scale_inv.astype(paddle.get_default_dtype()))

    def create_weights(self, layer: nn.Layer, **extra_weight_attrs) -> None:
        """
        Create weights for linear layer on HPU

        The FP8 weight is zero-initialized; the three scalar scales are
        created in bf16 and filled later by ``process_prequanted_weights``
        or ``process_loaded_weights``.
        """
        layer.weight_dtype = "float8_e4m3fn"
        layer.weight = layer.create_parameter(
            shape=layer.weight_shape,
            dtype=layer.weight_dtype,
            is_bias=False,
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        layer.weight_scale = layer.create_parameter(
            shape=[1],
            dtype="bfloat16",
            is_bias=False,
        )
        layer.act_scale = layer.create_parameter(
            shape=[1],
            dtype="bfloat16",
            is_bias=False,
        )
        layer.act_scale_inv = layer.create_parameter(
            shape=[1],
            dtype="bfloat16",
            is_bias=False,
        )

    def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None:
        """
        loaded_weights using HPU specific quantization

        Quantizes a bf16/fp32 weight on device with the fused HPU kernel and
        stores the FP8 weight plus its scale.
        """
        quanted_weight_tensor, weight_scale_tensor = fused_quant(weight)
        layer.weight.set_value(quanted_weight_tensor)
        layer.weight_scale.set_value(weight_scale_tensor)
@@ -64,7 +64,6 @@ def get_moe_method():
from fastdeploy.model_executor.layers.backends import HpuMoEMethod
return HpuMoEMethod(None)
# return HpuTensorWiseFP8MoEMethod(None)
elif current_platform.is_maca():
from fastdeploy.model_executor.layers.backends import (
@@ -34,6 +34,7 @@ class KvCacheQuantzationTypes(str, Enum):
INT8 = "int8"
FP8 = "float8_e4m3fn"
FP8_E4M3 = "float8_e4m3"
BLOCK_WISE_FP8 = "block_wise_fp8"
INT8_ZP = "int8_zp"
INT4_ZP = "int4_zp"
@@ -65,6 +66,8 @@ class KvCacheQuantConfig(QuantConfigBase):
if self.quant_type == KvCacheQuantzationTypes.INT8 or self.quant_type == KvCacheQuantzationTypes.INT8_ZP:
self.max_bound = 127.0
self.is_channel_wise = True
elif self.quant_type == KvCacheQuantzationTypes.FP8_E4M3:
self.max_bound = 240.0
elif (
self.quant_type == KvCacheQuantzationTypes.FP8
or self.quant_type == KvCacheQuantzationTypes.FP8_ZP
@@ -101,6 +104,12 @@ class KvCacheQuantConfig(QuantConfigBase):
)
return XPUKVCacheMethodBase(self)
elif current_platform.is_intel_hpu():
from fastdeploy.model_executor.layers.backends.intel_hpu.quantization.kv_cache import (
HPUKVCacheMethodBase,
)
return HPUKVCacheMethodBase(self)
else:
return KVCacheMethodBase(self)
@@ -19,6 +19,7 @@ from typing import Optional
import paddle
from fastdeploy.model_executor.layers.moe import FusedMoE
from fastdeploy.platforms import current_platform
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
@@ -52,14 +53,28 @@ class TensorWiseFP8Config(QuantConfigBase):
"""
return method according to this config!
"""
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
TensorWiseFP8MoEMethod,
)
if current_platform.is_intel_hpu():
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.backends import (
HpuTensorWiseFP8MoEMethod,
)
return TensorWiseFP8MoEMethod(self)
return HpuTensorWiseFP8MoEMethod(self)
else:
from fastdeploy.model_executor.layers.backends import (
HpuTensorWiseFP8LinearMethod,
)
return HpuTensorWiseFP8LinearMethod(self)
else:
return TensorWiseFP8LinearMethod(self)
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
TensorWiseFP8MoEMethod,
)
return TensorWiseFP8MoEMethod(self)
else:
return TensorWiseFP8LinearMethod(self)
class TensorWiseFP8LinearMethod(QuantMethodBase):