Commit to PaddlePaddle/FastDeploy (mirror of https://github.com/PaddlePaddle/FastDeploy.git):
[XPU] glm-4.5-air (#7071)
@@ -15,7 +15,7 @@ if [ "$1" == "stable" ]; then
     version_xvllm="20251017"
     version_xtdk="3.4.0.1"
 else
-    version_xvllm="20260407"
+    version_xvllm="latest"
     version_xtdk="3.6.2.1"
 fi
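Read with the usual unified-diff ordering (removed line before added line), the script change above switches the non-stable branch from the dated xvllm snapshot 20260407 to "latest", while the stable branch remains pinned to the 20251017 xvllm build and XTDK 3.4.0.1; the non-stable XTDK stays at 3.6.2.1.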
@@ -156,7 +156,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
     rope_head_dim = rotary_embs.dims()[4];
   }
   std::string pos_emb_type;
-  if (use_neox_rotary_style == true) {
+  if (use_neox_rotary_style) {
     pos_emb_type = "NEOX";
   } else if (rope_head_dim == head_dim / 2) {
     pos_emb_type = "HALF_HEAD_DIM";
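For orientation, a minimal C++ sketch of the selection this hunk touches follows. The helper name is invented, use_neox_rotary_style / rope_head_dim / head_dim come from the surrounding kernel, and the final else branch with its "NORMAL" value is an assumption, since the visible diff ends at the HALF_HEAD_DIM case.

#include <string>

// Sketch (invented helper name) of the position-embedding-type selection around
// this hunk; the trailing else branch and the "NORMAL" label are assumptions.
std::string SelectPosEmbType(bool use_neox_rotary_style, int rope_head_dim, int head_dim) {
  if (use_neox_rotary_style) {              // boolean tested directly, no "== true"
    return "NEOX";                          // full NeoX-style rotary embedding
  } else if (rope_head_dim == head_dim / 2) {
    return "HALF_HEAD_DIM";                 // partial rotary: RoPE over half the head dim
  }
  return "NORMAL";                          // assumed default for the remaining case
}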
@@ -342,12 +342,14 @@ std::vector<paddle::Tensor> BlockAttnKernel(
             value_cache.data<cdata_t>())),
         vsl.usual_lod_vp,     // seq_lod
         vsl.slot_mapping_vp,  // real_batch
+        prefix_lens_vp,       // start_tokens
         param.batch_size,     // batch_size
         1,                    // emb_batch_size
         rope_max_seqlen,      // max_seqlen
         param.head_num,
         param.kv_head_num,
         param.head_dim,
+        rope_head_dim,
         param.max_batch_size,
         block_size,
         max_block_per_seq,
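The prefill-side call above gains two arguments: a per-sequence start-token vector (prefix_lens_vp) and rope_head_dim. As a hypothetical illustration of what such a start-token vector usually encodes when prefix caching is enabled, the helper below builds one from cached prefix lengths; the struct and function names are invented and are not FastDeploy code.

#include <vector>

// Hypothetical illustration (invented names): with prefix caching, a request may
// already have prefix_len tokens resident in the KV cache, so the attention kernel
// needs a per-sequence start offset in addition to the usual seq_lod.
struct SeqPrefix {
  int prefix_len;  // tokens already present in the cache for this sequence
};

std::vector<int> BuildStartTokens(const std::vector<SeqPrefix>& seqs) {
  std::vector<int> start_tokens;
  start_tokens.reserve(seqs.size());
  for (const auto& s : seqs) {
    start_tokens.push_back(s.prefix_len);  // new tokens begin right after the prefix
  }
  return start_tokens;
}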
@@ -586,7 +588,8 @@ std::vector<paddle::Tensor> BlockAttnKernel(
       ret = infer_ops::
           split_neox_cache_kv_encoder<XPU_XType, float, XPU_CType, int>(
               xpu_ctx->x_context(),
-              reinterpret_cast<const XPU_XType*>(qkv.data<data_t>()),  // qkv
+              reinterpret_cast<const XPU_XType*>(qkv.data<data_t>()) +
+                  total_enc_len * qkv_shape[qkv_shape.size() - 1],  // qkv
               reinterpret_cast<const float*>(
                   rotary_embs.data<float>()),  // rotary_pos_emb
               reinterpret_cast<const int*>(
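In the hunk above, the qkv base pointer is advanced by total_enc_len times the last dimension of qkv_shape, i.e. the call now starts reading total_enc_len rows into the packed QKV buffer. A small sketch of that row-major offset follows; the function name is invented.

#include <cstdint>

// Sketch of the pointer arithmetic introduced above: in a packed
// [num_rows, qkv_width] buffer, skipping the first total_enc_len rows means
// advancing the base pointer by total_enc_len * qkv_width elements.
template <typename T>
const T* OffsetRows(const T* qkv, std::int64_t total_enc_len, std::int64_t qkv_width) {
  return qkv + total_enc_len * qkv_width;  // row-major element offset
}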
@@ -598,14 +601,16 @@ std::vector<paddle::Tensor> BlockAttnKernel(
               key_cache.data<cdata_t>())),
           const_cast<XPU_CType*>(reinterpret_cast<const XPU_CType*>(
               value_cache.data<cdata_t>())),
-          decoder_seq_lod_vp,    // seq_lod
-          decoder_batch_map_vp,  // real_batch
-          param.batch_size,      // batch_size
-          1,                     // emb_batch_size
-          rope_max_seqlen,       // max_seqlen
+          decoder_seq_lod_vp,            // seq_lod
+          decoder_batch_map_vp,          // real_batch
+          decoder_context_len_cache_vp,  // start_tokens
+          param.batch_size,              // batch_size
+          1,                             // emb_batch_size
+          rope_max_seqlen,               // max_seqlen
           param.head_num,
           param.kv_head_num,
           param.head_dim,
+          rope_head_dim,
           param.max_batch_size,
           block_size,
           max_block_per_seq,
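The decode-side call mirrors the prefill-side change: decoder_context_len_cache_vp appears to supply the per-sequence start tokens here, and rope_head_dim is threaded through in the same position.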
@@ -806,6 +811,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
           param.head_num,
           param.kv_head_num,
           param.head_dim,
+          rope_head_dim,
           param.max_batch_size,
           block_size,
           max_block_per_seq,

@@ -76,19 +76,19 @@ std::vector<std::vector<int64_t>> FusedNoAuxTcInferShape(
     const float routed_scaling_factor) {
   std::vector<int64_t> topk_ids_shape = {gating_logits_shape[0], top_k};
   std::vector<int64_t> topk_weights_shape = {gating_logits_shape[0], top_k};
-  return {gating_logits_shape, topk_ids_shape, topk_weights_shape};
+  return {gating_logits_shape, topk_weights_shape, topk_ids_shape};
 }
 
 std::vector<paddle::DataType> FusedNoAuxTcInferDtype(
     const paddle::DataType& gating_logits_dtype,
     const paddle::DataType& bias_dtype) {
   return {
-      gating_logits_dtype, paddle::DataType::INT64, paddle::DataType::FLOAT32};
+      gating_logits_dtype, paddle::DataType::FLOAT32, paddle::DataType::INT32};
 }
 
 PD_BUILD_STATIC_OP(fused_noaux_tc)
     .Inputs({"gating_logits", "bias"})
-    .Outputs({"gating_logits_out", "topk_ids", "topk_weights"})
+    .Outputs({"gating_logits_out", "topk_weights", "topk_ids"})
     .Attrs({"n_group: int",
             "topk_group: int",
             "top_k: int",
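The fused_noaux_tc changes reorder the op outputs so that topk_weights (FLOAT32) precedes topk_ids (now INT32), with the InferShape and InferDtype return lists updated to match. The toy registration below (hypothetical names and a fixed top_k of 8, not the real fused_noaux_tc sources) sketches the invariant being enforced: the .Outputs() list, the kernel's return order, InferShape, and InferDtype must all describe the same tensors in the same positions.

#include <cstdint>
#include <vector>
#include "paddle/extension.h"

// Toy Paddle custom op (hypothetical names) illustrating output/shape/dtype alignment.
std::vector<paddle::Tensor> ToyKernel(const paddle::Tensor& logits) {
  std::vector<int64_t> topk_shape = {logits.shape()[0], 8};
  auto weights = paddle::empty(topk_shape, paddle::DataType::FLOAT32, logits.place());
  auto ids = paddle::empty(topk_shape, paddle::DataType::INT32, logits.place());
  return {logits, weights, ids};  // out, topk_weights, topk_ids -- same order everywhere
}

std::vector<std::vector<int64_t>> ToyInferShape(std::vector<int64_t> logits_shape) {
  return {logits_shape, {logits_shape[0], 8}, {logits_shape[0], 8}};
}

std::vector<paddle::DataType> ToyInferDtype(paddle::DataType logits_dtype) {
  return {logits_dtype, paddle::DataType::FLOAT32, paddle::DataType::INT32};
}

PD_BUILD_STATIC_OP(toy_noaux_tc)
    .Inputs({"logits"})
    .Outputs({"out", "topk_weights", "topk_ids"})
    .SetKernelFn(PD_KERNEL(ToyKernel))
    .SetInferShapeFn(PD_INFER_SHAPE(ToyInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(ToyInferDtype));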