[XPU] glm-4.5-air (#7071)

This commit is contained in:
zhupengyang
2026-04-14 11:31:49 +08:00
committed by GitHub
parent 26c47c2afc
commit 27b00cf385
9 changed files with 32 additions and 18 deletions
+1 -1
View File
@@ -15,7 +15,7 @@ if [ "$1" == "stable" ]; then
version_xvllm="20251017"
version_xtdk="3.4.0.1"
else
-version_xvllm="20260407"
+version_xvllm="latest"
version_xtdk="3.6.2.1"
fi
+13 -7
View File
@@ -156,7 +156,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
rope_head_dim = rotary_embs.dims()[4];
}
std::string pos_emb_type;
-if (use_neox_rotary_style == true) {
+if (use_neox_rotary_style) {
pos_emb_type = "NEOX";
} else if (rope_head_dim == head_dim / 2) {
pos_emb_type = "HALF_HEAD_DIM";
@@ -342,12 +342,14 @@ std::vector<paddle::Tensor> BlockAttnKernel(
value_cache.data<cdata_t>())),
vsl.usual_lod_vp, // seq_lod
vsl.slot_mapping_vp, // real_batch
+prefix_lens_vp, // start_tokens
param.batch_size, // batch_size
1, // emb_batch_size
rope_max_seqlen, // max_seqlen
param.head_num,
param.kv_head_num,
param.head_dim,
+rope_head_dim,
param.max_batch_size,
block_size,
max_block_per_seq,
@@ -586,7 +588,8 @@ std::vector<paddle::Tensor> BlockAttnKernel(
ret = infer_ops::
split_neox_cache_kv_encoder<XPU_XType, float, XPU_CType, int>(
xpu_ctx->x_context(),
-reinterpret_cast<const XPU_XType*>(qkv.data<data_t>()), // qkv
+reinterpret_cast<const XPU_XType*>(qkv.data<data_t>()) +
+total_enc_len * qkv_shape[qkv_shape.size() - 1], // qkv
reinterpret_cast<const float*>(
rotary_embs.data<float>()), // rotary_pos_emb
reinterpret_cast<const int*>(
@@ -598,14 +601,16 @@ std::vector<paddle::Tensor> BlockAttnKernel(
key_cache.data<cdata_t>())),
const_cast<XPU_CType*>(reinterpret_cast<const XPU_CType*>(
value_cache.data<cdata_t>())),
-decoder_seq_lod_vp, // seq_lod
-decoder_batch_map_vp, // real_batch
-param.batch_size, // batch_size
-1, // emb_batch_size
-rope_max_seqlen, // max_seqlen
+decoder_seq_lod_vp, // seq_lod
+decoder_batch_map_vp, // real_batch
+decoder_context_len_cache_vp, // start_tokens
+param.batch_size, // batch_size
+1, // emb_batch_size
+rope_max_seqlen, // max_seqlen
param.head_num,
param.kv_head_num,
param.head_dim,
+rope_head_dim,
param.max_batch_size,
block_size,
max_block_per_seq,
@@ -806,6 +811,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
param.head_num,
param.kv_head_num,
param.head_dim,
+rope_head_dim,
param.max_batch_size,
block_size,
max_block_per_seq,
+3 -3
View File
@@ -76,19 +76,19 @@ std::vector<std::vector<int64_t>> FusedNoAuxTcInferShape(
const float routed_scaling_factor) {
std::vector<int64_t> topk_ids_shape = {gating_logits_shape[0], top_k};
std::vector<int64_t> topk_weights_shape = {gating_logits_shape[0], top_k};
-return {gating_logits_shape, topk_ids_shape, topk_weights_shape};
+return {gating_logits_shape, topk_weights_shape, topk_ids_shape};
}
std::vector<paddle::DataType> FusedNoAuxTcInferDtype(
const paddle::DataType& gating_logits_dtype,
const paddle::DataType& bias_dtype) {
return {
-gating_logits_dtype, paddle::DataType::INT64, paddle::DataType::FLOAT32};
+gating_logits_dtype, paddle::DataType::FLOAT32, paddle::DataType::INT32};
}
PD_BUILD_STATIC_OP(fused_noaux_tc)
.Inputs({"gating_logits", "bias"})
-.Outputs({"gating_logits_out", "topk_ids", "topk_weights"})
+.Outputs({"gating_logits_out", "topk_weights", "topk_ids"})
.Attrs({"n_group: int",
"topk_group: int",
"top_k: int",