Commit to PaddlePaddle/FastDeploy (mirror of https://github.com/PaddlePaddle/FastDeploy.git):
[XPU] glm-4.5-air (#7071)
@@ -15,7 +15,7 @@ if [ "$1" == "stable" ]; then
     version_xvllm="20251017"
     version_xtdk="3.4.0.1"
 else
-    version_xvllm="20260407"
+    version_xvllm="latest"
     version_xtdk="3.6.2.1"
 fi
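Read with the usual unified-diff ordering (removed line before added line), the script change above switches the non-stable branch from the dated xvllm snapshot 20260407 to "latest", while the stable branch remains pinned to the 20251017 xvllm build and XTDK 3.4.0.1; the non-stable XTDK stays at 3.6.2.1.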
@@ -156,7 +156,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
     rope_head_dim = rotary_embs.dims()[4];
   }
   std::string pos_emb_type;
-  if (use_neox_rotary_style == true) {
+  if (use_neox_rotary_style) {
     pos_emb_type = "NEOX";
   } else if (rope_head_dim == head_dim / 2) {
     pos_emb_type = "HALF_HEAD_DIM";
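For orientation, a minimal C++ sketch of the selection this hunk touches follows. The helper name is invented, use_neox_rotary_style / rope_head_dim / head_dim come from the surrounding kernel, and the final else branch with its "NORMAL" value is an assumption, since the visible diff ends at the HALF_HEAD_DIM case.

#include <string>

// Sketch (invented helper name) of the position-embedding-type selection around
// this hunk; the trailing else branch and the "NORMAL" label are assumptions.
std::string SelectPosEmbType(bool use_neox_rotary_style, int rope_head_dim, int head_dim) {
  if (use_neox_rotary_style) {              // boolean tested directly, no "== true"
    return "NEOX";                          // full NeoX-style rotary embedding
  } else if (rope_head_dim == head_dim / 2) {
    return "HALF_HEAD_DIM";                 // partial rotary: RoPE over half the head dim
  }
  return "NORMAL";                          // assumed default for the remaining case
}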
@@ -342,12 +342,14 @@ std::vector<paddle::Tensor> BlockAttnKernel(
             value_cache.data<cdata_t>())),
         vsl.usual_lod_vp,     // seq_lod
         vsl.slot_mapping_vp,  // real_batch
+        prefix_lens_vp,       // start_tokens
         param.batch_size,     // batch_size
         1,                    // emb_batch_size
         rope_max_seqlen,      // max_seqlen
         param.head_num,
         param.kv_head_num,
         param.head_dim,
+        rope_head_dim,
         param.max_batch_size,
         block_size,
         max_block_per_seq,
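The prefill-side call above gains two arguments: a per-sequence start-token vector (prefix_lens_vp) and rope_head_dim. As a hypothetical illustration of what such a start-token vector usually encodes when prefix caching is enabled, the helper below builds one from cached prefix lengths; the struct and function names are invented and are not FastDeploy code.

#include <vector>

// Hypothetical illustration (invented names): with prefix caching, a request may
// already have prefix_len tokens resident in the KV cache, so the attention kernel
// needs a per-sequence start offset in addition to the usual seq_lod.
struct SeqPrefix {
  int prefix_len;  // tokens already present in the cache for this sequence
};

std::vector<int> BuildStartTokens(const std::vector<SeqPrefix>& seqs) {
  std::vector<int> start_tokens;
  start_tokens.reserve(seqs.size());
  for (const auto& s : seqs) {
    start_tokens.push_back(s.prefix_len);  // new tokens begin right after the prefix
  }
  return start_tokens;
}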
@@ -586,7 +588,8 @@ std::vector<paddle::Tensor> BlockAttnKernel(
       ret = infer_ops::
           split_neox_cache_kv_encoder<XPU_XType, float, XPU_CType, int>(
               xpu_ctx->x_context(),
-              reinterpret_cast<const XPU_XType*>(qkv.data<data_t>()),  // qkv
+              reinterpret_cast<const XPU_XType*>(qkv.data<data_t>()) +
+                  total_enc_len * qkv_shape[qkv_shape.size() - 1],  // qkv
               reinterpret_cast<const float*>(
                   rotary_embs.data<float>()),  // rotary_pos_emb
               reinterpret_cast<const int*>(
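In the hunk above, the qkv base pointer is advanced by total_enc_len times the last dimension of qkv_shape, i.e. the call now starts reading total_enc_len rows into the packed QKV buffer. A small sketch of that row-major offset follows; the function name is invented.

#include <cstdint>

// Sketch of the pointer arithmetic introduced above: in a packed
// [num_rows, qkv_width] buffer, skipping the first total_enc_len rows means
// advancing the base pointer by total_enc_len * qkv_width elements.
template <typename T>
const T* OffsetRows(const T* qkv, std::int64_t total_enc_len, std::int64_t qkv_width) {
  return qkv + total_enc_len * qkv_width;  // row-major element offset
}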
@@ -598,14 +601,16 @@ std::vector<paddle::Tensor> BlockAttnKernel(
               key_cache.data<cdata_t>())),
           const_cast<XPU_CType*>(reinterpret_cast<const XPU_CType*>(
               value_cache.data<cdata_t>())),
-          decoder_seq_lod_vp,    // seq_lod
-          decoder_batch_map_vp,  // real_batch
-          param.batch_size,      // batch_size
-          1,                     // emb_batch_size
-          rope_max_seqlen,       // max_seqlen
+          decoder_seq_lod_vp,            // seq_lod
+          decoder_batch_map_vp,          // real_batch
+          decoder_context_len_cache_vp,  // start_tokens
+          param.batch_size,              // batch_size
+          1,                             // emb_batch_size
+          rope_max_seqlen,               // max_seqlen
           param.head_num,
           param.kv_head_num,
           param.head_dim,
+          rope_head_dim,
           param.max_batch_size,
           block_size,
           max_block_per_seq,
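The decode-side call mirrors the prefill-side change: decoder_context_len_cache_vp appears to supply the per-sequence start tokens here, and rope_head_dim is threaded through in the same position.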
@@ -806,6 +811,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
           param.head_num,
           param.kv_head_num,
           param.head_dim,
+          rope_head_dim,
           param.max_batch_size,
           block_size,
           max_block_per_seq,

@@ -76,19 +76,19 @@ std::vector<std::vector<int64_t>> FusedNoAuxTcInferShape(
     const float routed_scaling_factor) {
   std::vector<int64_t> topk_ids_shape = {gating_logits_shape[0], top_k};
   std::vector<int64_t> topk_weights_shape = {gating_logits_shape[0], top_k};
-  return {gating_logits_shape, topk_ids_shape, topk_weights_shape};
+  return {gating_logits_shape, topk_weights_shape, topk_ids_shape};
 }
 
 std::vector<paddle::DataType> FusedNoAuxTcInferDtype(
     const paddle::DataType& gating_logits_dtype,
     const paddle::DataType& bias_dtype) {
   return {
-      gating_logits_dtype, paddle::DataType::INT64, paddle::DataType::FLOAT32};
+      gating_logits_dtype, paddle::DataType::FLOAT32, paddle::DataType::INT32};
 }
 
 PD_BUILD_STATIC_OP(fused_noaux_tc)
     .Inputs({"gating_logits", "bias"})
-    .Outputs({"gating_logits_out", "topk_ids", "topk_weights"})
+    .Outputs({"gating_logits_out", "topk_weights", "topk_ids"})
     .Attrs({"n_group: int",
             "topk_group: int",
             "top_k: int",
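The fused_noaux_tc changes reorder the op outputs so that topk_weights (FLOAT32) precedes topk_ids (now INT32), with the InferShape and InferDtype return lists updated to match. The toy registration below (hypothetical names and a fixed top_k of 8, not the real fused_noaux_tc sources) sketches the invariant being enforced: the .Outputs() list, the kernel's return order, InferShape, and InferDtype must all describe the same tensors in the same positions.

#include <cstdint>
#include <vector>
#include "paddle/extension.h"

// Toy Paddle custom op (hypothetical names) illustrating output/shape/dtype alignment.
std::vector<paddle::Tensor> ToyKernel(const paddle::Tensor& logits) {
  std::vector<int64_t> topk_shape = {logits.shape()[0], 8};
  auto weights = paddle::empty(topk_shape, paddle::DataType::FLOAT32, logits.place());
  auto ids = paddle::empty(topk_shape, paddle::DataType::INT32, logits.place());
  return {logits, weights, ids};  // out, topk_weights, topk_ids -- same order everywhere
}

std::vector<std::vector<int64_t>> ToyInferShape(std::vector<int64_t> logits_shape) {
  return {logits_shape, {logits_shape[0], 8}, {logits_shape[0], 8}};
}

std::vector<paddle::DataType> ToyInferDtype(paddle::DataType logits_dtype) {
  return {logits_dtype, paddle::DataType::FLOAT32, paddle::DataType::INT32};
}

PD_BUILD_STATIC_OP(toy_noaux_tc)
    .Inputs({"logits"})
    .Outputs({"out", "topk_weights", "topk_ids"})
    .SetKernelFn(PD_KERNEL(ToyKernel))
    .SetInferShapeFn(PD_INFER_SHAPE(ToyInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(ToyInferDtype));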