diff --git a/custom_ops/xpu_ops/download_dependencies.sh b/custom_ops/xpu_ops/download_dependencies.sh index b927448ccc..9499cf7fee 100644 --- a/custom_ops/xpu_ops/download_dependencies.sh +++ b/custom_ops/xpu_ops/download_dependencies.sh @@ -15,7 +15,7 @@ if [ "$1" == "stable" ]; then version_xvllm="20251017" version_xtdk="3.4.0.1" else - version_xvllm="20260407" + version_xvllm="latest" version_xtdk="3.6.2.1" fi diff --git a/custom_ops/xpu_ops/src/ops/block_attn.cc b/custom_ops/xpu_ops/src/ops/block_attn.cc index b14ac36f7d..a9e23c0834 100644 --- a/custom_ops/xpu_ops/src/ops/block_attn.cc +++ b/custom_ops/xpu_ops/src/ops/block_attn.cc @@ -156,7 +156,7 @@ std::vector BlockAttnKernel( rope_head_dim = rotary_embs.dims()[4]; } std::string pos_emb_type; - if (use_neox_rotary_style == true) { + if (use_neox_rotary_style) { pos_emb_type = "NEOX"; } else if (rope_head_dim == head_dim / 2) { pos_emb_type = "HALF_HEAD_DIM"; @@ -342,12 +342,14 @@ std::vector BlockAttnKernel( value_cache.data())), vsl.usual_lod_vp, // seq_lod vsl.slot_mapping_vp, // real_batch + prefix_lens_vp, // start_tokens param.batch_size, // batch_size 1, // emb_batch_size rope_max_seqlen, // max_seqlen param.head_num, param.kv_head_num, param.head_dim, + rope_head_dim, param.max_batch_size, block_size, max_block_per_seq, @@ -586,7 +588,8 @@ std::vector BlockAttnKernel( ret = infer_ops:: split_neox_cache_kv_encoder( xpu_ctx->x_context(), - reinterpret_cast(qkv.data()), // qkv + reinterpret_cast(qkv.data()) + + total_enc_len * qkv_shape[qkv_shape.size() - 1], // qkv reinterpret_cast( rotary_embs.data()), // rotary_pos_emb reinterpret_cast( @@ -598,14 +601,16 @@ std::vector BlockAttnKernel( key_cache.data())), const_cast(reinterpret_cast( value_cache.data())), - decoder_seq_lod_vp, // seq_lod - decoder_batch_map_vp, // real_batch - param.batch_size, // batch_size - 1, // emb_batch_size - rope_max_seqlen, // max_seqlen + decoder_seq_lod_vp, // seq_lod + decoder_batch_map_vp, // real_batch + decoder_context_len_cache_vp, // start_tokens + param.batch_size, // batch_size + 1, // emb_batch_size + rope_max_seqlen, // max_seqlen param.head_num, param.kv_head_num, param.head_dim, + rope_head_dim, param.max_batch_size, block_size, max_block_per_seq, @@ -806,6 +811,7 @@ std::vector BlockAttnKernel( param.head_num, param.kv_head_num, param.head_dim, + rope_head_dim, param.max_batch_size, block_size, max_block_per_seq, diff --git a/custom_ops/xpu_ops/src/ops/fused_noaux_tc.cc b/custom_ops/xpu_ops/src/ops/fused_noaux_tc.cc index 66dbe7d3f9..5e10757107 100644 --- a/custom_ops/xpu_ops/src/ops/fused_noaux_tc.cc +++ b/custom_ops/xpu_ops/src/ops/fused_noaux_tc.cc @@ -76,19 +76,19 @@ std::vector> FusedNoAuxTcInferShape( const float routed_scaling_factor) { std::vector topk_ids_shape = {gating_logits_shape[0], top_k}; std::vector topk_weights_shape = {gating_logits_shape[0], top_k}; - return {gating_logits_shape, topk_ids_shape, topk_weights_shape}; + return {gating_logits_shape, topk_weights_shape, topk_ids_shape}; } std::vector FusedNoAuxTcInferDtype( const paddle::DataType& gating_logits_dtype, const paddle::DataType& bias_dtype) { return { - gating_logits_dtype, paddle::DataType::INT64, paddle::DataType::FLOAT32}; + gating_logits_dtype, paddle::DataType::FLOAT32, paddle::DataType::INT32}; } PD_BUILD_STATIC_OP(fused_noaux_tc) .Inputs({"gating_logits", "bias"}) - .Outputs({"gating_logits_out", "topk_ids", "topk_weights"}) + .Outputs({"gating_logits_out", "topk_weights", "topk_ids"}) .Attrs({"n_group: int", "topk_group: int", "top_k: int", diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py index 085c202c9a..f2c37452ca 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py +++ b/fastdeploy/model_executor/layers/backends/xpu/moe/fused_moe.py @@ -313,7 +313,7 @@ class XPUMoEMethod(MoEMethodBase): """ gate_out = gate(x.cast("float32")) if layer.topk_method == "noaux_tc": - _, topk_idx, topk_weights = get_moe_scores( + _, topk_weights, topk_idx = get_moe_scores( gate_out, layer.n_group, layer.topk_group, diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index b35d97d766..848a54411b 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -61,7 +61,8 @@ class UnquantizedLinearMethod(QuantMethodBase): ) if self.model_format == "torch" and "output_dim" in extra_weight_attrs: - extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"] + if extra_weight_attrs["output_dim"] is not None: + extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"] set_weight_attrs( layer.weight, diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index 678873f76d..8e8197ba1d 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -136,6 +136,7 @@ def parse_quant_config(args, model_config, is_ernie, is_v1_loader): logger.warning(f"Failed to parse quantization config normally ({e}), trying fallback") quant_config_name = args.quantization["quantization"] quantization_config["quantization"] = quant_config_name + model_config.quantization_config = quantization_config # Special handling for Ernie models if quant_config_name == "wint4" and is_ernie: quantization_config["dense_quant_type"] = "wint8" diff --git a/fastdeploy/model_executor/layers/rotary_embedding.py b/fastdeploy/model_executor/layers/rotary_embedding.py index dd77cf2bc0..945bf93437 100644 --- a/fastdeploy/model_executor/layers/rotary_embedding.py +++ b/fastdeploy/model_executor/layers/rotary_embedding.py @@ -44,7 +44,7 @@ class ErnieRotaryEmbedding: inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim) partial_rotary_position_ids = position_ids / self.partial_rotary_factor freqs = paddle.einsum("ij,k->ijk", partial_rotary_position_ids.cast("float32"), inv_freq) - if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_custom_device("iluvatar_gpu"): + if current_platform.is_xpu() or paddle.is_compiled_with_custom_device("iluvatar_gpu"): # shape: [B, S, D] rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32") emb = paddle.stack([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim)) @@ -95,9 +95,14 @@ class GlmRotaryEmbedding: else: inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim) freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq) - # shape: [B, S, D/2] - rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim // 2), dtype="float32") - emb = paddle.stack([freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim // 2)) + if current_platform.is_xpu(): + # shape: [B, S, D] + rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32") + emb = paddle.concat([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim)) + else: + # shape: [B, S, D/2] + rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim // 2), dtype="float32") + emb = paddle.stack([freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim // 2)) # shape: [B, S, 1, D] emb = paddle.unsqueeze(emb, 2) rot_emb[0] = paddle.cos(emb) diff --git a/fastdeploy/model_executor/models/glm4_moe.py b/fastdeploy/model_executor/models/glm4_moe.py index 947df7c038..5afd5970d0 100644 --- a/fastdeploy/model_executor/models/glm4_moe.py +++ b/fastdeploy/model_executor/models/glm4_moe.py @@ -73,7 +73,7 @@ class Glm4MoeMLP(nn.Layer): fd_config=fd_config, prefix=f"{prefix}.up_gate_proj", input_size=fd_config.model_config.hidden_size, - output_size=[intermediate_size, intermediate_size], + output_sizes=[intermediate_size, intermediate_size], with_bias=False, ) diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 6774b1b4bc..b33f3af6dc 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -990,6 +990,7 @@ class XPUModelRunner(ModelRunnerBase): position_ids=tmp_position_ids, base=self.model_config.rope_theta, model_config=self.model_config, + partial_rotary_factor=self.model_config.partial_rotary_factor, ) # Set block tables