Enhance set_stop_value_multi_ends and standardize the registration of some operators (#4525)

* fix custom_ops

* paddleformers>=0.3.1
Authored by Yuanle Liu on 2025-10-21 22:06:06 +08:00; committed by GitHub.
parent dc7facaa7f
commit 3b58310c26
18 changed files with 68 additions and 17 deletions
+4 -1
View File
@@ -15,6 +15,9 @@
#include "paddle/extension.h"
#include "moba_attn.h"
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
std::vector<paddle::Tensor> MobaAttention(
const paddle::Tensor& qkv,
@@ -272,7 +275,7 @@ std::vector<paddle::Tensor> MobaAttention(
}
-PD_BUILD_OP(moba_attention)
+PD_BUILD_STATIC_OP(moba_attention)
.Inputs({
"qkv",
"q_input",
@@ -16,6 +16,9 @@
#include "moba_attn/moba_attn_utils.hpp"
#include "moba_attn/moba_attn.h"
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
template <typename T, int knthreads, int moba_block_size, int kBlockMaxN, int searchtimes>
__global__ void qk_gate_sort_decoder_kernel(
@@ -221,7 +224,7 @@ std::vector<paddle::Tensor> QkSortDecoder(
}
}
-PD_BUILD_OP(moba_qk_sort_decoder)
+PD_BUILD_STATIC_OP(moba_qk_sort_decoder)
.Inputs({
"qk_gate_weight",
"seq_len_encoder",
@@ -27,6 +27,10 @@
#include "softmax.hpp"
#include "cutlass/arch/reg_reconfig.h"
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
template <int kHeadDim>
auto get_gmem_layout(int token_num, int head_num) {
return make_layout(
@@ -360,7 +364,7 @@ void MobaEncoderAttn(
}
-PD_BUILD_OP(moba_encoder_attn)
+PD_BUILD_STATIC_OP(moba_encoder_attn)
.Inputs({
"q_input",
"k_input",
@@ -15,6 +15,9 @@
#include "paddle/extension.h"
#include "moba_attn/moba_attn.h"
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
template <typename T, int kBlockSize, int kHeadDim>
__global__ void write_encoder_cachekv_c16(
@@ -135,7 +138,7 @@ void MobaEncoderAttnWriteCacheKv(
}
}
-PD_BUILD_OP(moba_encoder_attn_write_cache_kv)
+PD_BUILD_STATIC_OP(moba_encoder_attn_write_cache_kv)
.Inputs({
"k_input",
"v_input",
@@ -15,6 +15,10 @@
#include "paddle/extension.h"
#include "moba_attn/moba_attn_utils.hpp"
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
template <typename T, int knthreads, int moba_block_size, int kBlockM, int kBlockMaxN, int searchtimes>
__global__ void qk_gate_sort_encoder_kernel(
const T* qk_gate_weight,
@@ -320,7 +324,7 @@ std::vector<paddle::Tensor> QkSortEncoder(
}
}
-PD_BUILD_OP(moba_qk_sort_encoder)
+PD_BUILD_STATIC_OP(moba_qk_sort_encoder)
.Inputs({
"qk_gate_weight",
"seq_len_encoder",
@@ -16,6 +16,10 @@
#include "moba_attn/moba_attn_utils.hpp"
#include "moba_attn/moba_attn.h"
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
template <typename T, int kBlockSize, int kHeadDim>
__global__ void get_kv_from_cache_c16_kernel(
T * k_input,
@@ -251,7 +255,7 @@ std::vector<paddle::Tensor> GetCurCuSeqLenk(
return {cu_seq_q_pack, cu_seqlens_k, q_pack_tokens_cpu};
}
-PD_BUILD_OP(get_kv_from_cache)
+PD_BUILD_STATIC_OP(get_kv_from_cache)
.Inputs({
"k_input",
"v_input",
@@ -277,7 +281,7 @@ PD_BUILD_OP(get_kv_from_cache)
{"v_input", "v_input_out"}})
.SetKernelFn(PD_KERNEL(GetKVFromCache));
-PD_BUILD_OP(get_cur_cu_seq_len_k)
+PD_BUILD_STATIC_OP(get_cur_cu_seq_len_k)
.Inputs({
"seq_lens_encoder",
"seq_lens_decoder",
@@ -16,6 +16,9 @@
#include "moba_attn/moba_attn_utils.hpp"
#include "moba_attn/moba_attn.h"
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
template <typename T, int moba_block_size, int kHeadDim, int kMaxN>
__global__ void moba_mlp_einsum_kernel(
@@ -207,7 +210,7 @@ std::vector<paddle::Tensor> MobaMlpEinsum(
return {k_gate_weight};
}
-PD_BUILD_OP(moba_mlp_einsum)
+PD_BUILD_STATIC_OP(moba_mlp_einsum)
.Inputs({
"k_input",
"attn_gate_weight",
@@ -25,6 +25,10 @@
#include "cutlass/cluster_launch.hpp"
#include "cutlass/arch/reg_reconfig.h"
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
template <typename input_type, int kBlockM, int kBlockN, int kMobaBlockSize, int kMaxN, int kHeadDim, bool is_split_kv>
__global__ void qk_gemm_kernel(
const input_type *q_input,
@@ -446,7 +450,7 @@ std::vector<paddle::Tensor> MobaQKGemm(
}
}
-PD_BUILD_OP(moba_qk_gemm)
+PD_BUILD_STATIC_OP(moba_qk_gemm)
.Inputs({
"q_input",
"k_block_means",
@@ -16,6 +16,10 @@
#include "moba_attn/moba_attn_utils.hpp"
#include "moba_attn/moba_attn.h"
+#ifndef PD_BUILD_STATIC_OP
+#define PD_BUILD_STATIC_OP(name) PD_BUILD_OP(static_op_##name)
+#endif
template <typename input_type, int moba_block_size, int kBlockM, int kMaxN, int tokens_per_block, bool need_k_mean>
__global__ void fused_block_mean_and_rope_kernel(
const input_type *qkv_input,
@@ -341,7 +345,7 @@ void FusedBlockMeanAndRope(
-PD_BUILD_OP(fused_block_mean_and_rope)
+PD_BUILD_STATIC_OP(fused_block_mean_and_rope)
.Inputs({
"qkv_out",
"k_block_means",