[XPU] bind some OPs for VL model with pybind (#4522)

2026-04-23 00:17:25 +08:00 · 2025-10-27 10:50:08 +08:00
parent cdc40cdc2a
commit 5c6105f4a2
8 changed files with 1789 additions and 1087 deletions
@@ -15,146 +15,150 @@
 #include "helper.h"

 template <int THREADBLOCK_SIZE>
-__global__ void update_inputs_kernel_v1(bool *not_need_stop,
-                                     int *seq_lens_this_time,
-                                     int *seq_lens_encoder,
-                                     int *seq_lens_decoder,
-                                     int *step_seq_lens_decoder,
-                                     int64_t *prompt_lens,
-                                     int64_t *topk_ids,
-                                     int64_t *input_ids,
-                                     int *block_tables,
-                                     const int64_t *stop_nums,
-                                     bool *stop_flags,
-                                     bool *is_block_step,
-                                     const int64_t *next_tokens,
-                                     const int bsz,
-                                     const int max_bsz,
-                                     const int input_ids_stride,
-                                     const int block_num_per_seq,
-                                     const int block_size,
-                                     bool prefill_one_step_stop) {
-    int thread_idx = threadIdx.x;
-    typedef cub::BlockReduce<int64_t, THREADBLOCK_SIZE> BlockReduce;
-    __shared__ typename BlockReduce::TempStorage temp_storage;
+__global__ void update_inputs_kernel_v1(bool* not_need_stop,
+                                        int* seq_lens_this_time,
+                                        int* seq_lens_encoder,
+                                        int* seq_lens_decoder,
+                                        int* step_seq_lens_decoder,
+                                        int64_t* prompt_lens,
+                                        int64_t* topk_ids,
+                                        int64_t* input_ids,
+                                        int* block_tables,
+                                        const int64_t* stop_nums,
+                                        bool* stop_flags,
+                                        bool* is_block_step,
+                                        const int64_t* next_tokens,
+                                        const int bsz,
+                                        const int max_bsz,
+                                        const int input_ids_stride,
+                                        const int block_num_per_seq,
+                                        const int block_size,
+                                        bool prefill_one_step_stop) {
+  int thread_idx = threadIdx.x;
+  typedef cub::BlockReduce<int64_t, THREADBLOCK_SIZE> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;

-    bool stop_flag_now = false;
-    int64_t stop_flag_now_int = 0;
-    if (thread_idx < max_bsz) {
-        if (thread_idx < bsz) {
-            stop_flag_now = stop_flags[thread_idx];
-            stop_flag_now_int = static_cast<int64_t>(stop_flag_now);
-        } else {
-            stop_flag_now_int = 1;
-        }
-    }
+  bool stop_flag_now = false;
+  int64_t stop_flag_now_int = 0;
+  if (thread_idx < max_bsz) {
    if (thread_idx < bsz) {
-        if(stop_flag_now) {
-            seq_lens_this_time[thread_idx] = 0; // stop at next step
-            seq_lens_decoder[thread_idx] = 0;
-            seq_lens_encoder[thread_idx] = 0;
+      stop_flag_now = stop_flags[thread_idx];
+      stop_flag_now_int = static_cast<int64_t>(stop_flag_now);
+    } else {
+      stop_flag_now_int = 1;
+    }
+  }
+  if (thread_idx < bsz) {
+    if (stop_flag_now) {
+      seq_lens_this_time[thread_idx] = 0;  // stop at next step
+      seq_lens_decoder[thread_idx] = 0;
+      seq_lens_encoder[thread_idx] = 0;
+    } else {
+      if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] >=
+          prompt_lens[thread_idx]) {
+        if (prefill_one_step_stop) {
+          // prefill done, stop
+          stop_flags[thread_idx] = true;
+          seq_lens_this_time[thread_idx] = 0;
+          seq_lens_decoder[thread_idx] = 0;
+          seq_lens_encoder[thread_idx] = 0;
+          stop_flag_now_int = 1;
        } else {
-            if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] >= prompt_lens[thread_idx]) {
-                if (prefill_one_step_stop) {
-                        // prefill done, stop
-                        stop_flags[thread_idx] = true;
-                        seq_lens_this_time[thread_idx] = 0;
-                        seq_lens_decoder[thread_idx] = 0;
-                        seq_lens_encoder[thread_idx] = 0;
-                        stop_flag_now_int = 1;
-                } else{
-                    // decoding
-                    seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx];
-                    seq_lens_this_time[thread_idx] = 1;
-                    seq_lens_encoder[thread_idx] = 0;
-                    int64_t *input_ids_now = input_ids + thread_idx * input_ids_stride;
-                    input_ids_now[0] = next_tokens[thread_idx];
+          // decoding
+          seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx];
+          seq_lens_this_time[thread_idx] = 1;
+          seq_lens_encoder[thread_idx] = 0;
+          int64_t* input_ids_now = input_ids + thread_idx * input_ids_stride;
+          input_ids_now[0] = next_tokens[thread_idx];

-                    // to judge whether block is not enough
-                    int *block_table_now = block_tables + thread_idx * block_num_per_seq;
-                    if (seq_lens_this_time[thread_idx] != 0 && block_table_now[seq_lens_decoder[thread_idx] / block_size] == -1) {
-                        // should be scheduled by server
-                        is_block_step[thread_idx] = true;
-                        seq_lens_this_time[thread_idx]= 0;
-                        stop_flags[thread_idx] = true;
-                        step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx];
-                        seq_lens_decoder[thread_idx] = 0;
-                        stop_flag_now_int = 1;
-                    }
-                }
-            } else
-            {
-                stop_flags[thread_idx] = true;
-                seq_lens_this_time[thread_idx] = 0;
-                seq_lens_decoder[thread_idx] = 0;
-                seq_lens_encoder[thread_idx] = 0;
-                topk_ids[thread_idx] = -1;
-                stop_flag_now_int = 1;
-            }
+          // to judge whether block is not enough
+          int* block_table_now = block_tables + thread_idx * block_num_per_seq;
+          if (seq_lens_this_time[thread_idx] != 0 &&
+              block_table_now[seq_lens_decoder[thread_idx] / block_size] ==
+                  -1) {
+            // should be scheduled by server
+            is_block_step[thread_idx] = true;
+            seq_lens_this_time[thread_idx] = 0;
+            stop_flags[thread_idx] = true;
+            step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx];
+            seq_lens_decoder[thread_idx] = 0;
+            stop_flag_now_int = 1;
+          }
        }
+      } else {
+        stop_flags[thread_idx] = true;
+        seq_lens_this_time[thread_idx] = 0;
+        seq_lens_decoder[thread_idx] = 0;
+        seq_lens_encoder[thread_idx] = 0;
+        topk_ids[thread_idx] = -1;
+        stop_flag_now_int = 1;
+      }
    }
-    __syncthreads();
-    int64_t stop_sum = BlockReduce(temp_storage).Sum(stop_flag_now_int);
-    if (thread_idx == 0) {
-        not_need_stop[0] = stop_sum < stop_nums[0];
-    }
+  }
+  __syncthreads();
+  int64_t stop_sum = BlockReduce(temp_storage).Sum(stop_flag_now_int);
+  if (thread_idx == 0) {
+    not_need_stop[0] = stop_sum < stop_nums[0];
+  }
 }

-void UpdateInputesV1(const paddle::Tensor &stop_flags,
-                   const paddle::Tensor &not_need_stop,  // only on cpu
-                   const paddle::Tensor &seq_lens_this_time,
-                   const paddle::Tensor &seq_lens_encoder,
-                   const paddle::Tensor &seq_lens_decoder,
-                   const paddle::Tensor &step_seq_lens_decoder,
-                   const paddle::Tensor &prompt_lens,
-                   const paddle::Tensor &topk_ids,
-                   const paddle::Tensor &input_ids,
-                   const paddle::Tensor &block_tables,
-                   const paddle::Tensor &stop_nums,
-                   const paddle::Tensor &next_tokens,
-                   const paddle::Tensor &is_block_step,
-                   const int block_size) {
+void UpdateInputsV1(const paddle::Tensor& stop_flags,
+                    const paddle::Tensor& not_need_stop,  // only on cpu
+                    const paddle::Tensor& seq_lens_this_time,
+                    const paddle::Tensor& seq_lens_encoder,
+                    const paddle::Tensor& seq_lens_decoder,
+                    const paddle::Tensor& step_seq_lens_decoder,
+                    const paddle::Tensor& prompt_lens,
+                    const paddle::Tensor& topk_ids,
+                    const paddle::Tensor& input_ids,
+                    const paddle::Tensor& block_tables,
+                    const paddle::Tensor& stop_nums,
+                    const paddle::Tensor& next_tokens,
+                    const paddle::Tensor& is_block_step,
+                    const int block_size) {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
-    auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(input_ids.place()));
-    auto cu_stream = dev_ctx->stream();
+  auto dev_ctx = static_cast<const phi::CustomContext*>(
+      paddle::experimental::DeviceContextPool::Instance().Get(
+          input_ids.place()));
+  auto cu_stream = dev_ctx->stream();
 #else
-    auto cu_stream = input_ids.stream();
+  auto cu_stream = input_ids.stream();
 #endif
-    bool prefill_one_step_stop = false;
-    if (const char *env_p = std::getenv("PREFILL_NODE_ONE_STEP_STOP_V1")) {
-        if (env_p[0] == '1') {
-            prefill_one_step_stop = true;
-        }
+  bool prefill_one_step_stop = false;
+  if (const char* env_p = std::getenv("PREFILL_NODE_ONE_STEP_STOP_V1")) {
+    if (env_p[0] == '1') {
+      prefill_one_step_stop = true;
    }
-    const int max_bsz = stop_flags.shape()[0];
-    const int now_bsz = seq_lens_this_time.shape()[0];
-    const int input_ids_stride = input_ids.shape()[1];
-    const int block_num_per_seq = block_tables.shape()[1];
-    auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
-    update_inputs_kernel_v1<1024><<<1, 1024, 0, cu_stream>>>(
-        const_cast<bool *>(not_need_stop_gpu.data<bool>()),
-        const_cast<int *>(seq_lens_this_time.data<int>()),
-        const_cast<int *>(seq_lens_encoder.data<int>()),
-        const_cast<int *>(seq_lens_decoder.data<int>()),
-        const_cast<int *>(step_seq_lens_decoder.data<int>()),
-        const_cast<int64_t *>(prompt_lens.data<int64_t>()),
-        const_cast<int64_t *>(topk_ids.data<int64_t>()),
-        const_cast<int64_t *>(input_ids.data<int64_t>()),
-        const_cast<int *>(block_tables.data<int>()),
-        stop_nums.data<int64_t>(),
-        const_cast<bool *>(stop_flags.data<bool>()),
-        const_cast<bool *>(is_block_step.data<bool>()),
-        next_tokens.data<int64_t>(),
-        now_bsz,
-        max_bsz,
-        input_ids_stride,
-        block_num_per_seq,
-        block_size,
-        prefill_one_step_stop);
-    auto not_need_stop_cpu =
-        not_need_stop_gpu.copy_to(not_need_stop.place(), false);
-    bool *not_need_stop_data = const_cast<bool *>(not_need_stop.data<bool>());
-    not_need_stop_data[0] = not_need_stop_cpu.data<bool>()[0];
+  }
+  const int max_bsz = stop_flags.shape()[0];
+  const int now_bsz = seq_lens_this_time.shape()[0];
+  const int input_ids_stride = input_ids.shape()[1];
+  const int block_num_per_seq = block_tables.shape()[1];
+  auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
+  update_inputs_kernel_v1<1024><<<1, 1024, 0, cu_stream>>>(
+      const_cast<bool*>(not_need_stop_gpu.data<bool>()),
+      const_cast<int*>(seq_lens_this_time.data<int>()),
+      const_cast<int*>(seq_lens_encoder.data<int>()),
+      const_cast<int*>(seq_lens_decoder.data<int>()),
+      const_cast<int*>(step_seq_lens_decoder.data<int>()),
+      const_cast<int64_t*>(prompt_lens.data<int64_t>()),
+      const_cast<int64_t*>(topk_ids.data<int64_t>()),
+      const_cast<int64_t*>(input_ids.data<int64_t>()),
+      const_cast<int*>(block_tables.data<int>()),
+      stop_nums.data<int64_t>(),
+      const_cast<bool*>(stop_flags.data<bool>()),
+      const_cast<bool*>(is_block_step.data<bool>()),
+      next_tokens.data<int64_t>(),
+      now_bsz,
+      max_bsz,
+      input_ids_stride,
+      block_num_per_seq,
+      block_size,
+      prefill_one_step_stop);
+  auto not_need_stop_cpu =
+      not_need_stop_gpu.copy_to(not_need_stop.place(), false);
+  bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
+  not_need_stop_data[0] = not_need_stop_cpu.data<bool>()[0];
 }

 PD_BUILD_STATIC_OP(update_inputs_v1)
@@ -190,4 +194,4 @@ PD_BUILD_STATIC_OP(update_inputs_v1)
                    {"stop_flags", "stop_flags_out"},
                    {"step_seq_lens_decoder", "step_seq_lens_decoder_out"},
                    {"is_block_step", "is_block_step_out"}})
-    .SetKernelFn(PD_KERNEL(UpdateInputesV1));
+    .SetKernelFn(PD_KERNEL(UpdateInputsV1));