[XPU] refactor: XPU plugin namespace migration (#6799)

* [XPU] refactor: XPU plugin namespace migration

- Migrate wrapper layer namespace from baidu::xpu::api::plugin to fastdeploy::plugin
- Migrate kernel layer namespace from xpu3::plugin to fd_xpu3
- Add api:: prefix for types (Context, SUCCESS, XPUIndexType, ctx_guard)
- Remove XPU2 support, keep only XPU3
- Update ops/ directory to use new namespace

Total: 137 files changed
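
A compilable toy sketch of the wrapper-layer change described above (all names
below are illustrative stand-ins, not the real XPU SDK):

// Sketch: the wrapper layer moves from the vendor namespace into
// fastdeploy::plugin, keeping vendor types reachable via an `api` alias.
#include <cstdio>

namespace baidu { namespace xpu { namespace api {
struct Context {};          // stand-in for the vendor api::Context
constexpr int SUCCESS = 0;  // stand-in for api::SUCCESS
}}}  // namespace baidu::xpu::api

namespace fastdeploy {
namespace plugin {
namespace api = baidu::xpu::api;  // the alias added in the migrated header

// A wrapper entry point now lives here but still takes vendor types
// through the alias (this mirrors the "api:: prefix" bullet above).
inline int eb_adjust_batch_demo(api::Context* ctx) {
  return ctx ? api::SUCCESS : -1;
}
}  // namespace plugin
}  // namespace fastdeploy

int main() {
  baidu::xpu::api::Context ctx;
  // Call sites change from baidu::xpu::api::plugin::X to fastdeploy::plugin::X.
  int r = fastdeploy::plugin::eb_adjust_batch_demo(&ctx);
  std::printf("r = %d\n", r);
  return r;
}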

* [XPU] fix: add return value check and correct error messages

- Add PADDLE_ENFORCE_XDNN_SUCCESS check for speculate_get_logits and update_attn_mask_offsets
- Fix empty error message in draft_model_postprocess
- Correct function name in speculate_schedule_cache error message
- Update error messages from 'xpu::plugin::' to 'fastdeploy::plugin::'
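
A minimal sketch of the check pattern this commit enforces. The DEMO_ macro
below only mimics the shape of Paddle's PADDLE_ENFORCE_XDNN_SUCCESS so the
snippet compiles on its own, and the stub kernel is hypothetical:

#include <cstdio>
#include <cstdlib>

// Stand-in for PADDLE_ENFORCE_XDNN_SUCCESS(r, "kernel_name"): fail loudly,
// naming the kernel, whenever a plugin call returns nonzero.
#define DEMO_ENFORCE_XDNN_SUCCESS(r, kernel)                  \
  do {                                                        \
    if ((r) != 0) {                                           \
      std::fprintf(stderr, "XDNN [%s] failed, ret = %d\n",    \
                   (kernel), (r));                            \
      std::abort();                                           \
    }                                                         \
  } while (0)

int speculate_get_logits_stub() { return 0; }  // hypothetical stub

int main() {
  // Before this commit, the return value of speculate_get_logits was dropped.
  // After: capture it and check, so a failure names the offending kernel.
  int r = speculate_get_logits_stub();
  DEMO_ENFORCE_XDNN_SUCCESS(r, "speculate_get_logits");
  std::puts("ok");
  return 0;
}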
Author: mayang002
Date: 2026-03-13 10:21:51 +08:00
Committed by: GitHub
Parent: d73fd876ba
Commit: 1f9f889e37

138 changed files with 1086 additions and 1467 deletions
@@ -72,7 +72,7 @@ std::vector<paddle::Tensor> AdjustBatchKernel(
auto out = paddle::empty({token_num, dim}, x.type(), x.place());
if (token_num > 0) {
int r = baidu::xpu::api::plugin::eb_adjust_batch<XPUType, XPUType>(
int r = fastdeploy::plugin::eb_adjust_batch<XPUType, XPUType>(
ctx,
reinterpret_cast<const XPUType *>(x.data<data_t>()),
reinterpret_cast<XPUType *>(out.data<data_t>()),
@@ -90,7 +90,7 @@ std::vector<paddle::Tensor> GatherNextToken(
}
if (output_padding_offset) {
int r = baidu::xpu::api::plugin::eb_mtp_gather_next_token<XPUType, XPUType>(
int r = fastdeploy::plugin::eb_mtp_gather_next_token<XPUType, XPUType>(
ctx,
reinterpret_cast<const XPUType*>(x.data<data_t>()),
reinterpret_cast<XPUType*>(out.data<data_t>()),
@@ -99,9 +99,9 @@ std::vector<paddle::Tensor> GatherNextToken(
encoder_batch_map_vp,
decoder_batch_map_vp,
dim);
PD_CHECK(r == 0, "xpu::plugin::gather_next_token failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::gather_next_token failed.");
} else {
int r = baidu::xpu::api::plugin::eb_gather_next_token<XPUType, XPUType>(
int r = fastdeploy::plugin::eb_gather_next_token<XPUType, XPUType>(
ctx,
reinterpret_cast<const XPUType*>(x.data<data_t>()),
reinterpret_cast<XPUType*>(out.data<data_t>()),
@@ -109,7 +109,7 @@ std::vector<paddle::Tensor> GatherNextToken(
encoder_batch_map_vp,
decoder_batch_map_vp,
dim);
PD_CHECK(r == 0, "xpu::plugin::gather_next_token failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::gather_next_token failed.");
}
return {out};
}
@@ -40,19 +40,19 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
auto cu_seqlens_k =
paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
if (token_num_data > 0) {
int r = baidu::xpu::api::plugin::get_padding_offset(
xpu_ctx->x_context(),
batch_id_per_token.data<int>(),
cum_offsets_out.data<int>(),
cu_seqlens_q.data<int>(),
cu_seqlens_k.data<int>(),
x_remove_padding.data<int64_t>(),
input_ids.data<int64_t>(),
cum_offsets.data<int>(),
seq_len.data<int>(),
seq_length,
bsz);
PD_CHECK(r == 0, "baidu::xpu::api::plugin::get_padding_offset failed.");
int r =
fastdeploy::plugin::get_padding_offset(xpu_ctx->x_context(),
batch_id_per_token.data<int>(),
cum_offsets_out.data<int>(),
cu_seqlens_q.data<int>(),
cu_seqlens_k.data<int>(),
x_remove_padding.data<int64_t>(),
input_ids.data<int64_t>(),
cum_offsets.data<int>(),
seq_len.data<int>(),
seq_length,
bsz);
PD_CHECK(r == 0, "fastdeploy::plugin::get_padding_offset failed.");
}
return {x_remove_padding,
@@ -44,7 +44,7 @@ void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
case paddle::DataType::FLOAT16: {
using XPUType = typename XPUTypeTrait<float16>::Type;
typedef paddle::float16 data_t;
int r = baidu::xpu::api::plugin::token_penalty_multi_scores(
int r = fastdeploy::plugin::token_penalty_multi_scores(
xpu_ctx->x_context(),
pre_ids.data<int64_t>(),
reinterpret_cast<XPUType *>(
@@ -62,10 +62,11 @@ void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
length_id,
end_length,
length_bad_words);
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::token_penalty_multi_scores failed.");
} break;
case paddle::DataType::FLOAT32: {
int r = baidu::xpu::api::plugin::token_penalty_multi_scores(
int r = fastdeploy::plugin::token_penalty_multi_scores(
xpu_ctx->x_context(),
pre_ids.data<int64_t>(),
const_cast<float *>(logits.data<float>()),
@@ -82,7 +83,8 @@ void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
length_id,
end_length,
length_bad_words);
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::token_penalty_multi_scores failed.");
} break;
default:
PD_THROW(
@@ -34,7 +34,7 @@ void LimitThinkingContentLengthV1(const paddle::Tensor& next_tokens,
const int batch_size = next_tokens.shape()[0];
const int eos_token_id_len = eos_token_ids.shape()[0];
int r = baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v1(
int r = fastdeploy::plugin::limit_thinking_content_length_kernel_v1(
xpu_ctx->x_context(),
const_cast<int64_t*>(next_tokens.data<int64_t>()),
max_think_lens.data<int>(),
@@ -46,7 +46,7 @@ void LimitThinkingContentLengthV1(const paddle::Tensor& next_tokens,
batch_size,
eos_token_id_len);
PD_CHECK(r == 0,
"baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v1 "
"fastdeploy::plugin::limit_thinking_content_length_kernel_v1 "
"failed.");
}
@@ -33,7 +33,7 @@ void LimitThinkingContentLengthV2(const paddle::Tensor& next_tokens,
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
const int batch_size = next_tokens.shape()[0];
int r = baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v2(
int r = fastdeploy::plugin::limit_thinking_content_length_kernel_v2(
xpu_ctx->x_context(),
const_cast<int64_t*>(next_tokens.data<int64_t>()),
max_think_lens.data<int>(),
@@ -44,7 +44,7 @@ void LimitThinkingContentLengthV2(const paddle::Tensor& next_tokens,
line_break_id,
batch_size);
PD_CHECK(r == 0,
"baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v2 "
"fastdeploy::plugin::limit_thinking_content_length_kernel_v2 "
"failed.");
}
@@ -30,7 +30,7 @@ void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens,
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
int real_bsz = base_model_draft_tokens.shape()[0];
int base_model_draft_token_len = base_model_draft_tokens.shape()[1];
int r = baidu::xpu::api::plugin::draft_model_postprocess(
int r = fastdeploy::plugin::draft_model_postprocess(
xpu_ctx->x_context(),
const_cast<int64_t*>(base_model_draft_tokens.data<int64_t>()),
const_cast<int*>(base_model_seq_lens_this_time.data<int>()),
@@ -38,7 +38,7 @@ void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens,
const_cast<bool*>(base_model_stop_flags.data<bool>()),
real_bsz,
base_model_draft_token_len);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "");
PADDLE_ENFORCE_XDNN_SUCCESS(r, "draft_model_postprocess");
}
PD_BUILD_STATIC_OP(draft_model_postprocess)
@@ -64,7 +64,7 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens,
auto not_need_stop_gpu =
not_need_stop.copy_to(seq_lens_this_time.place(), false);
int r = baidu::xpu::api::plugin::draft_model_preprocess(
int r = fastdeploy::plugin::draft_model_preprocess(
ctx,
const_cast<int64_t*>(draft_tokens.data<int64_t>()),
const_cast<int64_t*>(input_ids.data<int64_t>()),
@@ -97,7 +97,7 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens,
splitwise_prefill,
kvcache_scheduler_v1);
PD_CHECK(r == 0, "xpu::plugin::draft_model_preprocess failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::draft_model_preprocess failed.");
auto not_need_stop_cpu =
not_need_stop_gpu.copy_to(not_need_stop.place(), false);
bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
@@ -63,7 +63,7 @@ void DraftModelUpdate(const paddle::Tensor& inter_next_tokens,
}
}
int r = baidu::xpu::api::plugin::draft_model_update(
int r = fastdeploy::plugin::draft_model_update(
ctx,
inter_next_tokens.data<int64_t>(),
const_cast<int64_t*>(draft_tokens.data<int64_t>()),
@@ -47,18 +47,19 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
auto output_token_num = paddle::empty(
{1}, seq_lens_this_time.dtype(), seq_lens_this_time.place());
int r = api::plugin::compute_order(ctx,
seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
base_model_seq_lens_this_time.data<int>(),
base_model_seq_lens_encoder.data<int>(),
accept_nums.data<int>(),
position_map.data<int>(),
output_token_num.data<int>(),
bsz,
actual_draft_token_num,
input_token_num);
PD_CHECK(r == 0, "xpu::plugin::compute_order failed.");
int r = fastdeploy::plugin::compute_order(
ctx,
seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
base_model_seq_lens_this_time.data<int>(),
base_model_seq_lens_encoder.data<int>(),
accept_nums.data<int>(),
position_map.data<int>(),
output_token_num.data<int>(),
bsz,
actual_draft_token_num,
input_token_num);
PD_CHECK(r == 0, "fastdeploy::plugin::compute_order failed.");
int output_token_num_cpu =
output_token_num.copy_to(paddle::CPUPlace(), false).data<int>()[0];
@@ -72,7 +73,7 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
case paddle::DataType::BFLOAT16:
using XPUTypeBF16 = typename XPUTypeTrait<bfloat16>::Type;
typedef paddle::bfloat16 bf16_data_t;
r = api::plugin::rebuild_hidden_states(
r = fastdeploy::plugin::rebuild_hidden_states(
ctx,
reinterpret_cast<const XPUTypeBF16*>(input.data<bf16_data_t>()),
position_map.data<int>(),
@@ -80,12 +81,12 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
dim_embed,
elem_cnt,
output_token_num_cpu);
PD_CHECK(r == 0, "xpu::plugin::rebuild_hidden_states failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::rebuild_hidden_states failed.");
return {out};
case paddle::DataType::FLOAT16:
using XPUTypeFP16 = typename XPUTypeTrait<float16>::Type;
typedef paddle::float16 fp16_data_t;
r = api::plugin::rebuild_hidden_states(
r = fastdeploy::plugin::rebuild_hidden_states(
ctx,
reinterpret_cast<const XPUTypeFP16*>(input.data<fp16_data_t>()),
position_map.data<int>(),
@@ -93,10 +94,10 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
dim_embed,
elem_cnt,
output_token_num_cpu);
PD_CHECK(r == 0, "xpu::plugin::rebuild_hidden_states failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::rebuild_hidden_states failed.");
return {out};
case paddle::DataType::FLOAT32:
r = api::plugin::rebuild_hidden_states(
r = fastdeploy::plugin::rebuild_hidden_states(
ctx,
reinterpret_cast<const float*>(input.data<float>()),
position_map.data<int>(),
@@ -104,7 +105,7 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
dim_embed,
elem_cnt,
output_token_num_cpu);
PD_CHECK(r == 0, "xpu::plugin::rebuild_hidden_states failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::rebuild_hidden_states failed.");
return {out};
default:
PD_THROW("Unsupported data type.");
@@ -43,7 +43,7 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
auto output_token_num = paddle::empty(
{1}, seq_lens_this_time.dtype(), seq_lens_this_time.place());
int r = api::plugin::compute_self_order(
int r = fastdeploy::plugin::compute_self_order(
ctx,
reinterpret_cast<const int*>(last_seq_lens_this_time.data<int>()),
reinterpret_cast<const int*>(seq_lens_this_time.data<int>()),
@@ -51,7 +51,7 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
reinterpret_cast<int*>(src_map.data<int>()),
reinterpret_cast<int*>(output_token_num.data<int>()),
bsz);
PD_CHECK(r == 0, "xpu::plugin::compute_self_order failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::compute_self_order failed.");
int output_token_num_cpu =
output_token_num.copy_to(paddle::CPUPlace(), false).data<int>()[0];
@@ -67,7 +67,7 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
case paddle::DataType::BFLOAT16:
using XPUTypeBF16 = typename XPUTypeTrait<bfloat16>::Type;
typedef paddle::bfloat16 bf16_data_t;
r = api::plugin::rebuild_self_hidden_states(
r = fastdeploy::plugin::rebuild_self_hidden_states(
ctx,
reinterpret_cast<const XPUTypeBF16*>(input.data<bf16_data_t>()),
src_map.data<int>(),
@@ -75,12 +75,13 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
input_token_num,
dim_embed,
elem_cnt);
PD_CHECK(r == 0, "xpu::plugin::rebuild_self_hidden_states failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::rebuild_self_hidden_states failed.");
return {out};
case paddle::DataType::FLOAT16:
using XPUTypeFP16 = typename XPUTypeTrait<float16>::Type;
typedef paddle::float16 fp16_data_t;
r = api::plugin::rebuild_self_hidden_states(
r = fastdeploy::plugin::rebuild_self_hidden_states(
ctx,
reinterpret_cast<const XPUTypeFP16*>(input.data<fp16_data_t>()),
src_map.data<int>(),
@@ -88,10 +89,11 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
input_token_num,
dim_embed,
elem_cnt);
PD_CHECK(r == 0, "xpu::plugin::rebuild_self_hidden_states failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::rebuild_self_hidden_states failed.");
return {out};
case paddle::DataType::FLOAT32:
r = api::plugin::rebuild_self_hidden_states(
r = fastdeploy::plugin::rebuild_self_hidden_states(
ctx,
reinterpret_cast<const float*>(input.data<float>()),
src_map.data<int>(),
@@ -99,7 +101,8 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
input_token_num,
dim_embed,
elem_cnt);
PD_CHECK(r == 0, "xpu::plugin::rebuild_self_hidden_states failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::rebuild_self_hidden_states failed.");
return {out};
default:
PD_THROW("Unsupported data type.");
@@ -46,7 +46,7 @@ void MTPStepPaddle(
const int bsz = seq_lens_this_time.shape()[0];
const int block_num_per_seq = block_tables.shape()[1];
int r = baidu::xpu::api::plugin::mtp_free_and_dispatch_block(
int r = fastdeploy::plugin::mtp_free_and_dispatch_block(
ctx,
const_cast<bool *>(base_model_stop_flags.data<bool>()),
const_cast<bool *>(stop_flags.data<bool>()),
@@ -27,7 +27,7 @@ void SpeculateClearAcceptNums(const paddle::Tensor& accept_num,
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
const int max_bsz = seq_lens_decoder.shape()[0];
int r = baidu::xpu::api::plugin::speculate_clear_accept_nums(
int r = fastdeploy::plugin::speculate_clear_accept_nums(
xpu_ctx->x_context(),
const_cast<int*>(accept_num.data<int>()),
seq_lens_decoder.data<int>(),
@@ -43,7 +43,7 @@ void SpeculateGetLogits(const paddle::Tensor& draft_logits,
const int vocab_size = logits.shape()[1];
const int real_bsz = seq_lens_this_time.shape()[0];
baidu::xpu::api::plugin::speculate_get_logits(
int r = fastdeploy::plugin::speculate_get_logits(
ctx,
const_cast<float*>(draft_logits.data<float>()),
const_cast<int*>(next_token_num.data<int>()),
@@ -56,6 +56,7 @@ void SpeculateGetLogits(const paddle::Tensor& draft_logits,
seq_lens_encoder.data<int>(),
real_bsz,
vocab_size);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "speculate_get_logits");
if (draft_logits.is_cpu()) {
delete ctx;
}
@@ -45,7 +45,7 @@ std::vector<paddle::Tensor> SpeculateGetOutputPaddingOffset(
auto output_cum_offsets =
output_cum_offsets_tmp.copy_to(output_cum_offsets_tmp.place(), false);
if (cpu_out_token_num.data<int64_t>()[0] > 0) {
int r = baidu::xpu::api::plugin::speculate_get_output_padding_offset(
int r = fastdeploy::plugin::speculate_get_output_padding_offset(
ctx,
output_padding_offset.mutable_data<int>(),
output_cum_offsets.mutable_data<int>(),
@@ -58,7 +58,7 @@ std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
PD_CHECK(seq_len.is_contiguous(), "Seq lens tensor must be contiguous");
if (token_num_data > 0) {
int r = baidu::xpu::api::plugin::speculate_get_padding_offset(
int r = fastdeploy::plugin::speculate_get_padding_offset(
xpu_ctx->x_context(),
batch_id_per_token.data<int>(),
cum_offsets_out.data<int>(),
@@ -70,7 +70,7 @@ std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
bsz);
PD_CHECK(r == 0, "XPU speculate_get_padding_offset failed");
r = baidu::xpu::api::plugin::speculate_remove_padding<int64_t>(
r = fastdeploy::plugin::speculate_remove_padding<int64_t>(
xpu_ctx->x_context(),
x_remove_padding.data<int64_t>(),
input_ids.data<int64_t>(),
@@ -38,7 +38,7 @@ std::vector<paddle::Tensor> SpeculateGetSeqLensOutput(
auto seq_lens_output = paddle::full(
{bsz}, 0, paddle::DataType::INT32, seq_lens_this_time.place());
int r = baidu::xpu::api::plugin::speculate_get_seq_lens_output(
int r = fastdeploy::plugin::speculate_get_seq_lens_output(
ctx,
seq_lens_output.data<int>(),
seq_lens_this_time.data<int>(),
@@ -46,7 +46,7 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
case paddle::DataType::BFLOAT16:
using XPUTypeBF16 = typename XPUTypeTrait<bfloat16>::Type;
typedef paddle::bfloat16 bf16_data_t;
r = api::plugin::speculate_rebuild_append_padding<XPUTypeBF16>(
r = fastdeploy::plugin::speculate_rebuild_append_padding<XPUTypeBF16>(
ctx,
const_cast<XPUTypeBF16*>(reinterpret_cast<const XPUTypeBF16*>(
full_hidden_states.data<bf16_data_t>())),
@@ -58,12 +58,13 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
dim_embed,
elem_nums,
reinterpret_cast<XPUTypeBF16*>(out.data<bf16_data_t>()));
PD_CHECK(r == 0, "xpu::plugin::speculate_rebuild_append_padding failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::speculate_rebuild_append_padding failed.");
return {out};
case paddle::DataType::FLOAT16:
using XPUTypeFP16 = typename XPUTypeTrait<float16>::Type;
typedef paddle::float16 fp16_data_t;
r = api::plugin::speculate_rebuild_append_padding<XPUTypeFP16>(
r = fastdeploy::plugin::speculate_rebuild_append_padding<XPUTypeFP16>(
ctx,
const_cast<XPUTypeFP16*>(reinterpret_cast<const XPUTypeFP16*>(
full_hidden_states.data<fp16_data_t>())),
@@ -75,10 +76,11 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
dim_embed,
elem_nums,
reinterpret_cast<XPUTypeFP16*>(out.data<fp16_data_t>()));
PD_CHECK(r == 0, "xpu::plugin::speculate_rebuild_append_padding failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::speculate_rebuild_append_padding failed.");
return {out};
case paddle::DataType::FLOAT32:
r = api::plugin::speculate_rebuild_append_padding<float>(
r = fastdeploy::plugin::speculate_rebuild_append_padding<float>(
ctx,
const_cast<float*>(full_hidden_states.data<float>()),
const_cast<int*>(cum_offsets.data<int>()),
@@ -89,7 +91,8 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
dim_embed,
elem_nums,
out.data<float>());
PD_CHECK(r == 0, "xpu::plugin::speculate_rebuild_append_padding failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::speculate_rebuild_append_padding failed.");
return {out};
default:
PD_THROW("Unsupported data type.");
@@ -62,7 +62,7 @@ void SpeculateScheduleCache(const paddle::Tensor &draft_tokens,
}
auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
int r = baidu::xpu::api::plugin::speculate_schedule_cache(
int r = fastdeploy::plugin::speculate_schedule_cache(
ctx,
draft_tokens.data<int64_t>(),
const_cast<int *>(block_tables.data<int>()),
@@ -87,7 +87,7 @@ void SpeculateScheduleCache(const paddle::Tensor &draft_tokens,
block_num_per_seq,
prefill_one_step_stop);
// kernel launch
PD_CHECK(r == 0, "speculate_free_and_reschedule failed.");
PD_CHECK(r == 0, "speculate_schedule_cache failed.");
auto not_need_stop_cpu =
not_need_stop_gpu.copy_to(not_need_stop.place(), true);
@@ -50,7 +50,7 @@ void SpecGetStopFlagsMultiSeqs(const paddle::Tensor &accept_tokens,
int pre_ids_len = pre_ids.shape()[1];
int accept_tokens_len = accept_tokens.shape()[1];
int r = baidu::xpu::api::plugin::speculate_set_stop_value_multi_seqs(
int r = fastdeploy::plugin::speculate_set_stop_value_multi_seqs(
ctx,
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
@@ -67,7 +67,8 @@ void SpecGetStopFlagsMultiSeqs(const paddle::Tensor &accept_tokens,
stop_seqs_bs,
stop_seqs_max_len,
pre_ids_len);
PD_CHECK(r == 0, "xpu::plugin::speculate_set_stop_value_multi_seqs failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::speculate_set_stop_value_multi_seqs failed.");
}
PD_BUILD_STATIC_OP(speculate_set_stop_value_multi_seqs)
@@ -42,7 +42,7 @@ void SpeculateSetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
int length = pre_ids_all_shape[1];
int max_draft_tokens = accept_tokens.shape()[1];
int r = baidu::xpu::api::plugin::speculate_set_value_by_flag_and_id(
int r = fastdeploy::plugin::speculate_set_value_by_flag_and_id(
ctx,
const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
accept_tokens.data<int64_t>(),
@@ -60,7 +60,7 @@ void SpeculateStepPaddleBase(
const int length = input_ids.shape()[1];
const int pre_id_length = pre_ids.shape()[1];
const int max_decoder_block_num = pre_id_length / block_size;
int r = baidu::xpu::api::plugin::speculate_free_and_dispatch_block(
int r = fastdeploy::plugin::speculate_free_and_dispatch_block(
ctx,
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
@@ -88,7 +88,7 @@ void SpeculateStepPaddleBase(
auto recover_lens_cpu = recover_lens.copy_to(paddle::CPUPlace(), false);
int recover_lens_cpu_data = recover_lens_cpu.data<int>()[0];
if (recover_lens_cpu_data > 0) {
r = baidu::xpu::api::plugin::speculate_recover_block(
r = fastdeploy::plugin::speculate_recover_block(
ctx,
const_cast<int *>(recover_block_list.data<int>()),
const_cast<int *>(recover_lens.data<int>()),
@@ -70,7 +70,7 @@ void SpeculateStepSchedule(
paddle::full({1}, 0, paddle::DataType::INT32, stop_flags.place());
auto step_bs_list =
paddle::full({bsz}, 0, paddle::DataType::INT32, stop_flags.place());
int r = baidu::xpu::api::plugin::speculate_free_and_reschedule(
int r = fastdeploy::plugin::speculate_free_and_reschedule(
ctx,
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
@@ -59,7 +59,7 @@ void SpeculateTokenPenaltyMultiScores(
case paddle::DataType::BFLOAT16: {
using XPUType = typename XPUTypeTrait<paddle::bfloat16>::Type;
typedef paddle::bfloat16 data_t;
int r = baidu::xpu::api::plugin::speculate_token_penalty_multi_scores(
int r = fastdeploy::plugin::speculate_token_penalty_multi_scores(
ctx,
pre_ids.data<int64_t>(),
reinterpret_cast<XPUType*>(
@@ -81,12 +81,13 @@ void SpeculateTokenPenaltyMultiScores(
length_bad_words,
token_num,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::token_penalty_multi_scores failed.");
} break;
case paddle::DataType::FLOAT16: {
using XPUType = typename XPUTypeTrait<float16>::Type;
typedef paddle::float16 data_t;
int r = baidu::xpu::api::plugin::speculate_token_penalty_multi_scores(
int r = fastdeploy::plugin::speculate_token_penalty_multi_scores(
ctx,
pre_ids.data<int64_t>(),
reinterpret_cast<XPUType*>(
@@ -108,10 +109,11 @@ void SpeculateTokenPenaltyMultiScores(
length_bad_words,
token_num,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::token_penalty_multi_scores failed.");
} break;
case paddle::DataType::FLOAT32: {
int r = baidu::xpu::api::plugin::speculate_token_penalty_multi_scores(
int r = fastdeploy::plugin::speculate_token_penalty_multi_scores(
ctx,
pre_ids.data<int64_t>(),
const_cast<float*>(logits.data<float>()),
@@ -132,7 +134,8 @@ void SpeculateTokenPenaltyMultiScores(
length_bad_words,
token_num,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::token_penalty_multi_scores failed.");
} break;
default:
PD_THROW(
@@ -46,7 +46,7 @@ void SpeculateUpdate(const paddle::Tensor &seq_lens_encoder,
}
auto not_need_stop_xpu = not_need_stop.copy_to(stop_flags.place(), false);
int r = baidu::xpu::api::plugin::speculate_update(
int r = fastdeploy::plugin::speculate_update(
ctx,
const_cast<int *>(seq_lens_encoder.data<int>()),
const_cast<int *>(seq_lens_decoder.data<int>()),
@@ -46,7 +46,7 @@ void SpeculateUpdateV3(const paddle::Tensor &seq_lens_encoder,
}
auto not_need_stop_xpu = not_need_stop.copy_to(stop_flags.place(), false);
int r = baidu::xpu::api::plugin::speculate_update_v3(
int r = fastdeploy::plugin::speculate_update_v3(
ctx,
const_cast<int *>(seq_lens_encoder.data<int>()),
const_cast<int *>(seq_lens_decoder.data<int>()),
@@ -107,7 +107,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
int ret;
if (use_topk) {
if (enable_topp) {
ret = baidu::xpu::api::plugin::speculate_verify<true, true>(
ret = fastdeploy::plugin::speculate_verify<true, true>(
ctx,
sampled_token_ids.data<int64_t>(),
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
@@ -140,7 +140,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
use_target_sampling);
PD_CHECK(ret == 0, "speculate_verify failed.");
} else {
ret = baidu::xpu::api::plugin::speculate_verify<false, true>(
ret = fastdeploy::plugin::speculate_verify<false, true>(
ctx,
sampled_token_ids.data<int64_t>(),
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
@@ -175,7 +175,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
PD_CHECK(ret == 0, "speculate_verify failed.");
} else {
if (enable_topp) {
ret = baidu::xpu::api::plugin::speculate_verify<true, false>(
ret = fastdeploy::plugin::speculate_verify<true, false>(
ctx,
sampled_token_ids.data<int64_t>(),
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
@@ -208,7 +208,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
use_target_sampling);
PD_CHECK(ret == 0, "speculate_verify failed.");
} else {
ret = baidu::xpu::api::plugin::speculate_verify<false, false>(
ret = fastdeploy::plugin::speculate_verify<false, false>(
ctx,
sampled_token_ids.data<int64_t>(),
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
@@ -71,9 +71,9 @@ std::vector<paddle::Tensor> TopPCandidates(
typedef paddle::bfloat16 bf16_data_t;
switch (candidates_len) {
FIXED_TOPK(
r = api::plugin::top_p_candidates<XPUTypeBF16,
TopKMaxLength,
kTopK>(
r = fastdeploy::plugin::top_p_candidates<XPUTypeBF16,
TopKMaxLength,
kTopK>(
ctx,
reinterpret_cast<const XPUTypeBF16*>(probs.data<bf16_data_t>()),
reinterpret_cast<const XPUTypeBF16*>(top_p.data<bf16_data_t>()),
@@ -86,7 +86,7 @@ std::vector<paddle::Tensor> TopPCandidates(
token_num,
candidates_len,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::top_p_candidates failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::top_p_candidates failed.");
return {verify_scores, verify_tokens, actual_candidate_lens});
}
case paddle::DataType::FLOAT16:
@@ -94,9 +94,9 @@ std::vector<paddle::Tensor> TopPCandidates(
typedef paddle::float16 fp16_data_t;
switch (candidates_len) {
FIXED_TOPK(
r = api::plugin::top_p_candidates<XPUTypeFP16,
TopKMaxLength,
kTopK>(
r = fastdeploy::plugin::top_p_candidates<XPUTypeFP16,
TopKMaxLength,
kTopK>(
ctx,
reinterpret_cast<const XPUTypeFP16*>(probs.data<fp16_data_t>()),
reinterpret_cast<const XPUTypeFP16*>(top_p.data<fp16_data_t>()),
@@ -109,25 +109,26 @@ std::vector<paddle::Tensor> TopPCandidates(
token_num,
candidates_len,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::top_p_candidates failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::top_p_candidates failed.");
return {verify_scores, verify_tokens, actual_candidate_lens});
}
case paddle::DataType::FLOAT32:
switch (candidates_len) {
FIXED_TOPK(
r = api::plugin::top_p_candidates<float, TopKMaxLength, kTopK>(
ctx,
probs.data<float>(),
top_p.data<float>(),
output_padding_offset.data<int>(),
verify_tokens.data<int64_t>(),
verify_scores.data<float>(),
actual_candidate_lens.data<int>(),
vocab_size,
token_num,
candidates_len,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::top_p_candidates failed.");
r = fastdeploy::plugin::
top_p_candidates<float, TopKMaxLength, kTopK>(
ctx,
probs.data<float>(),
top_p.data<float>(),
output_padding_offset.data<int>(),
verify_tokens.data<int64_t>(),
verify_scores.data<float>(),
actual_candidate_lens.data<int>(),
vocab_size,
token_num,
candidates_len,
max_seq_len);
PD_CHECK(r == 0, "fastdeploy::plugin::top_p_candidates failed.");
return {verify_scores, verify_tokens, actual_candidate_lens});
}
default:
@@ -53,7 +53,7 @@ std::vector<paddle::Tensor> UpdateAttnMaskOffsets(
paddle::DataType::INT32,
ids_remove_padding.place());
baidu::xpu::api::plugin::update_attn_mask_offsets(
int r = fastdeploy::plugin::update_attn_mask_offsets(
ctx,
attn_mask_offsets.data<int>(),
seq_lens_this_time.data<int>(),
@@ -68,6 +68,7 @@ std::vector<paddle::Tensor> UpdateAttnMaskOffsets(
real_bsz,
max_model_len,
decode_states_len);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "update_attn_mask_offsets");
if (ids_remove_padding.is_cpu()) {
delete ctx;
@@ -66,7 +66,7 @@ std::vector<paddle::Tensor> RecoverBatchSequenceKernel(
paddle::Tensor out;
out = paddle::empty({token_num, dim}, x.type(), x.place());
int r = baidu::xpu::api::plugin::eb_recover_batch_sequence<XPUType, XPUType>(
int r = fastdeploy::plugin::eb_recover_batch_sequence<XPUType, XPUType>(
ctx,
reinterpret_cast<const XPUType*>(x.data<data_t>()),
reinterpret_cast<XPUType*>(out.data<data_t>()),
@@ -75,7 +75,7 @@ std::vector<paddle::Tensor> RecoverBatchSequenceKernel(
encoder_batch_map_vp,
decoder_batch_map_vp,
dim);
PD_CHECK(r == 0, "xpu::plugin::eb_recover_batch_sequence failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::eb_recover_batch_sequence failed.");
return {out};
}
@@ -42,7 +42,7 @@ void RecoverDecodeTask(
int r = 0;
if (draft_tokens) {
const int draft_tokens_len = draft_tokens.get_ptr()->shape()[1];
r = baidu::xpu::api::plugin::recover_spec_decode_task(
r = fastdeploy::plugin::recover_spec_decode_task(
xpu_ctx->x_context(),
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
@@ -60,7 +60,7 @@ void RecoverDecodeTask(
draft_tokens_len,
max_draft_tokens * 2 + 1);
} else {
r = baidu::xpu::api::plugin::recover_decode_task(
r = fastdeploy::plugin::recover_decode_task(
xpu_ctx->x_context(),
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
@@ -73,7 +73,7 @@ void RecoverDecodeTask(
block_num_per_seq,
block_size);
}
PD_CHECK(r == 0, "baidu::xpu::api::plugin::recover_decode_task failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::recover_decode_task failed.");
}
PD_BUILD_STATIC_OP(recover_decode_task)
@@ -30,7 +30,7 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
int bs = seq_lens_this_time.shape()[0];
int length = pre_ids_all.shape()[1];
int length_input_ids = input_ids.shape()[1];
int r = baidu::xpu::api::plugin::set_value_by_flags_and_idx(
int r = fastdeploy::plugin::set_value_by_flags_and_idx(
xpu_ctx->x_context(),
stop_flags.data<bool>(),
const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
@@ -41,7 +41,7 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
bs,
length,
length_input_ids);
PD_CHECK(r == 0, "xpu::plugin::set_value_by_flags_and_idx failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::set_value_by_flags_and_idx failed.");
}
PD_BUILD_OP(set_value_by_flags_and_idx)
@@ -41,26 +41,25 @@ void SpeculateLimitThinkingContentLength(const paddle::Tensor& next_tokens,
const int eos_token_id_len = eos_token_ids.shape()[0];
const int inject_len = inject_token_ids.shape()[0];
int r =
baidu::xpu::api::plugin::speculate_limit_thinking_content_length_kernel(
xpu_ctx->x_context(),
const_cast<int64_t*>(next_tokens.data<int64_t>()),
max_think_lens.data<int>(),
const_cast<int*>(max_reply_lens.data<int>()),
const_cast<int64_t*>(step_idx.data<int64_t>()),
eos_token_ids.data<int64_t>(),
const_cast<int*>(limit_status.data<int>()),
const_cast<int*>(accept_num.data<int>()),
stop_flags.data<bool>(),
think_end_id,
(inject_len > 0) ? inject_token_ids.data<int64_t>() : nullptr,
tokens_per_step,
batch_size,
eos_token_id_len,
inject_len,
splitwise_role_is_decode);
int r = fastdeploy::plugin::speculate_limit_thinking_content_length_kernel(
xpu_ctx->x_context(),
const_cast<int64_t*>(next_tokens.data<int64_t>()),
max_think_lens.data<int>(),
const_cast<int*>(max_reply_lens.data<int>()),
const_cast<int64_t*>(step_idx.data<int64_t>()),
eos_token_ids.data<int64_t>(),
const_cast<int*>(limit_status.data<int>()),
const_cast<int*>(accept_num.data<int>()),
stop_flags.data<bool>(),
think_end_id,
(inject_len > 0) ? inject_token_ids.data<int64_t>() : nullptr,
tokens_per_step,
batch_size,
eos_token_id_len,
inject_len,
splitwise_role_is_decode);
PD_CHECK(r == 0,
"baidu::xpu::api::plugin::"
"fastdeploy::plugin::"
"speculate_limit_thinking_content_length_kernel failed.");
}
@@ -55,7 +55,7 @@ void StepPaddle(const paddle::Tensor &stop_flags,
const int length = input_ids.shape()[1];
const int pre_id_length = pre_ids.shape()[1];
const int max_decoder_block_num = pre_id_length / block_size;
int r = baidu::xpu::api::plugin::free_and_dispatch_block(
int r = fastdeploy::plugin::free_and_dispatch_block(
xpu_ctx->x_context(),
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
@@ -81,7 +81,7 @@ void StepPaddle(const paddle::Tensor &stop_flags,
auto recover_lens_cpu = recover_lens.copy_to(paddle::CPUPlace(), false);
int recover_lens_cpu_data = recover_lens_cpu.data<int>()[0];
if (recover_lens_cpu_data > 0) {
r = baidu::xpu::api::plugin::recover_block(
r = fastdeploy::plugin::recover_block(
xpu_ctx->x_context(),
const_cast<int *>(recover_block_list.data<int>()),
const_cast<int *>(recover_lens.data<int>()),
@@ -39,7 +39,7 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
std::vector<int64_t> shape = topk_ids.shape();
int64_t bs_now = shape[0];
int64_t end_length = end_ids.shape()[0];
int r = baidu::xpu::api::plugin::set_stop_value_multi_ends<int64_t>(
int r = fastdeploy::plugin::set_stop_value_multi_ends<int64_t>(
xpu_ctx->x_context(),
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int64_t *>(topk_ids.data<int64_t>()),
@@ -49,7 +49,7 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
bs_now,
end_length,
beam_search);
PD_CHECK(r == 0, "xpu::plugin::set_stop_value_multi_ends failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::set_stop_value_multi_ends failed.");
}
PD_BUILD_OP(set_stop_value_multi_ends)
@@ -42,7 +42,7 @@ std::vector<paddle::Tensor> TextImageGatherScatter(
case paddle::DataType::BFLOAT16: {
using XPUType = typename XPUTypeTrait<bfloat16>::Type;
typedef paddle::bfloat16 data_t;
int r = baidu::xpu::api::plugin::text_image_gather_scatter<XPUType>(
int r = fastdeploy::plugin::text_image_gather_scatter<XPUType>(
xpu_ctx->x_context(),
reinterpret_cast<XPUType*>(input.data<data_t>()),
reinterpret_cast<XPUType*>(text_input.data<data_t>()),
@@ -28,7 +28,7 @@ void TextImageIndexOut(const paddle::Tensor& token_type_ids,
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
const int64_t token_num = token_type_ids.shape()[0];
int r = baidu::xpu::api::plugin::text_image_index_out(
int r = fastdeploy::plugin::text_image_index_out(
xpu_ctx->x_context(),
token_type_ids.data<int32_t>(),
const_cast<int32_t*>(text_index.data<int32_t>()),
@@ -39,7 +39,7 @@ void UpdateInputs(const paddle::Tensor& stop_flags,
const int input_ids_stride = input_ids.shape()[1];
auto not_need_stop_xpu = not_need_stop.copy_to(stop_flags.place(), false);
int r = baidu::xpu::api::plugin::update_inputs(
int r = fastdeploy::plugin::update_inputs(
xpu_ctx->x_context(),
const_cast<bool*>(not_need_stop_xpu.data<bool>()),
const_cast<int*>(seq_lens_this_time.data<int>()),
@@ -52,7 +52,7 @@ void UpdateInputs(const paddle::Tensor& stop_flags,
now_bsz,
max_bsz,
input_ids_stride);
PD_CHECK(r == 0, "baidu::xpu::api::plugin::update_inputs failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::update_inputs failed.");
auto not_need_stop_cpu =
not_need_stop_xpu.copy_to(not_need_stop.place(), false);
bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
@@ -44,7 +44,7 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags,
const int input_ids_stride = input_ids.shape()[1];
const int block_num_per_seq = block_tables.shape()[1];
auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
int r = baidu::xpu::api::plugin::update_inputs_v1(
int r = fastdeploy::plugin::update_inputs_v1(
xpu_ctx->x_context(),
const_cast<bool*>(not_need_stop_gpu.data<bool>()),
const_cast<int*>(seq_lens_this_time.data<int>()),
@@ -63,7 +63,7 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags,
input_ids_stride,
block_num_per_seq,
block_size);
PD_CHECK(r == 0, "baidu::xpu::api::plugin::update_inputs_kernel_v1 failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::update_inputs_kernel_v1 failed.");
auto not_need_stop_cpu =
not_need_stop_gpu.copy_to(not_need_stop.place(), false);
bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
@@ -34,15 +34,14 @@ std::vector<paddle::Tensor> WeightQuantizeKernel(const paddle::Tensor &x,
if (algo == "weight_only_int8") {
paddle::Tensor out =
paddle::full({k, n}, 0, paddle::DataType::INT8, x.place());
int ret =
baidu::xpu::api::plugin::quant2d_per_channel<XPUType, float, int8_t>(
xpu_ctx->x_context(),
reinterpret_cast<const XPUType *>(x.template data<T>()),
nullptr,
out.data<int8_t>(),
scale.data<float>(),
k,
n);
int ret = fastdeploy::plugin::quant2d_per_channel<XPUType, float, int8_t>(
xpu_ctx->x_context(),
reinterpret_cast<const XPUType *>(x.template data<T>()),
nullptr,
out.data<int8_t>(),
scale.data<float>(),
k,
n);
PD_CHECK(ret == 0);
return {out, scale};
} else if (algo == "weight_only_int4") {
@@ -18,13 +18,17 @@
#pragma once
#include "xpu/xdnn.h"
namespace baidu {
namespace xpu {
namespace api {
namespace fd_xpu3 {
typedef xpu3::int64_t int64_t;
}
namespace fastdeploy {
namespace plugin {
namespace api = baidu::xpu::api;
template <typename T>
DLL_EXPORT int set_stop_value_multi_ends(Context* ctx,
DLL_EXPORT int set_stop_value_multi_ends(api::Context* ctx,
bool* stop_flags,
T* topk_ids,
T* next_tokens,
@@ -34,7 +38,7 @@ DLL_EXPORT int set_stop_value_multi_ends(Context* ctx,
const int end_length,
const bool beam_search);
DLL_EXPORT int set_value_by_flags_and_idx(Context* ctx,
DLL_EXPORT int set_value_by_flags_and_idx(api::Context* ctx,
const bool* stop_flags,
int64_t* pre_ids_all,
const int64_t* input_ids,
@@ -46,7 +50,7 @@ DLL_EXPORT int set_value_by_flags_and_idx(Context* ctx,
int length_input_ids);
template <typename T>
DLL_EXPORT int token_penalty_multi_scores(Context* ctx,
DLL_EXPORT int token_penalty_multi_scores(api::Context* ctx,
const int64_t* pre_ids,
T* logits,
const T* penalty_scores,
@@ -63,7 +67,7 @@ DLL_EXPORT int token_penalty_multi_scores(Context* ctx,
const int64_t end_length,
const int64_t length_bad_words);
DLL_EXPORT int get_padding_offset(Context* ctx,
DLL_EXPORT int get_padding_offset(api::Context* ctx,
int* padding_offset,
int* cum_offsets_out,
int* cu_seqlens_q,
@@ -75,7 +79,7 @@ DLL_EXPORT int get_padding_offset(Context* ctx,
const int max_seq_len,
const int bs);
DLL_EXPORT int speculate_get_padding_offset(Context* ctx,
DLL_EXPORT int speculate_get_padding_offset(api::Context* ctx,
int* batch_id_per_token,
int* cum_offsets_out,
int* cu_seqlens_q,
@@ -117,7 +121,7 @@ DLL_EXPORT int draft_model_preprocess(api::Context* ctx,
const bool splitwise_prefill,
const bool kvcache_scheduler_v1);
DLL_EXPORT int update_inputs(Context* ctx,
DLL_EXPORT int update_inputs(api::Context* ctx,
bool* not_need_stop,
int* seq_lens_this_time,
int* seq_lens_encoder,
@@ -130,7 +134,7 @@ DLL_EXPORT int update_inputs(Context* ctx,
const int max_bsz,
const int input_ids_stride);
DLL_EXPORT int free_and_dispatch_block(Context* ctx,
DLL_EXPORT int free_and_dispatch_block(api::Context* ctx,
bool* stop_flags,
int* seq_lens_this_time,
int* seq_lens_decoder,
@@ -153,7 +157,7 @@ DLL_EXPORT int free_and_dispatch_block(Context* ctx,
const int max_decoder_block_num);
DLL_EXPORT int speculate_free_and_dispatch_block(
Context* ctx,
api::Context* ctx,
bool* stop_flags,
int* seq_lens_this_time,
int* seq_lens_decoder,
@@ -177,7 +181,7 @@ DLL_EXPORT int speculate_free_and_dispatch_block(
const int max_decoder_block_num,
const int max_draft_tokens);
DLL_EXPORT int recover_block(Context* ctx,
DLL_EXPORT int recover_block(api::Context* ctx,
int* recover_block_list, // [bsz]
int* recover_len,
bool* stop_flags,
@@ -200,7 +204,7 @@ DLL_EXPORT int recover_block(Context* ctx,
const int length,
const int pre_id_length);
DLL_EXPORT int speculate_recover_block(Context* ctx,
DLL_EXPORT int speculate_recover_block(api::Context* ctx,
int* recover_block_list, // [bsz]
int* recover_len,
bool* stop_flags,
@@ -224,7 +228,7 @@ DLL_EXPORT int speculate_recover_block(Context* ctx,
const int length,
const int pre_id_length);
DLL_EXPORT int recover_decode_task(Context* ctx,
DLL_EXPORT int recover_decode_task(api::Context* ctx,
bool* stop_flags,
int* seq_lens_this_time,
int* seq_lens_encoder,
@@ -236,7 +240,7 @@ DLL_EXPORT int recover_decode_task(Context* ctx,
const int block_num_per_seq,
const int block_size);
DLL_EXPORT int recover_spec_decode_task(Context* ctx,
DLL_EXPORT int recover_spec_decode_task(api::Context* ctx,
bool* stop_flags,
int* seq_lens_this_time,
int* seq_lens_encoder,
@@ -253,7 +257,7 @@ DLL_EXPORT int recover_spec_decode_task(Context* ctx,
const int draft_tokens_len,
const int num_extra_tokens);
DLL_EXPORT int update_inputs_v1(Context* ctx,
DLL_EXPORT int update_inputs_v1(api::Context* ctx,
bool* not_need_stop,
int* seq_lens_this_time,
int* seq_lens_encoder,
@@ -274,45 +278,45 @@ DLL_EXPORT int update_inputs_v1(Context* ctx,
template <typename TX, typename TY>
DLL_EXPORT int eb_adjust_batch(
Context* ctx,
api::Context* ctx,
const TX* x,
TY* y,
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
VectorParam<int32_t>& encoder_batch_map, // NOLINT
VectorParam<int32_t>& decoder_batch_map, // NOLINT
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
int64_t hidden_dim);
template <typename TX, typename TY>
DLL_EXPORT int eb_gather_next_token(
Context* ctx,
api::Context* ctx,
const TX* x,
TY* y,
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
VectorParam<int32_t>& encoder_batch_map, // NOLINT
VectorParam<int32_t>& decoder_batch_map, // NOLINT
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
int64_t hidden_dim);
template <typename TX, typename TY>
DLL_EXPORT int eb_mtp_gather_next_token(
Context* ctx,
api::Context* ctx,
const TX* x,
TY* y,
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
VectorParam<int32_t>& encoder_batch_map, // NOLINT
VectorParam<int32_t>& decoder_batch_map, // NOLINT
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
int64_t hidden_dim);
template <typename TX, typename TY>
DLL_EXPORT int eb_recover_batch_sequence(
Context* ctx,
api::Context* ctx,
const TX* x,
TY* y,
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
VectorParam<int32_t>& encoder_batch_map, // NOLINT
VectorParam<int32_t>& decoder_batch_map, // NOLINT
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
int64_t hidden_dim);
template <typename TX, typename TSCALE = float, typename TY = int8_t>
@@ -324,7 +328,7 @@ DLL_EXPORT int quant2d_per_channel(api::Context* ctx,
int64_t m,
int64_t n);
DLL_EXPORT int text_image_index_out(Context* ctx,
DLL_EXPORT int text_image_index_out(api::Context* ctx,
const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
@@ -372,7 +376,7 @@ DLL_EXPORT int limit_thinking_content_length_kernel_v2(
template <typename T>
DLL_EXPORT int speculate_token_penalty_multi_scores(
Context* ctx,
api::Context* ctx,
const int64_t* pre_ids,
T* logits,
const T* penalty_scores,
@@ -392,7 +396,7 @@ DLL_EXPORT int speculate_token_penalty_multi_scores(
const int64_t length_bad_words,
const int64_t token_num,
const int64_t max_seq_len);
DLL_EXPORT int mtp_free_and_dispatch_block(Context* ctx,
DLL_EXPORT int mtp_free_and_dispatch_block(api::Context* ctx,
bool* base_model_stop_flags,
bool* stop_flags,
bool* batch_drop,
@@ -409,7 +413,7 @@ DLL_EXPORT int mtp_free_and_dispatch_block(Context* ctx,
const int max_draft_tokens);
template <bool ENABLE_TOPP, bool USE_TOPK>
DLL_EXPORT int speculate_verify(Context* ctx,
DLL_EXPORT int speculate_verify(api::Context* ctx,
const int64_t* sampled_token_ids,
int64_t* accept_tokens,
int* accept_num,
@@ -440,19 +444,19 @@ DLL_EXPORT int speculate_verify(Context* ctx,
const bool accept_all_drafts,
const bool use_target_sampling);
DLL_EXPORT int speculate_clear_accept_nums(Context* ctx,
DLL_EXPORT int speculate_clear_accept_nums(api::Context* ctx,
int* accept_num,
const int* seq_lens_decoder,
const int max_bsz);
DLL_EXPORT int speculate_get_seq_lens_output(Context* ctx,
DLL_EXPORT int speculate_get_seq_lens_output(api::Context* ctx,
int* seq_lens_output,
const int* seq_lens_this_time,
const int* seq_lens_encoder,
const int* seq_lens_decoder,
const int real_bsz);
DLL_EXPORT int draft_model_update(Context* ctx,
DLL_EXPORT int draft_model_update(api::Context* ctx,
const int64_t* inter_next_tokens,
int64_t* draft_tokens,
int64_t* pre_ids,
@@ -475,7 +479,7 @@ DLL_EXPORT int draft_model_update(Context* ctx,
const int substep,
const bool prefill_one_step_stop);
DLL_EXPORT int speculate_set_stop_value_multi_seqs(Context* ctx,
DLL_EXPORT int speculate_set_stop_value_multi_seqs(api::Context* ctx,
bool* stop_flags,
int64_t* accept_tokens,
int* accept_nums,
@@ -504,7 +508,7 @@ DLL_EXPORT int speculate_rebuild_append_padding(api::Context* ctx,
T* out);
template <typename T>
DLL_EXPORT int speculate_remove_padding(Context* ctx,
DLL_EXPORT int speculate_remove_padding(api::Context* ctx,
T* x_remove_padding,
const T* input_ids,
const T* draft_tokens,
@@ -536,7 +540,7 @@ DLL_EXPORT int compute_order(api::Context* ctx,
const int actual_draft_token_num,
const int input_token_num);
DLL_EXPORT int draft_model_postprocess(Context* ctx,
DLL_EXPORT int draft_model_postprocess(api::Context* ctx,
const int64_t* base_model_draft_tokens,
int* base_model_seq_lens_this_time,
const int* base_model_seq_lens_encoder,
@@ -544,7 +548,7 @@ DLL_EXPORT int draft_model_postprocess(Context* ctx,
int bsz,
int base_model_draft_token_len);
DLL_EXPORT int speculate_set_value_by_flag_and_id(Context* ctx,
DLL_EXPORT int speculate_set_value_by_flag_and_id(api::Context* ctx,
int64_t* pre_ids_all,
const int64_t* accept_tokens,
int* accept_num,
@@ -557,7 +561,7 @@ DLL_EXPORT int speculate_set_value_by_flag_and_id(Context* ctx,
int max_draft_tokens);
DLL_EXPORT int speculate_get_output_padding_offset(
Context* ctx,
api::Context* ctx,
int* output_padding_offset,
int* output_cum_offsets,
const int* output_cum_offsets_tmp,
@@ -578,7 +582,7 @@ DLL_EXPORT int top_p_candidates(api::Context* ctx,
int max_cadidate_len,
int max_seq_len);
DLL_EXPORT int speculate_free_and_reschedule(Context* ctx,
DLL_EXPORT int speculate_free_and_reschedule(api::Context* ctx,
bool* stop_flags,
int* seq_lens_this_time,
int* seq_lens_decoder,
@@ -601,7 +605,7 @@ DLL_EXPORT int speculate_free_and_reschedule(Context* ctx,
const int max_decoder_block_num,
const int max_draft_tokens);
DLL_EXPORT int speculate_schedule_cache(Context* ctx,
DLL_EXPORT int speculate_schedule_cache(api::Context* ctx,
const int64_t* draft_tokens,
int* block_tables,
bool* stop_flags,
@@ -625,7 +629,7 @@ DLL_EXPORT int speculate_schedule_cache(Context* ctx,
const int block_num_per_seq,
const bool prefill_one_step_stop);
DLL_EXPORT int speculate_update_v3(Context* ctx,
DLL_EXPORT int speculate_update_v3(api::Context* ctx,
int* seq_lens_encoder,
int* seq_lens_decoder,
bool* not_need_stop,
@@ -641,7 +645,7 @@ DLL_EXPORT int speculate_update_v3(Context* ctx,
const int max_bsz,
const int max_draft_tokens);
DLL_EXPORT int speculate_update(Context* ctx,
DLL_EXPORT int speculate_update(api::Context* ctx,
int* seq_lens_encoder,
int* seq_lens_decoder,
bool* not_need_stop,
@@ -674,7 +678,7 @@ DLL_EXPORT int rebuild_self_hidden_states(api::Context* ctx,
int dim_embed,
int elem_cnt);
DLL_EXPORT int speculate_get_logits(Context* ctx,
DLL_EXPORT int speculate_get_logits(api::Context* ctx,
float* draft_logits,
int* next_token_num,
int* batch_token_num,
@@ -687,7 +691,7 @@ DLL_EXPORT int speculate_get_logits(Context* ctx,
const int real_bsz,
const int vocab_size);
DLL_EXPORT int update_attn_mask_offsets(Context* ctx,
DLL_EXPORT int update_attn_mask_offsets(api::Context* ctx,
int* attn_mask_offsets,
const int* seq_lens_this_time,
const int* seq_lens_encoder,
@@ -723,6 +727,4 @@ DLL_EXPORT int speculate_limit_thinking_content_length_kernel(
* --------------------------------------------*/
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
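
Taken together, the new layout separates the two layers cleanly: the header
above declares wrapper entry points in fastdeploy::plugin (reaching vendor
types via the `api` alias), while the kernel files below collapse the old
nested xpu3::plugin into the single fd_xpu3 namespace. A plain-C++ toy of
that layering (stand-in names and host code only; the real fd_xpu3 layer is
XPU device code):

#include <cstdint>
#include <cstdio>

namespace baidu { namespace xpu { namespace api {
struct Context {};  // stand-in for the vendor context
}}}  // namespace baidu::xpu::api

namespace fd_xpu3 {
// Stand-in for a device kernel; the real ones are __global__ XPU kernels.
inline void scale_kernel(int64_t* data, int64_t n, int64_t factor) {
  for (int64_t i = 0; i < n; ++i) data[i] *= factor;
}
}  // namespace fd_xpu3

namespace fastdeploy {
namespace plugin {
namespace api = baidu::xpu::api;

// Wrapper layer: argument checks here, dispatch into fd_xpu3 below it.
inline int scale(api::Context* ctx, int64_t* data, int64_t n, int64_t factor) {
  if (ctx == nullptr || data == nullptr || n < 0) return -1;
  fd_xpu3::scale_kernel(data, n, factor);
  return 0;
}
}  // namespace plugin
}  // namespace fastdeploy

int main() {
  baidu::xpu::api::Context ctx;
  int64_t v[3] = {1, 2, 3};
  int r = fastdeploy::plugin::scale(&ctx, v, 3, 10);
  std::printf("r=%d v0=%lld\n", r, static_cast<long long>(v[0]));
  return r;
}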
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
inline __device__ void update_bad_words_logit(_global_ptr_ T* logits) {
@@ -54,5 +53,4 @@ __global__ void ban_bad_words(T* logits,
_XPU_DEF__BAN_BAD_WORDS_(float);
_XPU_DEF__BAN_BAD_WORDS_(float16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -1,8 +1,7 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_debug.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
#define MAX_LM_SIZE 28672
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
@@ -134,5 +133,4 @@ _XPU_DEF__EB_ADJUST_BATCH_(bfloat16, float);
_XPU_DEF__EB_ADJUST_BATCH_(float, bfloat16);
_XPU_DEF__EB_ADJUST_BATCH_(int32_t, int32_t);
_XPU_DEF__EB_ADJUST_BATCH_(int64_t, int64_t);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -1,8 +1,7 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_debug.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
#define MAX_LM_SIZE 28672
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
@@ -98,5 +97,4 @@ _XPU_DEF__EB_GATHER_NEXT_TOKEN(bfloat16, float16);
_XPU_DEF__EB_GATHER_NEXT_TOKEN(float16, bfloat16);
_XPU_DEF__EB_GATHER_NEXT_TOKEN(bfloat16, float);
_XPU_DEF__EB_GATHER_NEXT_TOKEN(float, bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
int ret;
@@ -322,5 +321,4 @@ __global__ void free_and_dispatch_block(bool *stop_flags,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void get_padding_offset(int *batch_id_per_token,
int *cum_offsets_out,
@@ -49,5 +48,4 @@ __global__ void get_padding_offset(int *batch_id_per_token,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -7,8 +7,7 @@
#include "xpu/kernel/xtdk.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
static inline __device__ bool is_in_end(const T id,
@@ -94,5 +93,4 @@ __global__ void limit_thinking_content_length_kernel_v1(
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -7,8 +7,7 @@
#include "xpu/kernel/xtdk.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void limit_thinking_content_length_kernel_v2(
int64_t* next_tokens,
@@ -89,5 +88,4 @@ __global__ void limit_thinking_content_length_kernel_v2(
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void min_length_logits_process(T* logits,
@@ -64,5 +63,4 @@ __global__ void min_length_logits_process(T* logits,
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float);
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void ComputeOrderKernel(const int* seq_lens_this_time,
const int* seq_lens_encoder,
@@ -112,5 +111,4 @@ __global__ void ComputeOrderKernel(const int* seq_lens_this_time,
LM2GM(&out_offset, output_token_num, sizeof(int));
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void ComputeSelfOrderKernel(const int* last_seq_lens_this_time,
const int* seq_lens_this_time,
@@ -69,5 +68,4 @@ __global__ void ComputeSelfOrderKernel(const int* last_seq_lens_this_time,
LM2GM(&out_offset, output_token_num, sizeof(int));
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -4,8 +4,7 @@
#include "xpu/kernel/cluster_primitive_template.h"
#include "xpu/kernel/cluster_simd.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static inline __device__ int v_reduce(int32x16_t v) {
auto v0 = vsrlp_int32x16(256, v);
@@ -185,5 +184,4 @@ __global__ void draft_model_postprocess(const int64_t* base_model_draft_tokens,
sync_cluster();
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -4,8 +4,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_simd.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void draft_model_preprocess(int64_t* draft_tokens,
int64_t* input_ids,
bool* stop_flags,
@@ -235,5 +234,4 @@ __global__ void draft_model_preprocess(int64_t* draft_tokens,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
inline __device__ bool is_in_end(const int64_t id,
const __global_ptr__ int64_t* end_ids,
int length) {
@@ -108,5 +107,4 @@ __global__ void draft_model_update(const int64_t* inter_next_tokens,
}
mfence();
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -1,8 +1,7 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_debug.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
#define MAX_LM_SIZE 28672
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
@@ -125,5 +124,5 @@ _XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(bfloat16, float16);
_XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(float16, bfloat16);
_XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(bfloat16, float);
_XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(float, bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -1,8 +1,7 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_debug.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
#define MAX_LM_SIZE 28672
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
@@ -125,5 +124,5 @@ _XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(bfloat16, float16);
_XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(float16, bfloat16);
_XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(bfloat16, float);
_XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(float, bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
int ret;
@@ -205,5 +204,4 @@ __global__ void mtp_free_and_dispatch_block(bool *base_model_stop_flags,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void RebuildAppendPaddingKernel(const T *full_hidden_states,
@@ -86,5 +85,4 @@ _XPU_DEF_REBUILD_APPEND_PADDING_KERNEL(bfloat16);
_XPU_DEF_REBUILD_APPEND_PADDING_KERNEL(float16);
_XPU_DEF_REBUILD_APPEND_PADDING_KERNEL(float);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void rebuildHiddenStatesKernel(const T* input,
@@ -61,5 +60,4 @@ _XPU_DEF_REBUILD_HIDDEN_STATES_KERNEL(bfloat16);
_XPU_DEF_REBUILD_HIDDEN_STATES_KERNEL(float);
_XPU_DEF_REBUILD_HIDDEN_STATES_KERNEL(float16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void rebuildSelfHiddenStatesKernel(
@@ -52,5 +51,4 @@ _XPU_DEF_REBUILD_SELF_HIDDEN_STATES_KERNEL(bfloat16);
_XPU_DEF_REBUILD_SELF_HIDDEN_STATES_KERNEL(float);
_XPU_DEF_REBUILD_SELF_HIDDEN_STATES_KERNEL(float16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void recover_spec_decode_task(
bool *stop_flags,
@@ -71,5 +70,4 @@ __attribute__((global)) void recover_spec_decode_task(
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
inline __device__ void update_bad_words_logit(_global_ptr_ T* logits) {
@@ -74,5 +73,4 @@ _XPU_DEF__BAN_BAD_WORDS_(float);
_XPU_DEF__BAN_BAD_WORDS_(float16);
_XPU_DEF__BAN_BAD_WORDS_(bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void speculate_clear_accept_nums(int* accept_num,
const int* seq_lens_decoder,
@@ -40,5 +39,4 @@ __global__ void speculate_clear_accept_nums(int* accept_num,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
int ret;
@@ -333,5 +332,4 @@ __global__ void speculate_free_and_dispatch_block(
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
int ret;
@@ -284,5 +283,4 @@ __global__ void speculate_free_and_reschedule(bool *stop_flags,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__device__ void prefix_sum(__shared_ptr__ int* sm_seq_lens_encoder,
__shared_ptr__ int* sm_seq_lens_this_time,
@@ -127,5 +126,4 @@ __global__ void speculate_get_logits(float* draft_logits,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void speculate_get_output_padding_offset(
int* output_padding_offset,
@@ -59,5 +58,4 @@ __global__ void speculate_get_output_padding_offset(
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -21,8 +21,7 @@
#include "xpu/kernel/cluster_simd.h"
#include "xpu/kernel/xtdk.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void speculate_remove_padding(T* output_data,
@@ -118,5 +117,4 @@ _XPU_DEF_SPECULATE_KERNELS_(float16);
_XPU_DEF_SPECULATE_KERNELS_(bfloat16);
_XPU_DEF_SPECULATE_KERNELS_(int64_t);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void speculate_get_seq_lens_output(int* seq_lens_output,
const int* seq_lens_this_time,
@@ -54,5 +53,4 @@ __global__ void speculate_get_seq_lens_output(int* seq_lens_output,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void speculate_min_length_logits_process(
@@ -87,5 +86,4 @@ _XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float);
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float16);
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int* ptr) {
int ret;
@@ -160,5 +159,4 @@ __global__ void speculate_recover_block(int* recover_block_list, // [bsz]
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
int res;
@@ -175,5 +174,4 @@ __global__ void speculate_schedule_cache(const int64_t *draft_tokens,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -4,8 +4,7 @@
#include "xpu/kernel/xtdk_math.h"
#include "xpu/kernel/xtdk_simd.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void speculate_set_stop_value_multi_seqs(bool *stop_flags,
int64_t *accept_tokens,
@@ -99,5 +98,5 @@ __global__ void speculate_set_stop_value_multi_seqs(bool *stop_flags,
}
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
const int64_t *accept_tokens,
@@ -84,5 +83,4 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
int res;
@@ -198,5 +197,4 @@ template __global__ void speculate_update<512>(int *seq_lens_encoder,
const int max_bsz,
const int max_draft_tokens);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ void atomic_add(_shared_ptr_ int *ptr, int v) {
bool fail = true;
@@ -264,5 +263,4 @@ __global__ void speculate_update_repeat_times(const int64_t *pre_ids,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
int res;
@@ -198,5 +197,4 @@ template __global__ void speculate_update_v3<512>(int *seq_lens_encoder,
const int max_bsz,
const int max_draft_tokens);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__device__ void do_cast(const int *xlm, float *ylm, int64_t len) {
for (int64_t i = 0; i < len; i += 32) {
@@ -279,5 +278,4 @@ _XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float);
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float16);
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -6,8 +6,8 @@
// #include "xpu/internal/aten/xrand_philox4x32_10.h"
// #include "xpu/internal/aten/xrand_uniform.h"
// #include "xpu/internal/aten/xrand_global.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
int res;
v1 = vvadd_int32x16(v0, v1);
@@ -380,5 +380,5 @@ SPECULATE_VERIFY_INSTANTIATE(true, true)
SPECULATE_VERIFY_INSTANTIATE(true, false)
SPECULATE_VERIFY_INSTANTIATE(false, true)
SPECULATE_VERIFY_INSTANTIATE(false, false)
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T, int MaxLength, int TopPBeamTopK>
__device__ void top_p_candidates_big_n(
@@ -345,5 +344,4 @@ _XPU_DEF_TOP_P_CANDIDATES_KERNEL(float, 2, 5);
_XPU_DEF_TOP_P_CANDIDATES_KERNEL(float, 2, 8);
_XPU_DEF_TOP_P_CANDIDATES_KERNEL(float, 2, 10);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void update_attn_mask_offsets(int* attn_mask_offsets,
const int* seq_lens_this_time,
@@ -73,5 +72,4 @@ __global__ void update_attn_mask_offsets(int* attn_mask_offsets,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -5,8 +5,7 @@
// TODO()
// #include "xpu/quant_xpu.h"
// #include "xpu_plugin.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
#define MAX_SM_SIZE 32768
// One core has 32KB LMgroup LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
@@ -1065,5 +1064,4 @@ _XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(float, float, int8_t, 64);
_XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(float16, float, int8_t, 128);
_XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(bfloat16, float, int8_t, 128);
_XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(float, float, int8_t, 128);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int* ptr) {
int ret;
@@ -150,5 +149,4 @@ __global__ void recover_block(int* recover_block_list, // [bsz]
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void recover_decode_task(bool *stop_flags,
int *seq_lens_this_time,
@@ -39,5 +38,4 @@ __global__ void recover_decode_task(bool *stop_flags,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void remove_padding(int64_t *x_remove_padding,
const int64_t *input_data,
@@ -36,5 +35,4 @@ __global__ void remove_padding(int64_t *x_remove_padding,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/xtdk_math.h"
#include "xpu/kernel/xtdk_simd.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
static inline __device__ bool is_in_end(const T id,
@@ -97,5 +96,4 @@ __global__ void set_stop_value_multi_ends(bool* stop_flags,
const bool prefill_one_step_stop);
_XPU_DEF__SET_VALUE_BY_FLAGS_BOTH_(int64_t);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -1,6 +1,5 @@
#include "xpu/kernel/cluster.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void set_value_by_flags_and_idx(const bool* stop_flags,
int64_t* pre_ids_all,
@@ -46,5 +45,4 @@ __global__ void set_value_by_flags_and_idx(const bool* stop_flags,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
static __device__ inline void text_image_gather(
@@ -215,5 +214,4 @@ __global__ void text_image_gather_scatter(T* input,
_XPU_DEF_TEXT_IMAGE_GATHER_SCATTER(bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ void do_calc(const _shared_ptr_ int* lm_x,
int* lm_y1,
@@ -110,5 +109,4 @@ __global__ void text_image_index_out_kernel(const int* token_type_ids, // x
buffer_ptr_y2.gm_store(image_index + i, read_size);
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void update_inputs(bool *not_need_stop,
int *seq_lens_this_time,
@@ -71,5 +70,4 @@ __global__ void update_inputs(bool *not_need_stop,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -7,8 +7,7 @@
#include "xpu/kernel/xtdk.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void update_inputs_v1(bool* not_need_stop,
int* seq_lens_this_time,
@@ -148,5 +147,4 @@ __global__ void update_inputs_v1(bool* not_need_stop,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ void atomic_add(_shared_ptr_ int *ptr, int v) {
bool fail = true;
@@ -71,5 +70,4 @@ __global__ void update_repeat_times(const int64_t *pre_ids,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__device__ void do_cast(const int *xlm, float *ylm, int64_t len) {
for (int64_t i = 0; i < len; i += 32) {
@@ -222,5 +221,4 @@ __global__ void update_value_by_repeat_times_simd(
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float);
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -17,8 +17,7 @@
#include "xpu/refactor/impl_public/wrapper_check.h"
#include "xpu/xdnn.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename TX, typename TY>
__attribute__((global)) void eb_adjust_batch(TX *src,
TY *dst,
@@ -29,12 +28,9 @@ __attribute__((global)) void eb_adjust_batch(TX *src,
int en_batch,
int de_batch,
int64_t copy_size);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
template <typename TX, typename TY>
@@ -93,10 +89,10 @@ static int xpu3_wrapper(api::Context *ctx,
int en_batch,
int de_batch,
int64_t hidden_dim) {
using XPU_INDEX_TYPE_TX = typename XPUIndexType<TX>::type;
using XPU_INDEX_TYPE_TY = typename XPUIndexType<TY>::type;
using XPU_INDEX_TYPE_TX = typename api::XPUIndexType<TX>::type;
using XPU_INDEX_TYPE_TY = typename api::XPUIndexType<TY>::type;
auto eb_adjust_batch_kernel =
xpu3::plugin::eb_adjust_batch<XPU_INDEX_TYPE_TX, XPU_INDEX_TYPE_TY>;
fd_xpu3::eb_adjust_batch<XPU_INDEX_TYPE_TX, XPU_INDEX_TYPE_TY>;
// NOTE: Don't change 16 to 64, because kernel use gsm
int32_t ret_xre =
eb_adjust_batch_kernel<<<ctx->ncluster(), 16, ctx->xpu_stream>>>(
@@ -226,6 +222,4 @@ INSTANTIATION_EB_ADJUST_BATCH(float, bfloat16);
INSTANTIATION_EB_ADJUST_BATCH(int32_t, int32_t);
INSTANTIATION_EB_ADJUST_BATCH(int64_t, int64_t);
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -17,8 +17,7 @@
#include "xpu/refactor/impl_public/wrapper_check.h"
#include "xpu/xdnn.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename TX, typename TY>
__attribute__((global)) void eb_gather_next_token(TX *src,
TY *dst,
@@ -28,12 +27,9 @@ __attribute__((global)) void eb_gather_next_token(TX *src,
int en_batch,
int de_batch,
int64_t copy_size);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
template <typename TX, typename TY>
static int cpu_wrapper(api::Context *ctx,
@@ -74,7 +70,7 @@ static int xpu3_wrapper(api::Context *ctx,
int en_batch,
int de_batch,
int64_t hidden_dim) {
auto eb_gather_next_token_kernel = xpu3::plugin::eb_gather_next_token<TX, TY>;
auto eb_gather_next_token_kernel = fd_xpu3::eb_gather_next_token<TX, TY>;
// NOTE: Don't change 16 to 64, because kernel use gsm
int32_t ret_xre =
eb_gather_next_token_kernel<<<ctx->ncluster(), 16, ctx->xpu_stream>>>(
@@ -187,6 +183,4 @@ INSTANTIATION_EB_GATHER_NEXT_TOKEN(float16, bfloat16);
INSTANTIATION_EB_GATHER_NEXT_TOKEN(bfloat16, float);
INSTANTIATION_EB_GATHER_NEXT_TOKEN(float, bfloat16);
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -17,8 +17,7 @@
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void free_and_dispatch_block(
bool *stop_flags,
@@ -42,15 +41,12 @@ __attribute__((global)) void free_and_dispatch_block(
const int block_num_per_seq,
const int max_decoder_block_num);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
static int cpu_wrapper(Context *ctx,
static int cpu_wrapper(api::Context *ctx,
bool *stop_flags,
int *seq_lens_this_time,
int *seq_lens_decoder,
@@ -171,7 +167,7 @@ static int cpu_wrapper(Context *ctx,
return api::SUCCESS;
}
static int xpu3_wrapper(Context *ctx,
static int xpu3_wrapper(api::Context *ctx,
bool *stop_flags,
int *seq_lens_this_time,
int *seq_lens_decoder,
@@ -192,8 +188,8 @@ static int xpu3_wrapper(Context *ctx,
const int block_size,
const int block_num_per_seq,
const int max_decoder_block_num) {
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
auto free_and_dispatch_block_kernel = xpu3::plugin::free_and_dispatch_block;
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
auto free_and_dispatch_block_kernel = fd_xpu3::free_and_dispatch_block;
int32_t ret_xre =
free_and_dispatch_block_kernel<<<ctx->ncluster(), 64, ctx->xpu_stream>>>(
stop_flags,
@@ -220,7 +216,7 @@ static int xpu3_wrapper(Context *ctx,
return api::SUCCESS;
}
int free_and_dispatch_block(Context *ctx,
int free_and_dispatch_block(api::Context *ctx,
bool *stop_flags,
int *seq_lens_this_time,
int *seq_lens_decoder,
@@ -285,7 +281,7 @@ int free_and_dispatch_block(Context *ctx,
block_num_per_seq,
max_decoder_block_num);
}
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx,
stop_flags,
seq_lens_this_time,
@@ -312,6 +308,4 @@ int free_and_dispatch_block(Context *ctx,
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -17,8 +17,7 @@
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void get_padding_offset(int *padding_offset,
int *cum_offsets_out,
@@ -35,12 +34,9 @@ __attribute__((global)) void remove_padding(int64_t *x_remove_padding,
const int sequence_length,
const int bs);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
static int get_padding_offset_cpu(int *padding_offset,
@@ -80,7 +76,7 @@ static int remove_padding_cpu(int64_t *x_remove_padding,
return api::SUCCESS;
}
static int cpu_wrapper(Context *ctx,
static int cpu_wrapper(api::Context *ctx,
int *padding_offset,
int *cum_offsets_out,
int *cu_seqlens_q,
@@ -104,7 +100,7 @@ static int cpu_wrapper(Context *ctx,
return api::SUCCESS;
}
static int xpu3_wrapper(Context *ctx,
static int xpu3_wrapper(api::Context *ctx,
int *padding_offset,
int *cum_offsets_out,
int *cu_seqlens_q,
@@ -115,9 +111,9 @@ static int xpu3_wrapper(Context *ctx,
const int *seq_lens,
const int max_seq_len,
const int bs) {
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
auto get_padding_offset = xpu3::plugin::get_padding_offset;
auto remove_padding = xpu3::plugin::remove_padding;
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
auto get_padding_offset = fd_xpu3::get_padding_offset;
auto remove_padding = fd_xpu3::remove_padding;
int32_t ret_xre =
get_padding_offset<<<ctx->ncluster(), 64, ctx->xpu_stream>>>(
padding_offset,
@@ -140,7 +136,7 @@ static int xpu3_wrapper(Context *ctx,
return api::SUCCESS;
}
int get_padding_offset(Context *ctx,
int get_padding_offset(api::Context *ctx,
int *padding_offset,
int *cum_offsets_out,
int *cu_seqlens_q,
@@ -171,7 +167,7 @@ int get_padding_offset(Context *ctx,
max_seq_len,
bs);
}
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx,
padding_offset,
cum_offsets_out,
@@ -188,6 +184,4 @@ int get_padding_offset(Context *ctx,
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -17,8 +17,7 @@
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void limit_thinking_content_length_kernel_v1(
int64_t* next_tokens,
@@ -30,15 +29,12 @@ __attribute__((global)) void limit_thinking_content_length_kernel_v1(
const int64_t think_end_id,
const int bs,
const int eos_token_id_len);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
static int cpu_wrapper(Context* ctx,
static int cpu_wrapper(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -80,7 +76,7 @@ static int cpu_wrapper(Context* ctx,
}
return api::SUCCESS;
}
static int xpu3_wrapper(Context* ctx,
static int xpu3_wrapper(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -90,9 +86,9 @@ static int xpu3_wrapper(Context* ctx,
const int64_t think_end_id,
const int bs,
const int eos_token_id_len) {
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
auto limit_thinking_content_length_kernel_v1 =
xpu3::plugin::limit_thinking_content_length_kernel_v1;
fd_xpu3::limit_thinking_content_length_kernel_v1;
int32_t ret_xre =
limit_thinking_content_length_kernel_v1<<<1, 64, ctx->xpu_stream>>>(
reinterpret_cast<XPU_INT64*>(next_tokens),
@@ -108,7 +104,7 @@ static int xpu3_wrapper(Context* ctx,
return api::SUCCESS;
}
int limit_thinking_content_length_kernel_v1(Context* ctx,
int limit_thinking_content_length_kernel_v1(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -141,7 +137,7 @@ int limit_thinking_content_length_kernel_v1(Context* ctx,
bs,
eos_token_id_len);
}
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx,
next_tokens,
max_think_lens,
@@ -157,6 +153,4 @@ int limit_thinking_content_length_kernel_v1(Context* ctx,
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -17,8 +17,7 @@
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void limit_thinking_content_length_kernel_v2(
int64_t* next_tokens,
@@ -30,15 +29,12 @@ __attribute__((global)) void limit_thinking_content_length_kernel_v2(
const int64_t line_break_id,
const int bs);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
static int cpu_wrapper(Context* ctx,
static int cpu_wrapper(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -86,7 +82,7 @@ static int cpu_wrapper(Context* ctx,
}
return api::SUCCESS;
}
static int xpu3_wrapper(Context* ctx,
static int xpu3_wrapper(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -95,9 +91,9 @@ static int xpu3_wrapper(Context* ctx,
const int64_t think_end_id,
const int64_t line_break_id,
const int bs) {
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
auto limit_thinking_content_length_kernel_v2 =
xpu3::plugin::limit_thinking_content_length_kernel_v2;
fd_xpu3::limit_thinking_content_length_kernel_v2;
int32_t ret_xre =
limit_thinking_content_length_kernel_v2<<<1, 64, ctx->xpu_stream>>>(
reinterpret_cast<XPU_INT64*>(next_tokens),
@@ -112,7 +108,7 @@ static int xpu3_wrapper(Context* ctx,
return api::SUCCESS;
}
int limit_thinking_content_length_kernel_v2(Context* ctx,
int limit_thinking_content_length_kernel_v2(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -142,7 +138,7 @@ int limit_thinking_content_length_kernel_v2(Context* ctx,
line_break_id,
bs);
}
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx,
next_tokens,
max_think_lens,
@@ -157,6 +153,4 @@ int limit_thinking_content_length_kernel_v2(Context* ctx,
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -15,8 +15,7 @@
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void ComputeOrderKernel(
const int* seq_lens_this_time,
const int* seq_lens_encoder,
@@ -28,15 +27,12 @@ __attribute__((global)) void ComputeOrderKernel(
const int bsz,
const int actual_draft_token_num,
const int input_token_num);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
static int cpu_wrapper(Context* ctx,
static int cpu_wrapper(api::Context* ctx,
const int* seq_lens_this_time,
const int* seq_lens_encoder,
const int* base_model_seq_lens_this_time,
@@ -97,7 +93,7 @@ static int cpu_wrapper(Context* ctx,
return api::SUCCESS;
}
static int xpu3_wrapper(Context* ctx,
static int xpu3_wrapper(api::Context* ctx,
const int* seq_lens_this_time,
const int* seq_lens_encoder,
const int* base_model_seq_lens_this_time,
@@ -108,7 +104,7 @@ static int xpu3_wrapper(Context* ctx,
const int bsz,
const int actual_draft_token_num,
const int input_token_num) {
int32_t ret_xre = xpu3::plugin::ComputeOrderKernel<<<1, 1, ctx->xpu_stream>>>(
int32_t ret_xre = fd_xpu3::ComputeOrderKernel<<<1, 1, ctx->xpu_stream>>>(
seq_lens_this_time,
seq_lens_encoder,
base_model_seq_lens_this_time,
@@ -123,7 +119,7 @@ static int xpu3_wrapper(Context* ctx,
return api::SUCCESS;
}
int compute_order(Context* ctx,
int compute_order(api::Context* ctx,
const int* seq_lens_this_time,
const int* seq_lens_encoder,
const int* base_model_seq_lens_this_time,
@@ -187,6 +183,4 @@ int compute_order(Context* ctx,
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy

Some files were not shown because too many files have changed in this diff Show More