[XPU] refactor: XPU plugin namespace migration (#6799)

* [XPU] refactor: XPU plugin namespace migration

- Migrate wrapper layer namespace from baidu::xpu::api::plugin to fastdeploy::plugin
- Migrate kernel layer namespace from xpu3::plugin to fd_xpu3
- Add api:: prefix for types (Context, SUCCESS, XPUIndexType, ctx_guard)
- Remove XPU2 support, keep only XPU3
- Update ops/ directory to use new namespace

Total: 137 files changed
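
A compilable toy sketch of the wrapper-layer change described above (all names
below are illustrative stand-ins, not the real XPU SDK):

// Sketch: the wrapper layer moves from the vendor namespace into
// fastdeploy::plugin, keeping vendor types reachable via an `api` alias.
#include <cstdio>

namespace baidu { namespace xpu { namespace api {
struct Context {};          // stand-in for the vendor api::Context
constexpr int SUCCESS = 0;  // stand-in for api::SUCCESS
}}}  // namespace baidu::xpu::api

namespace fastdeploy {
namespace plugin {
namespace api = baidu::xpu::api;  // the alias added in the migrated header

// A wrapper entry point now lives here but still takes vendor types
// through the alias (this mirrors the "api:: prefix" bullet above).
inline int eb_adjust_batch_demo(api::Context* ctx) {
  return ctx ? api::SUCCESS : -1;
}
}  // namespace plugin
}  // namespace fastdeploy

int main() {
  baidu::xpu::api::Context ctx;
  // Call sites change from baidu::xpu::api::plugin::X to fastdeploy::plugin::X.
  int r = fastdeploy::plugin::eb_adjust_batch_demo(&ctx);
  std::printf("r = %d\n", r);
  return r;
}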

* [XPU] fix: add return value check and correct error messages

- Add PADDLE_ENFORCE_XDNN_SUCCESS check for speculate_get_logits and update_attn_mask_offsets
- Fix empty error message in draft_model_postprocess
- Correct function name in speculate_schedule_cache error message
- Update error messages from 'xpu::plugin::' to 'fastdeploy::plugin::'
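
A minimal sketch of the check pattern this commit enforces. The DEMO_ macro
below only mimics the shape of Paddle's PADDLE_ENFORCE_XDNN_SUCCESS so the
snippet compiles on its own, and the stub kernel is hypothetical:

#include <cstdio>
#include <cstdlib>

// Stand-in for PADDLE_ENFORCE_XDNN_SUCCESS(r, "kernel_name"): fail loudly,
// naming the kernel, whenever a plugin call returns nonzero.
#define DEMO_ENFORCE_XDNN_SUCCESS(r, kernel)                  \
  do {                                                        \
    if ((r) != 0) {                                           \
      std::fprintf(stderr, "XDNN [%s] failed, ret = %d\n",    \
                   (kernel), (r));                            \
      std::abort();                                           \
    }                                                         \
  } while (0)

int speculate_get_logits_stub() { return 0; }  // hypothetical stub

int main() {
  // Before this commit, the return value of speculate_get_logits was dropped.
  // After: capture it and check, so a failure names the offending kernel.
  int r = speculate_get_logits_stub();
  DEMO_ENFORCE_XDNN_SUCCESS(r, "speculate_get_logits");
  std::puts("ok");
  return 0;
}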
Author: mayang002
Date: 2026-03-13 10:21:51 +08:00
Committed by: GitHub
Parent: d73fd876ba
Commit: 1f9f889e37

138 changed files with 1086 additions and 1467 deletions
@@ -72,7 +72,7 @@ std::vector<paddle::Tensor> AdjustBatchKernel(
auto out = paddle::empty({token_num, dim}, x.type(), x.place());
if (token_num > 0) {
int r = baidu::xpu::api::plugin::eb_adjust_batch<XPUType, XPUType>(
int r = fastdeploy::plugin::eb_adjust_batch<XPUType, XPUType>(
ctx,
reinterpret_cast<const XPUType *>(x.data<data_t>()),
reinterpret_cast<XPUType *>(out.data<data_t>()),
@@ -90,7 +90,7 @@ std::vector<paddle::Tensor> GatherNextToken(
}
if (output_padding_offset) {
int r = baidu::xpu::api::plugin::eb_mtp_gather_next_token<XPUType, XPUType>(
int r = fastdeploy::plugin::eb_mtp_gather_next_token<XPUType, XPUType>(
ctx,
reinterpret_cast<const XPUType*>(x.data<data_t>()),
reinterpret_cast<XPUType*>(out.data<data_t>()),
@@ -99,9 +99,9 @@ std::vector<paddle::Tensor> GatherNextToken(
encoder_batch_map_vp,
decoder_batch_map_vp,
dim);
PD_CHECK(r == 0, "xpu::plugin::gather_next_token failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::gather_next_token failed.");
} else {
int r = baidu::xpu::api::plugin::eb_gather_next_token<XPUType, XPUType>(
int r = fastdeploy::plugin::eb_gather_next_token<XPUType, XPUType>(
ctx,
reinterpret_cast<const XPUType*>(x.data<data_t>()),
reinterpret_cast<XPUType*>(out.data<data_t>()),
@@ -109,7 +109,7 @@ std::vector<paddle::Tensor> GatherNextToken(
encoder_batch_map_vp,
decoder_batch_map_vp,
dim);
PD_CHECK(r == 0, "xpu::plugin::gather_next_token failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::gather_next_token failed.");
}
return {out};
}
@@ -40,19 +40,19 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
auto cu_seqlens_k =
paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
if (token_num_data > 0) {
int r = baidu::xpu::api::plugin::get_padding_offset(
xpu_ctx->x_context(),
batch_id_per_token.data<int>(),
cum_offsets_out.data<int>(),
cu_seqlens_q.data<int>(),
cu_seqlens_k.data<int>(),
x_remove_padding.data<int64_t>(),
input_ids.data<int64_t>(),
cum_offsets.data<int>(),
seq_len.data<int>(),
seq_length,
bsz);
PD_CHECK(r == 0, "baidu::xpu::api::plugin::get_padding_offset failed.");
int r =
fastdeploy::plugin::get_padding_offset(xpu_ctx->x_context(),
batch_id_per_token.data<int>(),
cum_offsets_out.data<int>(),
cu_seqlens_q.data<int>(),
cu_seqlens_k.data<int>(),
x_remove_padding.data<int64_t>(),
input_ids.data<int64_t>(),
cum_offsets.data<int>(),
seq_len.data<int>(),
seq_length,
bsz);
PD_CHECK(r == 0, "fastdeploy::plugin::get_padding_offset failed.");
}
return {x_remove_padding,
@@ -44,7 +44,7 @@ void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
case paddle::DataType::FLOAT16: {
using XPUType = typename XPUTypeTrait<float16>::Type;
typedef paddle::float16 data_t;
int r = baidu::xpu::api::plugin::token_penalty_multi_scores(
int r = fastdeploy::plugin::token_penalty_multi_scores(
xpu_ctx->x_context(),
pre_ids.data<int64_t>(),
reinterpret_cast<XPUType *>(
@@ -62,10 +62,11 @@ void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
length_id,
end_length,
length_bad_words);
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::token_penalty_multi_scores failed.");
} break;
case paddle::DataType::FLOAT32: {
int r = baidu::xpu::api::plugin::token_penalty_multi_scores(
int r = fastdeploy::plugin::token_penalty_multi_scores(
xpu_ctx->x_context(),
pre_ids.data<int64_t>(),
const_cast<float *>(logits.data<float>()),
@@ -82,7 +83,8 @@ void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
length_id,
end_length,
length_bad_words);
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::token_penalty_multi_scores failed.");
} break;
default:
PD_THROW(
@@ -34,7 +34,7 @@ void LimitThinkingContentLengthV1(const paddle::Tensor& next_tokens,
const int batch_size = next_tokens.shape()[0];
const int eos_token_id_len = eos_token_ids.shape()[0];
int r = baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v1(
int r = fastdeploy::plugin::limit_thinking_content_length_kernel_v1(
xpu_ctx->x_context(),
const_cast<int64_t*>(next_tokens.data<int64_t>()),
max_think_lens.data<int>(),
@@ -46,7 +46,7 @@ void LimitThinkingContentLengthV1(const paddle::Tensor& next_tokens,
batch_size,
eos_token_id_len);
PD_CHECK(r == 0,
"baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v1 "
"fastdeploy::plugin::limit_thinking_content_length_kernel_v1 "
"failed.");
}
@@ -33,7 +33,7 @@ void LimitThinkingContentLengthV2(const paddle::Tensor& next_tokens,
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
const int batch_size = next_tokens.shape()[0];
int r = baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v2(
int r = fastdeploy::plugin::limit_thinking_content_length_kernel_v2(
xpu_ctx->x_context(),
const_cast<int64_t*>(next_tokens.data<int64_t>()),
max_think_lens.data<int>(),
@@ -44,7 +44,7 @@ void LimitThinkingContentLengthV2(const paddle::Tensor& next_tokens,
line_break_id,
batch_size);
PD_CHECK(r == 0,
"baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v2 "
"fastdeploy::plugin::limit_thinking_content_length_kernel_v2 "
"failed.");
}
@@ -30,7 +30,7 @@ void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens,
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
int real_bsz = base_model_draft_tokens.shape()[0];
int base_model_draft_token_len = base_model_draft_tokens.shape()[1];
int r = baidu::xpu::api::plugin::draft_model_postprocess(
int r = fastdeploy::plugin::draft_model_postprocess(
xpu_ctx->x_context(),
const_cast<int64_t*>(base_model_draft_tokens.data<int64_t>()),
const_cast<int*>(base_model_seq_lens_this_time.data<int>()),
@@ -38,7 +38,7 @@ void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens,
const_cast<bool*>(base_model_stop_flags.data<bool>()),
real_bsz,
base_model_draft_token_len);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "");
PADDLE_ENFORCE_XDNN_SUCCESS(r, "draft_model_postprocess");
}
PD_BUILD_STATIC_OP(draft_model_postprocess)
@@ -64,7 +64,7 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens,
auto not_need_stop_gpu =
not_need_stop.copy_to(seq_lens_this_time.place(), false);
int r = baidu::xpu::api::plugin::draft_model_preprocess(
int r = fastdeploy::plugin::draft_model_preprocess(
ctx,
const_cast<int64_t*>(draft_tokens.data<int64_t>()),
const_cast<int64_t*>(input_ids.data<int64_t>()),
@@ -97,7 +97,7 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens,
splitwise_prefill,
kvcache_scheduler_v1);
PD_CHECK(r == 0, "xpu::plugin::draft_model_preprocess failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::draft_model_preprocess failed.");
auto not_need_stop_cpu =
not_need_stop_gpu.copy_to(not_need_stop.place(), false);
bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
@@ -63,7 +63,7 @@ void DraftModelUpdate(const paddle::Tensor& inter_next_tokens,
}
}
int r = baidu::xpu::api::plugin::draft_model_update(
int r = fastdeploy::plugin::draft_model_update(
ctx,
inter_next_tokens.data<int64_t>(),
const_cast<int64_t*>(draft_tokens.data<int64_t>()),
@@ -47,18 +47,19 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
auto output_token_num = paddle::empty(
{1}, seq_lens_this_time.dtype(), seq_lens_this_time.place());
int r = api::plugin::compute_order(ctx,
seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
base_model_seq_lens_this_time.data<int>(),
base_model_seq_lens_encoder.data<int>(),
accept_nums.data<int>(),
position_map.data<int>(),
output_token_num.data<int>(),
bsz,
actual_draft_token_num,
input_token_num);
PD_CHECK(r == 0, "xpu::plugin::compute_order failed.");
int r = fastdeploy::plugin::compute_order(
ctx,
seq_lens_this_time.data<int>(),
seq_lens_encoder.data<int>(),
base_model_seq_lens_this_time.data<int>(),
base_model_seq_lens_encoder.data<int>(),
accept_nums.data<int>(),
position_map.data<int>(),
output_token_num.data<int>(),
bsz,
actual_draft_token_num,
input_token_num);
PD_CHECK(r == 0, "fastdeploy::plugin::compute_order failed.");
int output_token_num_cpu =
output_token_num.copy_to(paddle::CPUPlace(), false).data<int>()[0];
@@ -72,7 +73,7 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
case paddle::DataType::BFLOAT16:
using XPUTypeBF16 = typename XPUTypeTrait<bfloat16>::Type;
typedef paddle::bfloat16 bf16_data_t;
r = api::plugin::rebuild_hidden_states(
r = fastdeploy::plugin::rebuild_hidden_states(
ctx,
reinterpret_cast<const XPUTypeBF16*>(input.data<bf16_data_t>()),
position_map.data<int>(),
@@ -80,12 +81,12 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
dim_embed,
elem_cnt,
output_token_num_cpu);
PD_CHECK(r == 0, "xpu::plugin::rebuild_hidden_states failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::rebuild_hidden_states failed.");
return {out};
case paddle::DataType::FLOAT16:
using XPUTypeFP16 = typename XPUTypeTrait<float16>::Type;
typedef paddle::float16 fp16_data_t;
r = api::plugin::rebuild_hidden_states(
r = fastdeploy::plugin::rebuild_hidden_states(
ctx,
reinterpret_cast<const XPUTypeFP16*>(input.data<fp16_data_t>()),
position_map.data<int>(),
@@ -93,10 +94,10 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
dim_embed,
elem_cnt,
output_token_num_cpu);
PD_CHECK(r == 0, "xpu::plugin::rebuild_hidden_states failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::rebuild_hidden_states failed.");
return {out};
case paddle::DataType::FLOAT32:
r = api::plugin::rebuild_hidden_states(
r = fastdeploy::plugin::rebuild_hidden_states(
ctx,
reinterpret_cast<const float*>(input.data<float>()),
position_map.data<int>(),
@@ -104,7 +105,7 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
dim_embed,
elem_cnt,
output_token_num_cpu);
PD_CHECK(r == 0, "xpu::plugin::rebuild_hidden_states failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::rebuild_hidden_states failed.");
return {out};
default:
PD_THROW("Unsupported data type.");
@@ -43,7 +43,7 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
auto output_token_num = paddle::empty(
{1}, seq_lens_this_time.dtype(), seq_lens_this_time.place());
int r = api::plugin::compute_self_order(
int r = fastdeploy::plugin::compute_self_order(
ctx,
reinterpret_cast<const int*>(last_seq_lens_this_time.data<int>()),
reinterpret_cast<const int*>(seq_lens_this_time.data<int>()),
@@ -51,7 +51,7 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
reinterpret_cast<int*>(src_map.data<int>()),
reinterpret_cast<int*>(output_token_num.data<int>()),
bsz);
PD_CHECK(r == 0, "xpu::plugin::compute_self_order failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::compute_self_order failed.");
int output_token_num_cpu =
output_token_num.copy_to(paddle::CPUPlace(), false).data<int>()[0];
@@ -67,7 +67,7 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
case paddle::DataType::BFLOAT16:
using XPUTypeBF16 = typename XPUTypeTrait<bfloat16>::Type;
typedef paddle::bfloat16 bf16_data_t;
r = api::plugin::rebuild_self_hidden_states(
r = fastdeploy::plugin::rebuild_self_hidden_states(
ctx,
reinterpret_cast<const XPUTypeBF16*>(input.data<bf16_data_t>()),
src_map.data<int>(),
@@ -75,12 +75,13 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
input_token_num,
dim_embed,
elem_cnt);
PD_CHECK(r == 0, "xpu::plugin::rebuild_self_hidden_states failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::rebuild_self_hidden_states failed.");
return {out};
case paddle::DataType::FLOAT16:
using XPUTypeFP16 = typename XPUTypeTrait<float16>::Type;
typedef paddle::float16 fp16_data_t;
r = api::plugin::rebuild_self_hidden_states(
r = fastdeploy::plugin::rebuild_self_hidden_states(
ctx,
reinterpret_cast<const XPUTypeFP16*>(input.data<fp16_data_t>()),
src_map.data<int>(),
@@ -88,10 +89,11 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
input_token_num,
dim_embed,
elem_cnt);
PD_CHECK(r == 0, "xpu::plugin::rebuild_self_hidden_states failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::rebuild_self_hidden_states failed.");
return {out};
case paddle::DataType::FLOAT32:
r = api::plugin::rebuild_self_hidden_states(
r = fastdeploy::plugin::rebuild_self_hidden_states(
ctx,
reinterpret_cast<const float*>(input.data<float>()),
src_map.data<int>(),
@@ -99,7 +101,8 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
input_token_num,
dim_embed,
elem_cnt);
PD_CHECK(r == 0, "xpu::plugin::rebuild_self_hidden_states failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::rebuild_self_hidden_states failed.");
return {out};
default:
PD_THROW("Unsupported data type.");
@@ -46,7 +46,7 @@ void MTPStepPaddle(
const int bsz = seq_lens_this_time.shape()[0];
const int block_num_per_seq = block_tables.shape()[1];
int r = baidu::xpu::api::plugin::mtp_free_and_dispatch_block(
int r = fastdeploy::plugin::mtp_free_and_dispatch_block(
ctx,
const_cast<bool *>(base_model_stop_flags.data<bool>()),
const_cast<bool *>(stop_flags.data<bool>()),
@@ -27,7 +27,7 @@ void SpeculateClearAcceptNums(const paddle::Tensor& accept_num,
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
const int max_bsz = seq_lens_decoder.shape()[0];
int r = baidu::xpu::api::plugin::speculate_clear_accept_nums(
int r = fastdeploy::plugin::speculate_clear_accept_nums(
xpu_ctx->x_context(),
const_cast<int*>(accept_num.data<int>()),
seq_lens_decoder.data<int>(),
@@ -43,7 +43,7 @@ void SpeculateGetLogits(const paddle::Tensor& draft_logits,
const int vocab_size = logits.shape()[1];
const int real_bsz = seq_lens_this_time.shape()[0];
baidu::xpu::api::plugin::speculate_get_logits(
int r = fastdeploy::plugin::speculate_get_logits(
ctx,
const_cast<float*>(draft_logits.data<float>()),
const_cast<int*>(next_token_num.data<int>()),
@@ -56,6 +56,7 @@ void SpeculateGetLogits(const paddle::Tensor& draft_logits,
seq_lens_encoder.data<int>(),
real_bsz,
vocab_size);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "speculate_get_logits");
if (draft_logits.is_cpu()) {
delete ctx;
}
@@ -45,7 +45,7 @@ std::vector<paddle::Tensor> SpeculateGetOutputPaddingOffset(
auto output_cum_offsets =
output_cum_offsets_tmp.copy_to(output_cum_offsets_tmp.place(), false);
if (cpu_out_token_num.data<int64_t>()[0] > 0) {
int r = baidu::xpu::api::plugin::speculate_get_output_padding_offset(
int r = fastdeploy::plugin::speculate_get_output_padding_offset(
ctx,
output_padding_offset.mutable_data<int>(),
output_cum_offsets.mutable_data<int>(),
@@ -58,7 +58,7 @@ std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
PD_CHECK(seq_len.is_contiguous(), "Seq lens tensor must be contiguous");
if (token_num_data > 0) {
int r = baidu::xpu::api::plugin::speculate_get_padding_offset(
int r = fastdeploy::plugin::speculate_get_padding_offset(
xpu_ctx->x_context(),
batch_id_per_token.data<int>(),
cum_offsets_out.data<int>(),
@@ -70,7 +70,7 @@ std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
bsz);
PD_CHECK(r == 0, "XPU speculate_get_padding_offset failed");
r = baidu::xpu::api::plugin::speculate_remove_padding<int64_t>(
r = fastdeploy::plugin::speculate_remove_padding<int64_t>(
xpu_ctx->x_context(),
x_remove_padding.data<int64_t>(),
input_ids.data<int64_t>(),
@@ -38,7 +38,7 @@ std::vector<paddle::Tensor> SpeculateGetSeqLensOutput(
auto seq_lens_output = paddle::full(
{bsz}, 0, paddle::DataType::INT32, seq_lens_this_time.place());
int r = baidu::xpu::api::plugin::speculate_get_seq_lens_output(
int r = fastdeploy::plugin::speculate_get_seq_lens_output(
ctx,
seq_lens_output.data<int>(),
seq_lens_this_time.data<int>(),
@@ -46,7 +46,7 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
case paddle::DataType::BFLOAT16:
using XPUTypeBF16 = typename XPUTypeTrait<bfloat16>::Type;
typedef paddle::bfloat16 bf16_data_t;
r = api::plugin::speculate_rebuild_append_padding<XPUTypeBF16>(
r = fastdeploy::plugin::speculate_rebuild_append_padding<XPUTypeBF16>(
ctx,
const_cast<XPUTypeBF16*>(reinterpret_cast<const XPUTypeBF16*>(
full_hidden_states.data<bf16_data_t>())),
@@ -58,12 +58,13 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
dim_embed,
elem_nums,
reinterpret_cast<XPUTypeBF16*>(out.data<bf16_data_t>()));
PD_CHECK(r == 0, "xpu::plugin::speculate_rebuild_append_padding failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::speculate_rebuild_append_padding failed.");
return {out};
case paddle::DataType::FLOAT16:
using XPUTypeFP16 = typename XPUTypeTrait<float16>::Type;
typedef paddle::float16 fp16_data_t;
r = api::plugin::speculate_rebuild_append_padding<XPUTypeFP16>(
r = fastdeploy::plugin::speculate_rebuild_append_padding<XPUTypeFP16>(
ctx,
const_cast<XPUTypeFP16*>(reinterpret_cast<const XPUTypeFP16*>(
full_hidden_states.data<fp16_data_t>())),
@@ -75,10 +76,11 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
dim_embed,
elem_nums,
reinterpret_cast<XPUTypeFP16*>(out.data<fp16_data_t>()));
PD_CHECK(r == 0, "xpu::plugin::speculate_rebuild_append_padding failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::speculate_rebuild_append_padding failed.");
return {out};
case paddle::DataType::FLOAT32:
r = api::plugin::speculate_rebuild_append_padding<float>(
r = fastdeploy::plugin::speculate_rebuild_append_padding<float>(
ctx,
const_cast<float*>(full_hidden_states.data<float>()),
const_cast<int*>(cum_offsets.data<int>()),
@@ -89,7 +91,8 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
dim_embed,
elem_nums,
out.data<float>());
PD_CHECK(r == 0, "xpu::plugin::speculate_rebuild_append_padding failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::speculate_rebuild_append_padding failed.");
return {out};
default:
PD_THROW("Unsupported data type.");
@@ -62,7 +62,7 @@ void SpeculateScheduleCache(const paddle::Tensor &draft_tokens,
}
auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
int r = baidu::xpu::api::plugin::speculate_schedule_cache(
int r = fastdeploy::plugin::speculate_schedule_cache(
ctx,
draft_tokens.data<int64_t>(),
const_cast<int *>(block_tables.data<int>()),
@@ -87,7 +87,7 @@ void SpeculateScheduleCache(const paddle::Tensor &draft_tokens,
block_num_per_seq,
prefill_one_step_stop);
// kernel launch
PD_CHECK(r == 0, "speculate_free_and_reschedule failed.");
PD_CHECK(r == 0, "speculate_schedule_cache failed.");
auto not_need_stop_cpu =
not_need_stop_gpu.copy_to(not_need_stop.place(), true);
@@ -50,7 +50,7 @@ void SpecGetStopFlagsMultiSeqs(const paddle::Tensor &accept_tokens,
int pre_ids_len = pre_ids.shape()[1];
int accept_tokens_len = accept_tokens.shape()[1];
int r = baidu::xpu::api::plugin::speculate_set_stop_value_multi_seqs(
int r = fastdeploy::plugin::speculate_set_stop_value_multi_seqs(
ctx,
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
@@ -67,7 +67,8 @@ void SpecGetStopFlagsMultiSeqs(const paddle::Tensor &accept_tokens,
stop_seqs_bs,
stop_seqs_max_len,
pre_ids_len);
PD_CHECK(r == 0, "xpu::plugin::speculate_set_stop_value_multi_seqs failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::speculate_set_stop_value_multi_seqs failed.");
}
PD_BUILD_STATIC_OP(speculate_set_stop_value_multi_seqs)
@@ -42,7 +42,7 @@ void SpeculateSetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
int length = pre_ids_all_shape[1];
int max_draft_tokens = accept_tokens.shape()[1];
int r = baidu::xpu::api::plugin::speculate_set_value_by_flag_and_id(
int r = fastdeploy::plugin::speculate_set_value_by_flag_and_id(
ctx,
const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
accept_tokens.data<int64_t>(),
@@ -60,7 +60,7 @@ void SpeculateStepPaddleBase(
const int length = input_ids.shape()[1];
const int pre_id_length = pre_ids.shape()[1];
const int max_decoder_block_num = pre_id_length / block_size;
int r = baidu::xpu::api::plugin::speculate_free_and_dispatch_block(
int r = fastdeploy::plugin::speculate_free_and_dispatch_block(
ctx,
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
@@ -88,7 +88,7 @@ void SpeculateStepPaddleBase(
auto recover_lens_cpu = recover_lens.copy_to(paddle::CPUPlace(), false);
int recover_lens_cpu_data = recover_lens_cpu.data<int>()[0];
if (recover_lens_cpu_data > 0) {
r = baidu::xpu::api::plugin::speculate_recover_block(
r = fastdeploy::plugin::speculate_recover_block(
ctx,
const_cast<int *>(recover_block_list.data<int>()),
const_cast<int *>(recover_lens.data<int>()),
@@ -70,7 +70,7 @@ void SpeculateStepSchedule(
paddle::full({1}, 0, paddle::DataType::INT32, stop_flags.place());
auto step_bs_list =
paddle::full({bsz}, 0, paddle::DataType::INT32, stop_flags.place());
int r = baidu::xpu::api::plugin::speculate_free_and_reschedule(
int r = fastdeploy::plugin::speculate_free_and_reschedule(
ctx,
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
@@ -59,7 +59,7 @@ void SpeculateTokenPenaltyMultiScores(
case paddle::DataType::BFLOAT16: {
using XPUType = typename XPUTypeTrait<paddle::bfloat16>::Type;
typedef paddle::bfloat16 data_t;
int r = baidu::xpu::api::plugin::speculate_token_penalty_multi_scores(
int r = fastdeploy::plugin::speculate_token_penalty_multi_scores(
ctx,
pre_ids.data<int64_t>(),
reinterpret_cast<XPUType*>(
@@ -81,12 +81,13 @@ void SpeculateTokenPenaltyMultiScores(
length_bad_words,
token_num,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::token_penalty_multi_scores failed.");
} break;
case paddle::DataType::FLOAT16: {
using XPUType = typename XPUTypeTrait<float16>::Type;
typedef paddle::float16 data_t;
int r = baidu::xpu::api::plugin::speculate_token_penalty_multi_scores(
int r = fastdeploy::plugin::speculate_token_penalty_multi_scores(
ctx,
pre_ids.data<int64_t>(),
reinterpret_cast<XPUType*>(
@@ -108,10 +109,11 @@ void SpeculateTokenPenaltyMultiScores(
length_bad_words,
token_num,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::token_penalty_multi_scores failed.");
} break;
case paddle::DataType::FLOAT32: {
int r = baidu::xpu::api::plugin::speculate_token_penalty_multi_scores(
int r = fastdeploy::plugin::speculate_token_penalty_multi_scores(
ctx,
pre_ids.data<int64_t>(),
const_cast<float*>(logits.data<float>()),
@@ -132,7 +134,8 @@ void SpeculateTokenPenaltyMultiScores(
length_bad_words,
token_num,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
PD_CHECK(r == 0,
"fastdeploy::plugin::token_penalty_multi_scores failed.");
} break;
default:
PD_THROW(
@@ -46,7 +46,7 @@ void SpeculateUpdate(const paddle::Tensor &seq_lens_encoder,
}
auto not_need_stop_xpu = not_need_stop.copy_to(stop_flags.place(), false);
int r = baidu::xpu::api::plugin::speculate_update(
int r = fastdeploy::plugin::speculate_update(
ctx,
const_cast<int *>(seq_lens_encoder.data<int>()),
const_cast<int *>(seq_lens_decoder.data<int>()),
@@ -46,7 +46,7 @@ void SpeculateUpdateV3(const paddle::Tensor &seq_lens_encoder,
}
auto not_need_stop_xpu = not_need_stop.copy_to(stop_flags.place(), false);
int r = baidu::xpu::api::plugin::speculate_update_v3(
int r = fastdeploy::plugin::speculate_update_v3(
ctx,
const_cast<int *>(seq_lens_encoder.data<int>()),
const_cast<int *>(seq_lens_decoder.data<int>()),
@@ -107,7 +107,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
int ret;
if (use_topk) {
if (enable_topp) {
ret = baidu::xpu::api::plugin::speculate_verify<true, true>(
ret = fastdeploy::plugin::speculate_verify<true, true>(
ctx,
sampled_token_ids.data<int64_t>(),
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
@@ -140,7 +140,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
use_target_sampling);
PD_CHECK(ret == 0, "speculate_verify failed.");
} else {
ret = baidu::xpu::api::plugin::speculate_verify<false, true>(
ret = fastdeploy::plugin::speculate_verify<false, true>(
ctx,
sampled_token_ids.data<int64_t>(),
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
@@ -175,7 +175,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
PD_CHECK(ret == 0, "speculate_verify failed.");
} else {
if (enable_topp) {
ret = baidu::xpu::api::plugin::speculate_verify<true, false>(
ret = fastdeploy::plugin::speculate_verify<true, false>(
ctx,
sampled_token_ids.data<int64_t>(),
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
@@ -208,7 +208,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
use_target_sampling);
PD_CHECK(ret == 0, "speculate_verify failed.");
} else {
ret = baidu::xpu::api::plugin::speculate_verify<false, false>(
ret = fastdeploy::plugin::speculate_verify<false, false>(
ctx,
sampled_token_ids.data<int64_t>(),
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
@@ -71,9 +71,9 @@ std::vector<paddle::Tensor> TopPCandidates(
typedef paddle::bfloat16 bf16_data_t;
switch (candidates_len) {
FIXED_TOPK(
r = api::plugin::top_p_candidates<XPUTypeBF16,
TopKMaxLength,
kTopK>(
r = fastdeploy::plugin::top_p_candidates<XPUTypeBF16,
TopKMaxLength,
kTopK>(
ctx,
reinterpret_cast<const XPUTypeBF16*>(probs.data<bf16_data_t>()),
reinterpret_cast<const XPUTypeBF16*>(top_p.data<bf16_data_t>()),
@@ -86,7 +86,7 @@ std::vector<paddle::Tensor> TopPCandidates(
token_num,
candidates_len,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::top_p_candidates failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::top_p_candidates failed.");
return {verify_scores, verify_tokens, actual_candidate_lens});
}
case paddle::DataType::FLOAT16:
@@ -94,9 +94,9 @@ std::vector<paddle::Tensor> TopPCandidates(
typedef paddle::float16 fp16_data_t;
switch (candidates_len) {
FIXED_TOPK(
r = api::plugin::top_p_candidates<XPUTypeFP16,
TopKMaxLength,
kTopK>(
r = fastdeploy::plugin::top_p_candidates<XPUTypeFP16,
TopKMaxLength,
kTopK>(
ctx,
reinterpret_cast<const XPUTypeFP16*>(probs.data<fp16_data_t>()),
reinterpret_cast<const XPUTypeFP16*>(top_p.data<fp16_data_t>()),
@@ -109,25 +109,26 @@ std::vector<paddle::Tensor> TopPCandidates(
token_num,
candidates_len,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::top_p_candidates failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::top_p_candidates failed.");
return {verify_scores, verify_tokens, actual_candidate_lens});
}
case paddle::DataType::FLOAT32:
switch (candidates_len) {
FIXED_TOPK(
r = api::plugin::top_p_candidates<float, TopKMaxLength, kTopK>(
ctx,
probs.data<float>(),
top_p.data<float>(),
output_padding_offset.data<int>(),
verify_tokens.data<int64_t>(),
verify_scores.data<float>(),
actual_candidate_lens.data<int>(),
vocab_size,
token_num,
candidates_len,
max_seq_len);
PD_CHECK(r == 0, "xpu::plugin::top_p_candidates failed.");
r = fastdeploy::plugin::
top_p_candidates<float, TopKMaxLength, kTopK>(
ctx,
probs.data<float>(),
top_p.data<float>(),
output_padding_offset.data<int>(),
verify_tokens.data<int64_t>(),
verify_scores.data<float>(),
actual_candidate_lens.data<int>(),
vocab_size,
token_num,
candidates_len,
max_seq_len);
PD_CHECK(r == 0, "fastdeploy::plugin::top_p_candidates failed.");
return {verify_scores, verify_tokens, actual_candidate_lens});
}
default:
@@ -53,7 +53,7 @@ std::vector<paddle::Tensor> UpdateAttnMaskOffsets(
paddle::DataType::INT32,
ids_remove_padding.place());
baidu::xpu::api::plugin::update_attn_mask_offsets(
int r = fastdeploy::plugin::update_attn_mask_offsets(
ctx,
attn_mask_offsets.data<int>(),
seq_lens_this_time.data<int>(),
@@ -68,6 +68,7 @@ std::vector<paddle::Tensor> UpdateAttnMaskOffsets(
real_bsz,
max_model_len,
decode_states_len);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "update_attn_mask_offsets");
if (ids_remove_padding.is_cpu()) {
delete ctx;
@@ -66,7 +66,7 @@ std::vector<paddle::Tensor> RecoverBatchSequenceKernel(
paddle::Tensor out;
out = paddle::empty({token_num, dim}, x.type(), x.place());
int r = baidu::xpu::api::plugin::eb_recover_batch_sequence<XPUType, XPUType>(
int r = fastdeploy::plugin::eb_recover_batch_sequence<XPUType, XPUType>(
ctx,
reinterpret_cast<const XPUType*>(x.data<data_t>()),
reinterpret_cast<XPUType*>(out.data<data_t>()),
@@ -75,7 +75,7 @@ std::vector<paddle::Tensor> RecoverBatchSequenceKernel(
encoder_batch_map_vp,
decoder_batch_map_vp,
dim);
PD_CHECK(r == 0, "xpu::plugin::eb_recover_batch_sequence failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::eb_recover_batch_sequence failed.");
return {out};
}
@@ -42,7 +42,7 @@ void RecoverDecodeTask(
int r = 0;
if (draft_tokens) {
const int draft_tokens_len = draft_tokens.get_ptr()->shape()[1];
r = baidu::xpu::api::plugin::recover_spec_decode_task(
r = fastdeploy::plugin::recover_spec_decode_task(
xpu_ctx->x_context(),
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
@@ -60,7 +60,7 @@ void RecoverDecodeTask(
draft_tokens_len,
max_draft_tokens * 2 + 1);
} else {
r = baidu::xpu::api::plugin::recover_decode_task(
r = fastdeploy::plugin::recover_decode_task(
xpu_ctx->x_context(),
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
@@ -73,7 +73,7 @@ void RecoverDecodeTask(
block_num_per_seq,
block_size);
}
PD_CHECK(r == 0, "baidu::xpu::api::plugin::recover_decode_task failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::recover_decode_task failed.");
}
PD_BUILD_STATIC_OP(recover_decode_task)
@@ -30,7 +30,7 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
int bs = seq_lens_this_time.shape()[0];
int length = pre_ids_all.shape()[1];
int length_input_ids = input_ids.shape()[1];
int r = baidu::xpu::api::plugin::set_value_by_flags_and_idx(
int r = fastdeploy::plugin::set_value_by_flags_and_idx(
xpu_ctx->x_context(),
stop_flags.data<bool>(),
const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
@@ -41,7 +41,7 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
bs,
length,
length_input_ids);
PD_CHECK(r == 0, "xpu::plugin::set_value_by_flags_and_idx failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::set_value_by_flags_and_idx failed.");
}
PD_BUILD_OP(set_value_by_flags_and_idx)
@@ -41,26 +41,25 @@ void SpeculateLimitThinkingContentLength(const paddle::Tensor& next_tokens,
const int eos_token_id_len = eos_token_ids.shape()[0];
const int inject_len = inject_token_ids.shape()[0];
int r =
baidu::xpu::api::plugin::speculate_limit_thinking_content_length_kernel(
xpu_ctx->x_context(),
const_cast<int64_t*>(next_tokens.data<int64_t>()),
max_think_lens.data<int>(),
const_cast<int*>(max_reply_lens.data<int>()),
const_cast<int64_t*>(step_idx.data<int64_t>()),
eos_token_ids.data<int64_t>(),
const_cast<int*>(limit_status.data<int>()),
const_cast<int*>(accept_num.data<int>()),
stop_flags.data<bool>(),
think_end_id,
(inject_len > 0) ? inject_token_ids.data<int64_t>() : nullptr,
tokens_per_step,
batch_size,
eos_token_id_len,
inject_len,
splitwise_role_is_decode);
int r = fastdeploy::plugin::speculate_limit_thinking_content_length_kernel(
xpu_ctx->x_context(),
const_cast<int64_t*>(next_tokens.data<int64_t>()),
max_think_lens.data<int>(),
const_cast<int*>(max_reply_lens.data<int>()),
const_cast<int64_t*>(step_idx.data<int64_t>()),
eos_token_ids.data<int64_t>(),
const_cast<int*>(limit_status.data<int>()),
const_cast<int*>(accept_num.data<int>()),
stop_flags.data<bool>(),
think_end_id,
(inject_len > 0) ? inject_token_ids.data<int64_t>() : nullptr,
tokens_per_step,
batch_size,
eos_token_id_len,
inject_len,
splitwise_role_is_decode);
PD_CHECK(r == 0,
"baidu::xpu::api::plugin::"
"fastdeploy::plugin::"
"speculate_limit_thinking_content_length_kernel failed.");
}
@@ -55,7 +55,7 @@ void StepPaddle(const paddle::Tensor &stop_flags,
const int length = input_ids.shape()[1];
const int pre_id_length = pre_ids.shape()[1];
const int max_decoder_block_num = pre_id_length / block_size;
int r = baidu::xpu::api::plugin::free_and_dispatch_block(
int r = fastdeploy::plugin::free_and_dispatch_block(
xpu_ctx->x_context(),
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int *>(seq_lens_this_time.data<int>()),
@@ -81,7 +81,7 @@ void StepPaddle(const paddle::Tensor &stop_flags,
auto recover_lens_cpu = recover_lens.copy_to(paddle::CPUPlace(), false);
int recover_lens_cpu_data = recover_lens_cpu.data<int>()[0];
if (recover_lens_cpu_data > 0) {
r = baidu::xpu::api::plugin::recover_block(
r = fastdeploy::plugin::recover_block(
xpu_ctx->x_context(),
const_cast<int *>(recover_block_list.data<int>()),
const_cast<int *>(recover_lens.data<int>()),
@@ -39,7 +39,7 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
std::vector<int64_t> shape = topk_ids.shape();
int64_t bs_now = shape[0];
int64_t end_length = end_ids.shape()[0];
int r = baidu::xpu::api::plugin::set_stop_value_multi_ends<int64_t>(
int r = fastdeploy::plugin::set_stop_value_multi_ends<int64_t>(
xpu_ctx->x_context(),
const_cast<bool *>(stop_flags.data<bool>()),
const_cast<int64_t *>(topk_ids.data<int64_t>()),
@@ -49,7 +49,7 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
bs_now,
end_length,
beam_search);
PD_CHECK(r == 0, "xpu::plugin::set_stop_value_multi_ends failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::set_stop_value_multi_ends failed.");
}
PD_BUILD_OP(set_stop_value_multi_ends)
@@ -42,7 +42,7 @@ std::vector<paddle::Tensor> TextImageGatherScatter(
case paddle::DataType::BFLOAT16: {
using XPUType = typename XPUTypeTrait<bfloat16>::Type;
typedef paddle::bfloat16 data_t;
int r = baidu::xpu::api::plugin::text_image_gather_scatter<XPUType>(
int r = fastdeploy::plugin::text_image_gather_scatter<XPUType>(
xpu_ctx->x_context(),
reinterpret_cast<XPUType*>(input.data<data_t>()),
reinterpret_cast<XPUType*>(text_input.data<data_t>()),
@@ -28,7 +28,7 @@ void TextImageIndexOut(const paddle::Tensor& token_type_ids,
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
const int64_t token_num = token_type_ids.shape()[0];
int r = baidu::xpu::api::plugin::text_image_index_out(
int r = fastdeploy::plugin::text_image_index_out(
xpu_ctx->x_context(),
token_type_ids.data<int32_t>(),
const_cast<int32_t*>(text_index.data<int32_t>()),
@@ -39,7 +39,7 @@ void UpdateInputs(const paddle::Tensor& stop_flags,
const int input_ids_stride = input_ids.shape()[1];
auto not_need_stop_xpu = not_need_stop.copy_to(stop_flags.place(), false);
int r = baidu::xpu::api::plugin::update_inputs(
int r = fastdeploy::plugin::update_inputs(
xpu_ctx->x_context(),
const_cast<bool*>(not_need_stop_xpu.data<bool>()),
const_cast<int*>(seq_lens_this_time.data<int>()),
@@ -52,7 +52,7 @@ void UpdateInputs(const paddle::Tensor& stop_flags,
now_bsz,
max_bsz,
input_ids_stride);
PD_CHECK(r == 0, "baidu::xpu::api::plugin::update_inputs failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::update_inputs failed.");
auto not_need_stop_cpu =
not_need_stop_xpu.copy_to(not_need_stop.place(), false);
bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
@@ -44,7 +44,7 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags,
const int input_ids_stride = input_ids.shape()[1];
const int block_num_per_seq = block_tables.shape()[1];
auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
int r = baidu::xpu::api::plugin::update_inputs_v1(
int r = fastdeploy::plugin::update_inputs_v1(
xpu_ctx->x_context(),
const_cast<bool*>(not_need_stop_gpu.data<bool>()),
const_cast<int*>(seq_lens_this_time.data<int>()),
@@ -63,7 +63,7 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags,
input_ids_stride,
block_num_per_seq,
block_size);
PD_CHECK(r == 0, "baidu::xpu::api::plugin::update_inputs_kernel_v1 failed.");
PD_CHECK(r == 0, "fastdeploy::plugin::update_inputs_kernel_v1 failed.");
auto not_need_stop_cpu =
not_need_stop_gpu.copy_to(not_need_stop.place(), false);
bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
@@ -34,15 +34,14 @@ std::vector<paddle::Tensor> WeightQuantizeKernel(const paddle::Tensor &x,
if (algo == "weight_only_int8") {
paddle::Tensor out =
paddle::full({k, n}, 0, paddle::DataType::INT8, x.place());
int ret =
baidu::xpu::api::plugin::quant2d_per_channel<XPUType, float, int8_t>(
xpu_ctx->x_context(),
reinterpret_cast<const XPUType *>(x.template data<T>()),
nullptr,
out.data<int8_t>(),
scale.data<float>(),
k,
n);
int ret = fastdeploy::plugin::quant2d_per_channel<XPUType, float, int8_t>(
xpu_ctx->x_context(),
reinterpret_cast<const XPUType *>(x.template data<T>()),
nullptr,
out.data<int8_t>(),
scale.data<float>(),
k,
n);
PD_CHECK(ret == 0);
return {out, scale};
} else if (algo == "weight_only_int4") {
@@ -18,13 +18,17 @@
#pragma once
#include "xpu/xdnn.h"
namespace baidu {
namespace xpu {
namespace api {
namespace fd_xpu3 {
typedef xpu3::int64_t int64_t;
}
namespace fastdeploy {
namespace plugin {
namespace api = baidu::xpu::api;
template <typename T>
DLL_EXPORT int set_stop_value_multi_ends(Context* ctx,
DLL_EXPORT int set_stop_value_multi_ends(api::Context* ctx,
bool* stop_flags,
T* topk_ids,
T* next_tokens,
@@ -34,7 +38,7 @@ DLL_EXPORT int set_stop_value_multi_ends(Context* ctx,
const int end_length,
const bool beam_search);
DLL_EXPORT int set_value_by_flags_and_idx(Context* ctx,
DLL_EXPORT int set_value_by_flags_and_idx(api::Context* ctx,
const bool* stop_flags,
int64_t* pre_ids_all,
const int64_t* input_ids,
@@ -46,7 +50,7 @@ DLL_EXPORT int set_value_by_flags_and_idx(Context* ctx,
int length_input_ids);
template <typename T>
DLL_EXPORT int token_penalty_multi_scores(Context* ctx,
DLL_EXPORT int token_penalty_multi_scores(api::Context* ctx,
const int64_t* pre_ids,
T* logits,
const T* penalty_scores,
@@ -63,7 +67,7 @@ DLL_EXPORT int token_penalty_multi_scores(Context* ctx,
const int64_t end_length,
const int64_t length_bad_words);
DLL_EXPORT int get_padding_offset(Context* ctx,
DLL_EXPORT int get_padding_offset(api::Context* ctx,
int* padding_offset,
int* cum_offsets_out,
int* cu_seqlens_q,
@@ -75,7 +79,7 @@ DLL_EXPORT int get_padding_offset(Context* ctx,
const int max_seq_len,
const int bs);
DLL_EXPORT int speculate_get_padding_offset(Context* ctx,
DLL_EXPORT int speculate_get_padding_offset(api::Context* ctx,
int* batch_id_per_token,
int* cum_offsets_out,
int* cu_seqlens_q,
@@ -117,7 +121,7 @@ DLL_EXPORT int draft_model_preprocess(api::Context* ctx,
const bool splitwise_prefill,
const bool kvcache_scheduler_v1);
DLL_EXPORT int update_inputs(Context* ctx,
DLL_EXPORT int update_inputs(api::Context* ctx,
bool* not_need_stop,
int* seq_lens_this_time,
int* seq_lens_encoder,
@@ -130,7 +134,7 @@ DLL_EXPORT int update_inputs(Context* ctx,
const int max_bsz,
const int input_ids_stride);
DLL_EXPORT int free_and_dispatch_block(Context* ctx,
DLL_EXPORT int free_and_dispatch_block(api::Context* ctx,
bool* stop_flags,
int* seq_lens_this_time,
int* seq_lens_decoder,
@@ -153,7 +157,7 @@ DLL_EXPORT int free_and_dispatch_block(Context* ctx,
const int max_decoder_block_num);
DLL_EXPORT int speculate_free_and_dispatch_block(
Context* ctx,
api::Context* ctx,
bool* stop_flags,
int* seq_lens_this_time,
int* seq_lens_decoder,
@@ -177,7 +181,7 @@ DLL_EXPORT int speculate_free_and_dispatch_block(
const int max_decoder_block_num,
const int max_draft_tokens);
DLL_EXPORT int recover_block(Context* ctx,
DLL_EXPORT int recover_block(api::Context* ctx,
int* recover_block_list, // [bsz]
int* recover_len,
bool* stop_flags,
@@ -200,7 +204,7 @@ DLL_EXPORT int recover_block(Context* ctx,
const int length,
const int pre_id_length);
DLL_EXPORT int speculate_recover_block(Context* ctx,
DLL_EXPORT int speculate_recover_block(api::Context* ctx,
int* recover_block_list, // [bsz]
int* recover_len,
bool* stop_flags,
@@ -224,7 +228,7 @@ DLL_EXPORT int speculate_recover_block(Context* ctx,
const int length,
const int pre_id_length);
DLL_EXPORT int recover_decode_task(Context* ctx,
DLL_EXPORT int recover_decode_task(api::Context* ctx,
bool* stop_flags,
int* seq_lens_this_time,
int* seq_lens_encoder,
@@ -236,7 +240,7 @@ DLL_EXPORT int recover_decode_task(Context* ctx,
const int block_num_per_seq,
const int block_size);
DLL_EXPORT int recover_spec_decode_task(Context* ctx,
DLL_EXPORT int recover_spec_decode_task(api::Context* ctx,
bool* stop_flags,
int* seq_lens_this_time,
int* seq_lens_encoder,
@@ -253,7 +257,7 @@ DLL_EXPORT int recover_spec_decode_task(Context* ctx,
const int draft_tokens_len,
const int num_extra_tokens);
DLL_EXPORT int update_inputs_v1(Context* ctx,
DLL_EXPORT int update_inputs_v1(api::Context* ctx,
bool* not_need_stop,
int* seq_lens_this_time,
int* seq_lens_encoder,
@@ -274,45 +278,45 @@ DLL_EXPORT int update_inputs_v1(Context* ctx,
template <typename TX, typename TY>
DLL_EXPORT int eb_adjust_batch(
Context* ctx,
api::Context* ctx,
const TX* x,
TY* y,
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
VectorParam<int32_t>& encoder_batch_map, // NOLINT
VectorParam<int32_t>& decoder_batch_map, // NOLINT
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
int64_t hidden_dim);
template <typename TX, typename TY>
DLL_EXPORT int eb_gather_next_token(
Context* ctx,
api::Context* ctx,
const TX* x,
TY* y,
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
VectorParam<int32_t>& encoder_batch_map, // NOLINT
VectorParam<int32_t>& decoder_batch_map, // NOLINT
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
int64_t hidden_dim);
template <typename TX, typename TY>
DLL_EXPORT int eb_mtp_gather_next_token(
Context* ctx,
api::Context* ctx,
const TX* x,
TY* y,
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
VectorParam<int32_t>& encoder_batch_map, // NOLINT
VectorParam<int32_t>& decoder_batch_map, // NOLINT
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
int64_t hidden_dim);
template <typename TX, typename TY>
DLL_EXPORT int eb_recover_batch_sequence(
Context* ctx,
api::Context* ctx,
const TX* x,
TY* y,
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
VectorParam<int32_t>& encoder_batch_map, // NOLINT
VectorParam<int32_t>& decoder_batch_map, // NOLINT
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
int64_t hidden_dim);
template <typename TX, typename TSCALE = float, typename TY = int8_t>
@@ -324,7 +328,7 @@ DLL_EXPORT int quant2d_per_channel(api::Context* ctx,
int64_t m,
int64_t n);
DLL_EXPORT int text_image_index_out(Context* ctx,
DLL_EXPORT int text_image_index_out(api::Context* ctx,
const int* token_type_ids, // x
int* text_index, // y1
int* image_index, // y2
@@ -372,7 +376,7 @@ DLL_EXPORT int limit_thinking_content_length_kernel_v2(
template <typename T>
DLL_EXPORT int speculate_token_penalty_multi_scores(
Context* ctx,
api::Context* ctx,
const int64_t* pre_ids,
T* logits,
const T* penalty_scores,
@@ -392,7 +396,7 @@ DLL_EXPORT int speculate_token_penalty_multi_scores(
const int64_t length_bad_words,
const int64_t token_num,
const int64_t max_seq_len);
DLL_EXPORT int mtp_free_and_dispatch_block(Context* ctx,
DLL_EXPORT int mtp_free_and_dispatch_block(api::Context* ctx,
bool* base_model_stop_flags,
bool* stop_flags,
bool* batch_drop,
@@ -409,7 +413,7 @@ DLL_EXPORT int mtp_free_and_dispatch_block(Context* ctx,
const int max_draft_tokens);
template <bool ENABLE_TOPP, bool USE_TOPK>
DLL_EXPORT int speculate_verify(Context* ctx,
DLL_EXPORT int speculate_verify(api::Context* ctx,
const int64_t* sampled_token_ids,
int64_t* accept_tokens,
int* accept_num,
@@ -440,19 +444,19 @@ DLL_EXPORT int speculate_verify(Context* ctx,
const bool accept_all_drafts,
const bool use_target_sampling);
DLL_EXPORT int speculate_clear_accept_nums(Context* ctx,
DLL_EXPORT int speculate_clear_accept_nums(api::Context* ctx,
int* accept_num,
const int* seq_lens_decoder,
const int max_bsz);
DLL_EXPORT int speculate_get_seq_lens_output(Context* ctx,
DLL_EXPORT int speculate_get_seq_lens_output(api::Context* ctx,
int* seq_lens_output,
const int* seq_lens_this_time,
const int* seq_lens_encoder,
const int* seq_lens_decoder,
const int real_bsz);
DLL_EXPORT int draft_model_update(Context* ctx,
DLL_EXPORT int draft_model_update(api::Context* ctx,
const int64_t* inter_next_tokens,
int64_t* draft_tokens,
int64_t* pre_ids,
@@ -475,7 +479,7 @@ DLL_EXPORT int draft_model_update(Context* ctx,
const int substep,
const bool prefill_one_step_stop);
DLL_EXPORT int speculate_set_stop_value_multi_seqs(Context* ctx,
DLL_EXPORT int speculate_set_stop_value_multi_seqs(api::Context* ctx,
bool* stop_flags,
int64_t* accept_tokens,
int* accept_nums,
@@ -504,7 +508,7 @@ DLL_EXPORT int speculate_rebuild_append_padding(api::Context* ctx,
T* out);
template <typename T>
DLL_EXPORT int speculate_remove_padding(Context* ctx,
DLL_EXPORT int speculate_remove_padding(api::Context* ctx,
T* x_remove_padding,
const T* input_ids,
const T* draft_tokens,
@@ -536,7 +540,7 @@ DLL_EXPORT int compute_order(api::Context* ctx,
const int actual_draft_token_num,
const int input_token_num);
DLL_EXPORT int draft_model_postprocess(Context* ctx,
DLL_EXPORT int draft_model_postprocess(api::Context* ctx,
const int64_t* base_model_draft_tokens,
int* base_model_seq_lens_this_time,
const int* base_model_seq_lens_encoder,
@@ -544,7 +548,7 @@ DLL_EXPORT int draft_model_postprocess(Context* ctx,
int bsz,
int base_model_draft_token_len);
DLL_EXPORT int speculate_set_value_by_flag_and_id(Context* ctx,
DLL_EXPORT int speculate_set_value_by_flag_and_id(api::Context* ctx,
int64_t* pre_ids_all,
const int64_t* accept_tokens,
int* accept_num,
@@ -557,7 +561,7 @@ DLL_EXPORT int speculate_set_value_by_flag_and_id(Context* ctx,
int max_draft_tokens);
DLL_EXPORT int speculate_get_output_padding_offset(
Context* ctx,
api::Context* ctx,
int* output_padding_offset,
int* output_cum_offsets,
const int* output_cum_offsets_tmp,
@@ -578,7 +582,7 @@ DLL_EXPORT int top_p_candidates(api::Context* ctx,
int max_cadidate_len,
int max_seq_len);
DLL_EXPORT int speculate_free_and_reschedule(Context* ctx,
DLL_EXPORT int speculate_free_and_reschedule(api::Context* ctx,
bool* stop_flags,
int* seq_lens_this_time,
int* seq_lens_decoder,
@@ -601,7 +605,7 @@ DLL_EXPORT int speculate_free_and_reschedule(Context* ctx,
const int max_decoder_block_num,
const int max_draft_tokens);
DLL_EXPORT int speculate_schedule_cache(Context* ctx,
DLL_EXPORT int speculate_schedule_cache(api::Context* ctx,
const int64_t* draft_tokens,
int* block_tables,
bool* stop_flags,
@@ -625,7 +629,7 @@ DLL_EXPORT int speculate_schedule_cache(Context* ctx,
const int block_num_per_seq,
const bool prefill_one_step_stop);
DLL_EXPORT int speculate_update_v3(Context* ctx,
DLL_EXPORT int speculate_update_v3(api::Context* ctx,
int* seq_lens_encoder,
int* seq_lens_decoder,
bool* not_need_stop,
@@ -641,7 +645,7 @@ DLL_EXPORT int speculate_update_v3(Context* ctx,
const int max_bsz,
const int max_draft_tokens);
DLL_EXPORT int speculate_update(Context* ctx,
DLL_EXPORT int speculate_update(api::Context* ctx,
int* seq_lens_encoder,
int* seq_lens_decoder,
bool* not_need_stop,
@@ -674,7 +678,7 @@ DLL_EXPORT int rebuild_self_hidden_states(api::Context* ctx,
int dim_embed,
int elem_cnt);
DLL_EXPORT int speculate_get_logits(Context* ctx,
DLL_EXPORT int speculate_get_logits(api::Context* ctx,
float* draft_logits,
int* next_token_num,
int* batch_token_num,
@@ -687,7 +691,7 @@ DLL_EXPORT int speculate_get_logits(Context* ctx,
const int real_bsz,
const int vocab_size);
DLL_EXPORT int update_attn_mask_offsets(Context* ctx,
DLL_EXPORT int update_attn_mask_offsets(api::Context* ctx,
int* attn_mask_offsets,
const int* seq_lens_this_time,
const int* seq_lens_encoder,
@@ -723,6 +727,4 @@ DLL_EXPORT int speculate_limit_thinking_content_length_kernel(
* --------------------------------------------*/
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
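
Taken together, the new layout separates the two layers cleanly: the header
above declares wrapper entry points in fastdeploy::plugin (reaching vendor
types via the `api` alias), while the kernel files below collapse the old
nested xpu3::plugin into the single fd_xpu3 namespace. A plain-C++ toy of
that layering (stand-in names and host code only; the real fd_xpu3 layer is
XPU device code):

#include <cstdint>
#include <cstdio>

namespace baidu { namespace xpu { namespace api {
struct Context {};  // stand-in for the vendor context
}}}  // namespace baidu::xpu::api

namespace fd_xpu3 {
// Stand-in for a device kernel; the real ones are __global__ XPU kernels.
inline void scale_kernel(int64_t* data, int64_t n, int64_t factor) {
  for (int64_t i = 0; i < n; ++i) data[i] *= factor;
}
}  // namespace fd_xpu3

namespace fastdeploy {
namespace plugin {
namespace api = baidu::xpu::api;

// Wrapper layer: argument checks here, dispatch into fd_xpu3 below it.
inline int scale(api::Context* ctx, int64_t* data, int64_t n, int64_t factor) {
  if (ctx == nullptr || data == nullptr || n < 0) return -1;
  fd_xpu3::scale_kernel(data, n, factor);
  return 0;
}
}  // namespace plugin
}  // namespace fastdeploy

int main() {
  baidu::xpu::api::Context ctx;
  int64_t v[3] = {1, 2, 3};
  int r = fastdeploy::plugin::scale(&ctx, v, 3, 10);
  std::printf("r=%d v0=%lld\n", r, static_cast<long long>(v[0]));
  return r;
}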
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
inline __device__ void update_bad_words_logit(_global_ptr_ T* logits) {
@@ -54,5 +53,4 @@ __global__ void ban_bad_words(T* logits,
_XPU_DEF__BAN_BAD_WORDS_(float);
_XPU_DEF__BAN_BAD_WORDS_(float16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -1,8 +1,7 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_debug.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
#define MAX_LM_SIZE 28672
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
@@ -134,5 +133,4 @@ _XPU_DEF__EB_ADJUST_BATCH_(bfloat16, float);
_XPU_DEF__EB_ADJUST_BATCH_(float, bfloat16);
_XPU_DEF__EB_ADJUST_BATCH_(int32_t, int32_t);
_XPU_DEF__EB_ADJUST_BATCH_(int64_t, int64_t);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -1,8 +1,7 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_debug.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
#define MAX_LM_SIZE 28672
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
@@ -98,5 +97,4 @@ _XPU_DEF__EB_GATHER_NEXT_TOKEN(bfloat16, float16);
_XPU_DEF__EB_GATHER_NEXT_TOKEN(float16, bfloat16);
_XPU_DEF__EB_GATHER_NEXT_TOKEN(bfloat16, float);
_XPU_DEF__EB_GATHER_NEXT_TOKEN(float, bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
int ret;
@@ -322,5 +321,4 @@ __global__ void free_and_dispatch_block(bool *stop_flags,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void get_padding_offset(int *batch_id_per_token,
int *cum_offsets_out,
@@ -49,5 +48,4 @@ __global__ void get_padding_offset(int *batch_id_per_token,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -7,8 +7,7 @@
#include "xpu/kernel/xtdk.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
static inline __device__ bool is_in_end(const T id,
@@ -94,5 +93,4 @@ __global__ void limit_thinking_content_length_kernel_v1(
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -7,8 +7,7 @@
#include "xpu/kernel/xtdk.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void limit_thinking_content_length_kernel_v2(
int64_t* next_tokens,
@@ -89,5 +88,4 @@ __global__ void limit_thinking_content_length_kernel_v2(
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void min_length_logits_process(T* logits,
@@ -64,5 +63,4 @@ __global__ void min_length_logits_process(T* logits,
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float);
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void ComputeOrderKernel(const int* seq_lens_this_time,
const int* seq_lens_encoder,
@@ -112,5 +111,4 @@ __global__ void ComputeOrderKernel(const int* seq_lens_this_time,
LM2GM(&out_offset, output_token_num, sizeof(int));
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void ComputeSelfOrderKernel(const int* last_seq_lens_this_time,
const int* seq_lens_this_time,
@@ -69,5 +68,4 @@ __global__ void ComputeSelfOrderKernel(const int* last_seq_lens_this_time,
LM2GM(&out_offset, output_token_num, sizeof(int));
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -4,8 +4,7 @@
#include "xpu/kernel/cluster_primitive_template.h"
#include "xpu/kernel/cluster_simd.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static inline __device__ int v_reduce(int32x16_t v) {
auto v0 = vsrlp_int32x16(256, v);
@@ -185,5 +184,4 @@ __global__ void draft_model_postprocess(const int64_t* base_model_draft_tokens,
sync_cluster();
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -4,8 +4,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_simd.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void draft_model_preprocess(int64_t* draft_tokens,
int64_t* input_ids,
bool* stop_flags,
@@ -235,5 +234,4 @@ __global__ void draft_model_preprocess(int64_t* draft_tokens,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
inline __device__ bool is_in_end(const int64_t id,
const __global_ptr__ int64_t* end_ids,
int length) {
@@ -108,5 +107,4 @@ __global__ void draft_model_update(const int64_t* inter_next_tokens,
}
mfence();
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -1,8 +1,7 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_debug.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
#define MAX_LM_SIZE 28672
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
@@ -125,5 +124,5 @@ _XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(bfloat16, float16);
_XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(float16, bfloat16);
_XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(bfloat16, float);
_XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(float, bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -1,8 +1,7 @@
#include "xpu/kernel/cluster.h"
#include "xpu/kernel/cluster_debug.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
#define MAX_LM_SIZE 28672
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
@@ -125,5 +124,5 @@ _XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(bfloat16, float16);
_XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(float16, bfloat16);
_XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(bfloat16, float);
_XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(float, bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
int ret;
@@ -205,5 +204,4 @@ __global__ void mtp_free_and_dispatch_block(bool *base_model_stop_flags,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void RebuildAppendPaddingKernel(const T *full_hidden_states,
@@ -86,5 +85,4 @@ _XPU_DEF_REBUILD_APPEND_PADDING_KERNEL(bfloat16);
_XPU_DEF_REBUILD_APPEND_PADDING_KERNEL(float16);
_XPU_DEF_REBUILD_APPEND_PADDING_KERNEL(float);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void rebuildHiddenStatesKernel(const T* input,
@@ -61,5 +60,4 @@ _XPU_DEF_REBUILD_HIDDEN_STATES_KERNEL(bfloat16);
_XPU_DEF_REBUILD_HIDDEN_STATES_KERNEL(float);
_XPU_DEF_REBUILD_HIDDEN_STATES_KERNEL(float16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void rebuildSelfHiddenStatesKernel(
@@ -52,5 +51,4 @@ _XPU_DEF_REBUILD_SELF_HIDDEN_STATES_KERNEL(bfloat16);
_XPU_DEF_REBUILD_SELF_HIDDEN_STATES_KERNEL(float);
_XPU_DEF_REBUILD_SELF_HIDDEN_STATES_KERNEL(float16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void recover_spec_decode_task(
bool *stop_flags,
@@ -71,5 +70,4 @@ __attribute__((global)) void recover_spec_decode_task(
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
inline __device__ void update_bad_words_logit(_global_ptr_ T* logits) {
@@ -74,5 +73,4 @@ _XPU_DEF__BAN_BAD_WORDS_(float);
_XPU_DEF__BAN_BAD_WORDS_(float16);
_XPU_DEF__BAN_BAD_WORDS_(bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void speculate_clear_accept_nums(int* accept_num,
const int* seq_lens_decoder,
@@ -40,5 +39,4 @@ __global__ void speculate_clear_accept_nums(int* accept_num,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
int ret;
@@ -333,5 +332,4 @@ __global__ void speculate_free_and_dispatch_block(
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
int ret;
@@ -284,5 +283,4 @@ __global__ void speculate_free_and_reschedule(bool *stop_flags,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__device__ void prefix_sum(__shared_ptr__ int* sm_seq_lens_encoder,
__shared_ptr__ int* sm_seq_lens_this_time,
@@ -127,5 +126,4 @@ __global__ void speculate_get_logits(float* draft_logits,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void speculate_get_output_padding_offset(
int* output_padding_offset,
@@ -59,5 +58,4 @@ __global__ void speculate_get_output_padding_offset(
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -21,8 +21,7 @@
#include "xpu/kernel/cluster_simd.h"
#include "xpu/kernel/xtdk.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void speculate_remove_padding(T* output_data,
@@ -118,5 +117,4 @@ _XPU_DEF_SPECULATE_KERNELS_(float16);
_XPU_DEF_SPECULATE_KERNELS_(bfloat16);
_XPU_DEF_SPECULATE_KERNELS_(int64_t);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void speculate_get_seq_lens_output(int* seq_lens_output,
const int* seq_lens_this_time,
@@ -54,5 +53,4 @@ __global__ void speculate_get_seq_lens_output(int* seq_lens_output,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
__global__ void speculate_min_length_logits_process(
@@ -87,5 +86,4 @@ _XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float);
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float16);
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int* ptr) {
int ret;
@@ -160,5 +159,4 @@ __global__ void speculate_recover_block(int* recover_block_list, // [bsz]
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
int res;
@@ -175,5 +174,4 @@ __global__ void speculate_schedule_cache(const int64_t *draft_tokens,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -4,8 +4,7 @@
#include "xpu/kernel/xtdk_math.h"
#include "xpu/kernel/xtdk_simd.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void speculate_set_stop_value_multi_seqs(bool *stop_flags,
int64_t *accept_tokens,
@@ -99,5 +98,5 @@ __global__ void speculate_set_stop_value_multi_seqs(bool *stop_flags,
}
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
const int64_t *accept_tokens,
@@ -84,5 +83,4 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
int res;
@@ -198,5 +197,4 @@ template __global__ void speculate_update<512>(int *seq_lens_encoder,
const int max_bsz,
const int max_draft_tokens);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ void atomic_add(_shared_ptr_ int *ptr, int v) {
bool fail = true;
@@ -264,5 +263,4 @@ __global__ void speculate_update_repeat_times(const int64_t *pre_ids,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
int res;
@@ -198,5 +197,4 @@ template __global__ void speculate_update_v3<512>(int *seq_lens_encoder,
const int max_bsz,
const int max_draft_tokens);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__device__ void do_cast(const int *xlm, float *ylm, int64_t len) {
for (int64_t i = 0; i < len; i += 32) {
@@ -279,5 +278,4 @@ _XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float);
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float16);
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -6,8 +6,8 @@
// #include "xpu/internal/aten/xrand_philox4x32_10.h"
// #include "xpu/internal/aten/xrand_uniform.h"
// #include "xpu/internal/aten/xrand_global.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
int res;
v1 = vvadd_int32x16(v0, v1);
@@ -380,5 +380,5 @@ SPECULATE_VERIFY_INSTANTIATE(true, true)
SPECULATE_VERIFY_INSTANTIATE(true, false)
SPECULATE_VERIFY_INSTANTIATE(false, true)
SPECULATE_VERIFY_INSTANTIATE(false, false)
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T, int MaxLength, int TopPBeamTopK>
__device__ void top_p_candidates_big_n(
@@ -345,5 +344,4 @@ _XPU_DEF_TOP_P_CANDIDATES_KERNEL(float, 2, 5);
_XPU_DEF_TOP_P_CANDIDATES_KERNEL(float, 2, 8);
_XPU_DEF_TOP_P_CANDIDATES_KERNEL(float, 2, 10);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void update_attn_mask_offsets(int* attn_mask_offsets,
const int* seq_lens_this_time,
@@ -73,5 +72,4 @@ __global__ void update_attn_mask_offsets(int* attn_mask_offsets,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -5,8 +5,7 @@
// TODO()
// #include "xpu/quant_xpu.h"
// #include "xpu_plugin.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
#define MAX_SM_SIZE 32768
// One core has 32KB LMgroup LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
// the stack space
@@ -1065,5 +1064,4 @@ _XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(float, float, int8_t, 64);
_XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(float16, float, int8_t, 128);
_XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(bfloat16, float, int8_t, 128);
_XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(float, float, int8_t, 128);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ inline int loada_float(_shared_ptr_ const int* ptr) {
int ret;
@@ -150,5 +149,4 @@ __global__ void recover_block(int* recover_block_list, // [bsz]
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void recover_decode_task(bool *stop_flags,
int *seq_lens_this_time,
@@ -39,5 +38,4 @@ __global__ void recover_decode_task(bool *stop_flags,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void remove_padding(int64_t *x_remove_padding,
const int64_t *input_data,
@@ -36,5 +35,4 @@ __global__ void remove_padding(int64_t *x_remove_padding,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/xtdk_math.h"
#include "xpu/kernel/xtdk_simd.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
static inline __device__ bool is_in_end(const T id,
@@ -97,5 +96,4 @@ __global__ void set_stop_value_multi_ends(bool* stop_flags,
const bool prefill_one_step_stop);
_XPU_DEF__SET_VALUE_BY_FLAGS_BOTH_(int64_t);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -1,6 +1,5 @@
#include "xpu/kernel/cluster.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void set_value_by_flags_and_idx(const bool* stop_flags,
int64_t* pre_ids_all,
@@ -46,5 +45,4 @@ __global__ void set_value_by_flags_and_idx(const bool* stop_flags,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename T>
static __device__ inline void text_image_gather(
@@ -215,5 +214,4 @@ __global__ void text_image_gather_scatter(T* input,
_XPU_DEF_TEXT_IMAGE_GATHER_SCATTER(bfloat16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -20,8 +20,7 @@
#include "xpu/kernel/cluster_primitive.h"
#include "xpu/kernel/cluster_primitive_template.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ void do_calc(const _shared_ptr_ int* lm_x,
int* lm_y1,
@@ -110,5 +109,4 @@ __global__ void text_image_index_out_kernel(const int* token_type_ids, // x
buffer_ptr_y2.gm_store(image_index + i, read_size);
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void update_inputs(bool *not_need_stop,
int *seq_lens_this_time,
@@ -71,5 +70,4 @@ __global__ void update_inputs(bool *not_need_stop,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -7,8 +7,7 @@
#include "xpu/kernel/xtdk.h"
#include "xpu/kernel/xtdk_io.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__global__ void update_inputs_v1(bool* not_need_stop,
int* seq_lens_this_time,
@@ -148,5 +147,4 @@ __global__ void update_inputs_v1(bool* not_need_stop,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -2,8 +2,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
static __device__ void atomic_add(_shared_ptr_ int *ptr, int v) {
bool fail = true;
@@ -71,5 +70,4 @@ __global__ void update_repeat_times(const int64_t *pre_ids,
}
}
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -3,8 +3,7 @@
#include "xpu/kernel/cluster_partition.h"
#include "xpu/kernel/cluster_primitive.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__device__ void do_cast(const int *xlm, float *ylm, int64_t len) {
for (int64_t i = 0; i < len; i += 32) {
@@ -222,5 +221,4 @@ __global__ void update_value_by_repeat_times_simd(
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float);
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float16);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
@@ -17,8 +17,7 @@
#include "xpu/refactor/impl_public/wrapper_check.h"
#include "xpu/xdnn.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename TX, typename TY>
__attribute__((global)) void eb_adjust_batch(TX *src,
TY *dst,
@@ -29,12 +28,9 @@ __attribute__((global)) void eb_adjust_batch(TX *src,
int en_batch,
int de_batch,
int64_t copy_size);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
template <typename TX, typename TY>
@@ -93,10 +89,10 @@ static int xpu3_wrapper(api::Context *ctx,
int en_batch,
int de_batch,
int64_t hidden_dim) {
using XPU_INDEX_TYPE_TX = typename XPUIndexType<TX>::type;
using XPU_INDEX_TYPE_TY = typename XPUIndexType<TY>::type;
using XPU_INDEX_TYPE_TX = typename api::XPUIndexType<TX>::type;
using XPU_INDEX_TYPE_TY = typename api::XPUIndexType<TY>::type;
auto eb_adjust_batch_kernel =
xpu3::plugin::eb_adjust_batch<XPU_INDEX_TYPE_TX, XPU_INDEX_TYPE_TY>;
fd_xpu3::eb_adjust_batch<XPU_INDEX_TYPE_TX, XPU_INDEX_TYPE_TY>;
// NOTE: Don't change 16 to 64, because kernel use gsm
int32_t ret_xre =
eb_adjust_batch_kernel<<<ctx->ncluster(), 16, ctx->xpu_stream>>>(
@@ -226,6 +222,4 @@ INSTANTIATION_EB_ADJUST_BATCH(float, bfloat16);
INSTANTIATION_EB_ADJUST_BATCH(int32_t, int32_t);
INSTANTIATION_EB_ADJUST_BATCH(int64_t, int64_t);
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -17,8 +17,7 @@
#include "xpu/refactor/impl_public/wrapper_check.h"
#include "xpu/xdnn.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
template <typename TX, typename TY>
__attribute__((global)) void eb_gather_next_token(TX *src,
TY *dst,
@@ -28,12 +27,9 @@ __attribute__((global)) void eb_gather_next_token(TX *src,
int en_batch,
int de_batch,
int64_t copy_size);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
template <typename TX, typename TY>
static int cpu_wrapper(api::Context *ctx,
@@ -74,7 +70,7 @@ static int xpu3_wrapper(api::Context *ctx,
int en_batch,
int de_batch,
int64_t hidden_dim) {
auto eb_gather_next_token_kernel = xpu3::plugin::eb_gather_next_token<TX, TY>;
auto eb_gather_next_token_kernel = fd_xpu3::eb_gather_next_token<TX, TY>;
// NOTE: Don't change 16 to 64, because kernel use gsm
int32_t ret_xre =
eb_gather_next_token_kernel<<<ctx->ncluster(), 16, ctx->xpu_stream>>>(
@@ -187,6 +183,4 @@ INSTANTIATION_EB_GATHER_NEXT_TOKEN(float16, bfloat16);
INSTANTIATION_EB_GATHER_NEXT_TOKEN(bfloat16, float);
INSTANTIATION_EB_GATHER_NEXT_TOKEN(float, bfloat16);
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -17,8 +17,7 @@
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void free_and_dispatch_block(
bool *stop_flags,
@@ -42,15 +41,12 @@ __attribute__((global)) void free_and_dispatch_block(
const int block_num_per_seq,
const int max_decoder_block_num);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
static int cpu_wrapper(Context *ctx,
static int cpu_wrapper(api::Context *ctx,
bool *stop_flags,
int *seq_lens_this_time,
int *seq_lens_decoder,
@@ -171,7 +167,7 @@ static int cpu_wrapper(Context *ctx,
return api::SUCCESS;
}
static int xpu3_wrapper(Context *ctx,
static int xpu3_wrapper(api::Context *ctx,
bool *stop_flags,
int *seq_lens_this_time,
int *seq_lens_decoder,
@@ -192,8 +188,8 @@ static int xpu3_wrapper(Context *ctx,
const int block_size,
const int block_num_per_seq,
const int max_decoder_block_num) {
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
auto free_and_dispatch_block_kernel = xpu3::plugin::free_and_dispatch_block;
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
auto free_and_dispatch_block_kernel = fd_xpu3::free_and_dispatch_block;
int32_t ret_xre =
free_and_dispatch_block_kernel<<<ctx->ncluster(), 64, ctx->xpu_stream>>>(
stop_flags,
@@ -220,7 +216,7 @@ static int xpu3_wrapper(Context *ctx,
return api::SUCCESS;
}
int free_and_dispatch_block(Context *ctx,
int free_and_dispatch_block(api::Context *ctx,
bool *stop_flags,
int *seq_lens_this_time,
int *seq_lens_decoder,
@@ -285,7 +281,7 @@ int free_and_dispatch_block(Context *ctx,
block_num_per_seq,
max_decoder_block_num);
}
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx,
stop_flags,
seq_lens_this_time,
@@ -312,6 +308,4 @@ int free_and_dispatch_block(Context *ctx,
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -17,8 +17,7 @@
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void get_padding_offset(int *padding_offset,
int *cum_offsets_out,
@@ -35,12 +34,9 @@ __attribute__((global)) void remove_padding(int64_t *x_remove_padding,
const int sequence_length,
const int bs);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
static int get_padding_offset_cpu(int *padding_offset,
@@ -80,7 +76,7 @@ static int remove_padding_cpu(int64_t *x_remove_padding,
return api::SUCCESS;
}
static int cpu_wrapper(Context *ctx,
static int cpu_wrapper(api::Context *ctx,
int *padding_offset,
int *cum_offsets_out,
int *cu_seqlens_q,
@@ -104,7 +100,7 @@ static int cpu_wrapper(Context *ctx,
return api::SUCCESS;
}
static int xpu3_wrapper(Context *ctx,
static int xpu3_wrapper(api::Context *ctx,
int *padding_offset,
int *cum_offsets_out,
int *cu_seqlens_q,
@@ -115,9 +111,9 @@ static int xpu3_wrapper(Context *ctx,
const int *seq_lens,
const int max_seq_len,
const int bs) {
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
auto get_padding_offset = xpu3::plugin::get_padding_offset;
auto remove_padding = xpu3::plugin::remove_padding;
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
auto get_padding_offset = fd_xpu3::get_padding_offset;
auto remove_padding = fd_xpu3::remove_padding;
int32_t ret_xre =
get_padding_offset<<<ctx->ncluster(), 64, ctx->xpu_stream>>>(
padding_offset,
@@ -140,7 +136,7 @@ static int xpu3_wrapper(Context *ctx,
return api::SUCCESS;
}
int get_padding_offset(Context *ctx,
int get_padding_offset(api::Context *ctx,
int *padding_offset,
int *cum_offsets_out,
int *cu_seqlens_q,
@@ -171,7 +167,7 @@ int get_padding_offset(Context *ctx,
max_seq_len,
bs);
}
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx,
padding_offset,
cum_offsets_out,
@@ -188,6 +184,4 @@ int get_padding_offset(Context *ctx,
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -17,8 +17,7 @@
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void limit_thinking_content_length_kernel_v1(
int64_t* next_tokens,
@@ -30,15 +29,12 @@ __attribute__((global)) void limit_thinking_content_length_kernel_v1(
const int64_t think_end_id,
const int bs,
const int eos_token_id_len);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
static int cpu_wrapper(Context* ctx,
static int cpu_wrapper(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -80,7 +76,7 @@ static int cpu_wrapper(Context* ctx,
}
return api::SUCCESS;
}
static int xpu3_wrapper(Context* ctx,
static int xpu3_wrapper(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -90,9 +86,9 @@ static int xpu3_wrapper(Context* ctx,
const int64_t think_end_id,
const int bs,
const int eos_token_id_len) {
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
auto limit_thinking_content_length_kernel_v1 =
xpu3::plugin::limit_thinking_content_length_kernel_v1;
fd_xpu3::limit_thinking_content_length_kernel_v1;
int32_t ret_xre =
limit_thinking_content_length_kernel_v1<<<1, 64, ctx->xpu_stream>>>(
reinterpret_cast<XPU_INT64*>(next_tokens),
@@ -108,7 +104,7 @@ static int xpu3_wrapper(Context* ctx,
return api::SUCCESS;
}
int limit_thinking_content_length_kernel_v1(Context* ctx,
int limit_thinking_content_length_kernel_v1(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -141,7 +137,7 @@ int limit_thinking_content_length_kernel_v1(Context* ctx,
bs,
eos_token_id_len);
}
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx,
next_tokens,
max_think_lens,
@@ -157,6 +153,4 @@ int limit_thinking_content_length_kernel_v1(Context* ctx,
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -17,8 +17,7 @@
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void limit_thinking_content_length_kernel_v2(
int64_t* next_tokens,
@@ -30,15 +29,12 @@ __attribute__((global)) void limit_thinking_content_length_kernel_v2(
const int64_t line_break_id,
const int bs);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
static int cpu_wrapper(Context* ctx,
static int cpu_wrapper(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -86,7 +82,7 @@ static int cpu_wrapper(Context* ctx,
}
return api::SUCCESS;
}
static int xpu3_wrapper(Context* ctx,
static int xpu3_wrapper(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -95,9 +91,9 @@ static int xpu3_wrapper(Context* ctx,
const int64_t think_end_id,
const int64_t line_break_id,
const int bs) {
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
auto limit_thinking_content_length_kernel_v2 =
xpu3::plugin::limit_thinking_content_length_kernel_v2;
fd_xpu3::limit_thinking_content_length_kernel_v2;
int32_t ret_xre =
limit_thinking_content_length_kernel_v2<<<1, 64, ctx->xpu_stream>>>(
reinterpret_cast<XPU_INT64*>(next_tokens),
@@ -112,7 +108,7 @@ static int xpu3_wrapper(Context* ctx,
return api::SUCCESS;
}
int limit_thinking_content_length_kernel_v2(Context* ctx,
int limit_thinking_content_length_kernel_v2(api::Context* ctx,
int64_t* next_tokens,
const int* max_think_lens,
const int64_t* step_idx,
@@ -142,7 +138,7 @@ int limit_thinking_content_length_kernel_v2(Context* ctx,
line_break_id,
bs);
}
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
if (ctx->dev().type() == api::kXPU3) {
return xpu3_wrapper(ctx,
next_tokens,
max_think_lens,
@@ -157,6 +153,4 @@ int limit_thinking_content_length_kernel_v2(Context* ctx,
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy
@@ -15,8 +15,7 @@
#include "xpu/plugin.h"
#include "xpu/refactor/impl_public/wrapper_check.h"
namespace xpu3 {
namespace plugin {
namespace fd_xpu3 {
__attribute__((global)) void ComputeOrderKernel(
const int* seq_lens_this_time,
const int* seq_lens_encoder,
@@ -28,15 +27,12 @@ __attribute__((global)) void ComputeOrderKernel(
const int bsz,
const int actual_draft_token_num,
const int input_token_num);
} // namespace plugin
} // namespace xpu3
} // namespace fd_xpu3
namespace baidu {
namespace xpu {
namespace api {
namespace fastdeploy {
namespace plugin {
static int cpu_wrapper(Context* ctx,
static int cpu_wrapper(api::Context* ctx,
const int* seq_lens_this_time,
const int* seq_lens_encoder,
const int* base_model_seq_lens_this_time,
@@ -97,7 +93,7 @@ static int cpu_wrapper(Context* ctx,
return api::SUCCESS;
}
static int xpu3_wrapper(Context* ctx,
static int xpu3_wrapper(api::Context* ctx,
const int* seq_lens_this_time,
const int* seq_lens_encoder,
const int* base_model_seq_lens_this_time,
@@ -108,7 +104,7 @@ static int xpu3_wrapper(Context* ctx,
const int bsz,
const int actual_draft_token_num,
const int input_token_num) {
int32_t ret_xre = xpu3::plugin::ComputeOrderKernel<<<1, 1, ctx->xpu_stream>>>(
int32_t ret_xre = fd_xpu3::ComputeOrderKernel<<<1, 1, ctx->xpu_stream>>>(
seq_lens_this_time,
seq_lens_encoder,
base_model_seq_lens_this_time,
@@ -123,7 +119,7 @@ static int xpu3_wrapper(Context* ctx,
return api::SUCCESS;
}
int compute_order(Context* ctx,
int compute_order(api::Context* ctx,
const int* seq_lens_this_time,
const int* seq_lens_encoder,
const int* base_model_seq_lens_this_time,
@@ -187,6 +183,4 @@ int compute_order(Context* ctx,
}
} // namespace plugin
} // namespace api
} // namespace xpu
} // namespace baidu
} // namespace fastdeploy

Some files were not shown because too many files have changed in this diff Show More