mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[XPU] refactor: XPU plugin namespace migration (#6799)
* [XPU] refactor: XPU plugin namespace migration
  - Migrate wrapper layer namespace from baidu::xpu::api::plugin to fastdeploy::plugin
  - Migrate kernel layer namespace from xpu3::plugin to fd_xpu3
  - Add api:: prefix for types (Context, SUCCESS, XPUIndexType, ctx_guard)
  - Remove XPU2 support, keep only XPU3
  - Update ops/ directory to use new namespace
  Total: 137 files changed
* [XPU] fix: add return value check and correct error messages
  - Add PADDLE_ENFORCE_XDNN_SUCCESS check for speculate_get_logits and update_attn_mask_offsets
  - Fix empty error message in draft_model_postprocess
  - Correct function name in speculate_schedule_cache error message
  - Update error messages from 'xpu::plugin::' to 'fastdeploy::plugin::'
This commit is contained in:
@@ -72,7 +72,7 @@ std::vector<paddle::Tensor> AdjustBatchKernel(
|
||||
|
||||
auto out = paddle::empty({token_num, dim}, x.type(), x.place());
|
||||
if (token_num > 0) {
|
||||
int r = baidu::xpu::api::plugin::eb_adjust_batch<XPUType, XPUType>(
|
||||
int r = fastdeploy::plugin::eb_adjust_batch<XPUType, XPUType>(
|
||||
ctx,
|
||||
reinterpret_cast<const XPUType *>(x.data<data_t>()),
|
||||
reinterpret_cast<XPUType *>(out.data<data_t>()),
|
||||
|
||||
@@ -90,7 +90,7 @@ std::vector<paddle::Tensor> GatherNextToken(
|
||||
}
|
||||
|
||||
if (output_padding_offset) {
|
||||
int r = baidu::xpu::api::plugin::eb_mtp_gather_next_token<XPUType, XPUType>(
|
||||
int r = fastdeploy::plugin::eb_mtp_gather_next_token<XPUType, XPUType>(
|
||||
ctx,
|
||||
reinterpret_cast<const XPUType*>(x.data<data_t>()),
|
||||
reinterpret_cast<XPUType*>(out.data<data_t>()),
|
||||
@@ -99,9 +99,9 @@ std::vector<paddle::Tensor> GatherNextToken(
|
||||
encoder_batch_map_vp,
|
||||
decoder_batch_map_vp,
|
||||
dim);
|
||||
PD_CHECK(r == 0, "xpu::plugin::gather_next_token failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::gather_next_token failed.");
|
||||
} else {
|
||||
int r = baidu::xpu::api::plugin::eb_gather_next_token<XPUType, XPUType>(
|
||||
int r = fastdeploy::plugin::eb_gather_next_token<XPUType, XPUType>(
|
||||
ctx,
|
||||
reinterpret_cast<const XPUType*>(x.data<data_t>()),
|
||||
reinterpret_cast<XPUType*>(out.data<data_t>()),
|
||||
@@ -109,7 +109,7 @@ std::vector<paddle::Tensor> GatherNextToken(
|
||||
encoder_batch_map_vp,
|
||||
decoder_batch_map_vp,
|
||||
dim);
|
||||
PD_CHECK(r == 0, "xpu::plugin::gather_next_token failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::gather_next_token failed.");
|
||||
}
|
||||
return {out};
|
||||
}
|
||||
|
||||
@@ -40,19 +40,19 @@ std::vector<paddle::Tensor> GetPaddingOffset(const paddle::Tensor &input_ids,
|
||||
auto cu_seqlens_k =
|
||||
paddle::full({bsz + 1}, 0, paddle::DataType::INT32, input_ids.place());
|
||||
if (token_num_data > 0) {
|
||||
int r = baidu::xpu::api::plugin::get_padding_offset(
|
||||
xpu_ctx->x_context(),
|
||||
batch_id_per_token.data<int>(),
|
||||
cum_offsets_out.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
cu_seqlens_k.data<int>(),
|
||||
x_remove_padding.data<int64_t>(),
|
||||
input_ids.data<int64_t>(),
|
||||
cum_offsets.data<int>(),
|
||||
seq_len.data<int>(),
|
||||
seq_length,
|
||||
bsz);
|
||||
PD_CHECK(r == 0, "baidu::xpu::api::plugin::get_padding_offset failed.");
|
||||
int r =
|
||||
fastdeploy::plugin::get_padding_offset(xpu_ctx->x_context(),
|
||||
batch_id_per_token.data<int>(),
|
||||
cum_offsets_out.data<int>(),
|
||||
cu_seqlens_q.data<int>(),
|
||||
cu_seqlens_k.data<int>(),
|
||||
x_remove_padding.data<int64_t>(),
|
||||
input_ids.data<int64_t>(),
|
||||
cum_offsets.data<int>(),
|
||||
seq_len.data<int>(),
|
||||
seq_length,
|
||||
bsz);
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::get_padding_offset failed.");
|
||||
}
|
||||
|
||||
return {x_remove_padding,
|
||||
|
||||
@@ -44,7 +44,7 @@ void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
|
||||
case paddle::DataType::FLOAT16: {
|
||||
using XPUType = typename XPUTypeTrait<float16>::Type;
|
||||
typedef paddle::float16 data_t;
|
||||
int r = baidu::xpu::api::plugin::token_penalty_multi_scores(
|
||||
int r = fastdeploy::plugin::token_penalty_multi_scores(
|
||||
xpu_ctx->x_context(),
|
||||
pre_ids.data<int64_t>(),
|
||||
reinterpret_cast<XPUType *>(
|
||||
@@ -62,10 +62,11 @@ void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
|
||||
length_id,
|
||||
end_length,
|
||||
length_bad_words);
|
||||
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::token_penalty_multi_scores failed.");
|
||||
} break;
|
||||
case paddle::DataType::FLOAT32: {
|
||||
int r = baidu::xpu::api::plugin::token_penalty_multi_scores(
|
||||
int r = fastdeploy::plugin::token_penalty_multi_scores(
|
||||
xpu_ctx->x_context(),
|
||||
pre_ids.data<int64_t>(),
|
||||
const_cast<float *>(logits.data<float>()),
|
||||
@@ -82,7 +83,8 @@ void TokenPenaltyMultiScores(const paddle::Tensor &pre_ids,
|
||||
length_id,
|
||||
end_length,
|
||||
length_bad_words);
|
||||
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::token_penalty_multi_scores failed.");
|
||||
} break;
|
||||
default:
|
||||
PD_THROW(
|
||||
|
||||
@@ -34,7 +34,7 @@ void LimitThinkingContentLengthV1(const paddle::Tensor& next_tokens,
|
||||
|
||||
const int batch_size = next_tokens.shape()[0];
|
||||
const int eos_token_id_len = eos_token_ids.shape()[0];
|
||||
int r = baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v1(
|
||||
int r = fastdeploy::plugin::limit_thinking_content_length_kernel_v1(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<int64_t*>(next_tokens.data<int64_t>()),
|
||||
max_think_lens.data<int>(),
|
||||
@@ -46,7 +46,7 @@ void LimitThinkingContentLengthV1(const paddle::Tensor& next_tokens,
|
||||
batch_size,
|
||||
eos_token_id_len);
|
||||
PD_CHECK(r == 0,
|
||||
"baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v1 "
|
||||
"fastdeploy::plugin::limit_thinking_content_length_kernel_v1 "
|
||||
"failed.");
|
||||
}
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ void LimitThinkingContentLengthV2(const paddle::Tensor& next_tokens,
|
||||
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
|
||||
|
||||
const int batch_size = next_tokens.shape()[0];
|
||||
int r = baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v2(
|
||||
int r = fastdeploy::plugin::limit_thinking_content_length_kernel_v2(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<int64_t*>(next_tokens.data<int64_t>()),
|
||||
max_think_lens.data<int>(),
|
||||
@@ -44,7 +44,7 @@ void LimitThinkingContentLengthV2(const paddle::Tensor& next_tokens,
|
||||
line_break_id,
|
||||
batch_size);
|
||||
PD_CHECK(r == 0,
|
||||
"baidu::xpu::api::plugin::limit_thinking_content_length_kernel_v2 "
|
||||
"fastdeploy::plugin::limit_thinking_content_length_kernel_v2 "
|
||||
"failed.");
|
||||
}
|
||||
|
||||
|
||||
@@ -30,7 +30,7 @@ void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens,
|
||||
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
|
||||
int real_bsz = base_model_draft_tokens.shape()[0];
|
||||
int base_model_draft_token_len = base_model_draft_tokens.shape()[1];
|
||||
int r = baidu::xpu::api::plugin::draft_model_postprocess(
|
||||
int r = fastdeploy::plugin::draft_model_postprocess(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<int64_t*>(base_model_draft_tokens.data<int64_t>()),
|
||||
const_cast<int*>(base_model_seq_lens_this_time.data<int>()),
|
||||
@@ -38,7 +38,7 @@ void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens,
|
||||
const_cast<bool*>(base_model_stop_flags.data<bool>()),
|
||||
real_bsz,
|
||||
base_model_draft_token_len);
|
||||
PADDLE_ENFORCE_XDNN_SUCCESS(r, "");
|
||||
PADDLE_ENFORCE_XDNN_SUCCESS(r, "draft_model_postprocess");
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(draft_model_postprocess)
|
||||
|
||||
@@ -64,7 +64,7 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens,
|
||||
auto not_need_stop_gpu =
|
||||
not_need_stop.copy_to(seq_lens_this_time.place(), false);
|
||||
|
||||
int r = baidu::xpu::api::plugin::draft_model_preprocess(
|
||||
int r = fastdeploy::plugin::draft_model_preprocess(
|
||||
ctx,
|
||||
const_cast<int64_t*>(draft_tokens.data<int64_t>()),
|
||||
const_cast<int64_t*>(input_ids.data<int64_t>()),
|
||||
@@ -97,7 +97,7 @@ void DraftModelPreprocess(const paddle::Tensor& draft_tokens,
|
||||
splitwise_prefill,
|
||||
kvcache_scheduler_v1);
|
||||
|
||||
PD_CHECK(r == 0, "xpu::plugin::draft_model_preprocess failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::draft_model_preprocess failed.");
|
||||
auto not_need_stop_cpu =
|
||||
not_need_stop_gpu.copy_to(not_need_stop.place(), false);
|
||||
bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
|
||||
|
||||
@@ -63,7 +63,7 @@ void DraftModelUpdate(const paddle::Tensor& inter_next_tokens,
|
||||
}
|
||||
}
|
||||
|
||||
int r = baidu::xpu::api::plugin::draft_model_update(
|
||||
int r = fastdeploy::plugin::draft_model_update(
|
||||
ctx,
|
||||
inter_next_tokens.data<int64_t>(),
|
||||
const_cast<int64_t*>(draft_tokens.data<int64_t>()),
|
||||
|
||||
@@ -47,18 +47,19 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
|
||||
auto output_token_num = paddle::empty(
|
||||
{1}, seq_lens_this_time.dtype(), seq_lens_this_time.place());
|
||||
|
||||
int r = api::plugin::compute_order(ctx,
|
||||
seq_lens_this_time.data<int>(),
|
||||
seq_lens_encoder.data<int>(),
|
||||
base_model_seq_lens_this_time.data<int>(),
|
||||
base_model_seq_lens_encoder.data<int>(),
|
||||
accept_nums.data<int>(),
|
||||
position_map.data<int>(),
|
||||
output_token_num.data<int>(),
|
||||
bsz,
|
||||
actual_draft_token_num,
|
||||
input_token_num);
|
||||
PD_CHECK(r == 0, "xpu::plugin::compute_order failed.");
|
||||
int r = fastdeploy::plugin::compute_order(
|
||||
ctx,
|
||||
seq_lens_this_time.data<int>(),
|
||||
seq_lens_encoder.data<int>(),
|
||||
base_model_seq_lens_this_time.data<int>(),
|
||||
base_model_seq_lens_encoder.data<int>(),
|
||||
accept_nums.data<int>(),
|
||||
position_map.data<int>(),
|
||||
output_token_num.data<int>(),
|
||||
bsz,
|
||||
actual_draft_token_num,
|
||||
input_token_num);
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::compute_order failed.");
|
||||
|
||||
int output_token_num_cpu =
|
||||
output_token_num.copy_to(paddle::CPUPlace(), false).data<int>()[0];
|
||||
@@ -72,7 +73,7 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
|
||||
case paddle::DataType::BFLOAT16:
|
||||
using XPUTypeBF16 = typename XPUTypeTrait<bfloat16>::Type;
|
||||
typedef paddle::bfloat16 bf16_data_t;
|
||||
r = api::plugin::rebuild_hidden_states(
|
||||
r = fastdeploy::plugin::rebuild_hidden_states(
|
||||
ctx,
|
||||
reinterpret_cast<const XPUTypeBF16*>(input.data<bf16_data_t>()),
|
||||
position_map.data<int>(),
|
||||
@@ -80,12 +81,12 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
|
||||
dim_embed,
|
||||
elem_cnt,
|
||||
output_token_num_cpu);
|
||||
PD_CHECK(r == 0, "xpu::plugin::rebuild_hidden_states failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::rebuild_hidden_states failed.");
|
||||
return {out};
|
||||
case paddle::DataType::FLOAT16:
|
||||
using XPUTypeFP16 = typename XPUTypeTrait<float16>::Type;
|
||||
typedef paddle::float16 fp16_data_t;
|
||||
r = api::plugin::rebuild_hidden_states(
|
||||
r = fastdeploy::plugin::rebuild_hidden_states(
|
||||
ctx,
|
||||
reinterpret_cast<const XPUTypeFP16*>(input.data<fp16_data_t>()),
|
||||
position_map.data<int>(),
|
||||
@@ -93,10 +94,10 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
|
||||
dim_embed,
|
||||
elem_cnt,
|
||||
output_token_num_cpu);
|
||||
PD_CHECK(r == 0, "xpu::plugin::rebuild_hidden_states failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::rebuild_hidden_states failed.");
|
||||
return {out};
|
||||
case paddle::DataType::FLOAT32:
|
||||
r = api::plugin::rebuild_hidden_states(
|
||||
r = fastdeploy::plugin::rebuild_hidden_states(
|
||||
ctx,
|
||||
reinterpret_cast<const float*>(input.data<float>()),
|
||||
position_map.data<int>(),
|
||||
@@ -104,7 +105,7 @@ std::vector<paddle::Tensor> EagleGetHiddenStates(
|
||||
dim_embed,
|
||||
elem_cnt,
|
||||
output_token_num_cpu);
|
||||
PD_CHECK(r == 0, "xpu::plugin::rebuild_hidden_states failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::rebuild_hidden_states failed.");
|
||||
return {out};
|
||||
default:
|
||||
PD_THROW("Unsupported data type.");
|
||||
|
||||
@@ -43,7 +43,7 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
|
||||
auto output_token_num = paddle::empty(
|
||||
{1}, seq_lens_this_time.dtype(), seq_lens_this_time.place());
|
||||
|
||||
int r = api::plugin::compute_self_order(
|
||||
int r = fastdeploy::plugin::compute_self_order(
|
||||
ctx,
|
||||
reinterpret_cast<const int*>(last_seq_lens_this_time.data<int>()),
|
||||
reinterpret_cast<const int*>(seq_lens_this_time.data<int>()),
|
||||
@@ -51,7 +51,7 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
|
||||
reinterpret_cast<int*>(src_map.data<int>()),
|
||||
reinterpret_cast<int*>(output_token_num.data<int>()),
|
||||
bsz);
|
||||
PD_CHECK(r == 0, "xpu::plugin::compute_self_order failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::compute_self_order failed.");
|
||||
|
||||
int output_token_num_cpu =
|
||||
output_token_num.copy_to(paddle::CPUPlace(), false).data<int>()[0];
|
||||
@@ -67,7 +67,7 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
|
||||
case paddle::DataType::BFLOAT16:
|
||||
using XPUTypeBF16 = typename XPUTypeTrait<bfloat16>::Type;
|
||||
typedef paddle::bfloat16 bf16_data_t;
|
||||
r = api::plugin::rebuild_self_hidden_states(
|
||||
r = fastdeploy::plugin::rebuild_self_hidden_states(
|
||||
ctx,
|
||||
reinterpret_cast<const XPUTypeBF16*>(input.data<bf16_data_t>()),
|
||||
src_map.data<int>(),
|
||||
@@ -75,12 +75,13 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
|
||||
input_token_num,
|
||||
dim_embed,
|
||||
elem_cnt);
|
||||
PD_CHECK(r == 0, "xpu::plugin::rebuild_self_hidden_states failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::rebuild_self_hidden_states failed.");
|
||||
return {out};
|
||||
case paddle::DataType::FLOAT16:
|
||||
using XPUTypeFP16 = typename XPUTypeTrait<float16>::Type;
|
||||
typedef paddle::float16 fp16_data_t;
|
||||
r = api::plugin::rebuild_self_hidden_states(
|
||||
r = fastdeploy::plugin::rebuild_self_hidden_states(
|
||||
ctx,
|
||||
reinterpret_cast<const XPUTypeFP16*>(input.data<fp16_data_t>()),
|
||||
src_map.data<int>(),
|
||||
@@ -88,10 +89,11 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
|
||||
input_token_num,
|
||||
dim_embed,
|
||||
elem_cnt);
|
||||
PD_CHECK(r == 0, "xpu::plugin::rebuild_self_hidden_states failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::rebuild_self_hidden_states failed.");
|
||||
return {out};
|
||||
case paddle::DataType::FLOAT32:
|
||||
r = api::plugin::rebuild_self_hidden_states(
|
||||
r = fastdeploy::plugin::rebuild_self_hidden_states(
|
||||
ctx,
|
||||
reinterpret_cast<const float*>(input.data<float>()),
|
||||
src_map.data<int>(),
|
||||
@@ -99,7 +101,8 @@ std::vector<paddle::Tensor> EagleGetSelfHiddenStates(
|
||||
input_token_num,
|
||||
dim_embed,
|
||||
elem_cnt);
|
||||
PD_CHECK(r == 0, "xpu::plugin::rebuild_self_hidden_states failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::rebuild_self_hidden_states failed.");
|
||||
return {out};
|
||||
default:
|
||||
PD_THROW("Unsupported data type.");
|
||||
|
||||
@@ -46,7 +46,7 @@ void MTPStepPaddle(
|
||||
const int bsz = seq_lens_this_time.shape()[0];
|
||||
const int block_num_per_seq = block_tables.shape()[1];
|
||||
|
||||
int r = baidu::xpu::api::plugin::mtp_free_and_dispatch_block(
|
||||
int r = fastdeploy::plugin::mtp_free_and_dispatch_block(
|
||||
ctx,
|
||||
const_cast<bool *>(base_model_stop_flags.data<bool>()),
|
||||
const_cast<bool *>(stop_flags.data<bool>()),
|
||||
|
||||
@@ -27,7 +27,7 @@ void SpeculateClearAcceptNums(const paddle::Tensor& accept_num,
|
||||
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
|
||||
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
|
||||
const int max_bsz = seq_lens_decoder.shape()[0];
|
||||
int r = baidu::xpu::api::plugin::speculate_clear_accept_nums(
|
||||
int r = fastdeploy::plugin::speculate_clear_accept_nums(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<int*>(accept_num.data<int>()),
|
||||
seq_lens_decoder.data<int>(),
|
||||
|
||||
@@ -43,7 +43,7 @@ void SpeculateGetLogits(const paddle::Tensor& draft_logits,
|
||||
const int vocab_size = logits.shape()[1];
|
||||
const int real_bsz = seq_lens_this_time.shape()[0];
|
||||
|
||||
baidu::xpu::api::plugin::speculate_get_logits(
|
||||
int r = fastdeploy::plugin::speculate_get_logits(
|
||||
ctx,
|
||||
const_cast<float*>(draft_logits.data<float>()),
|
||||
const_cast<int*>(next_token_num.data<int>()),
|
||||
@@ -56,6 +56,7 @@ void SpeculateGetLogits(const paddle::Tensor& draft_logits,
|
||||
seq_lens_encoder.data<int>(),
|
||||
real_bsz,
|
||||
vocab_size);
|
||||
PADDLE_ENFORCE_XDNN_SUCCESS(r, "speculate_get_logits");
|
||||
if (draft_logits.is_cpu()) {
|
||||
delete ctx;
|
||||
}
|
||||
|
||||
@@ -45,7 +45,7 @@ std::vector<paddle::Tensor> SpeculateGetOutputPaddingOffset(
|
||||
auto output_cum_offsets =
|
||||
output_cum_offsets_tmp.copy_to(output_cum_offsets_tmp.place(), false);
|
||||
if (cpu_out_token_num.data<int64_t>()[0] > 0) {
|
||||
int r = baidu::xpu::api::plugin::speculate_get_output_padding_offset(
|
||||
int r = fastdeploy::plugin::speculate_get_output_padding_offset(
|
||||
ctx,
|
||||
output_padding_offset.mutable_data<int>(),
|
||||
output_cum_offsets.mutable_data<int>(),
|
||||
|
||||
@@ -58,7 +58,7 @@ std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
|
||||
PD_CHECK(seq_len.is_contiguous(), "Seq lens tensor must be contiguous");
|
||||
|
||||
if (token_num_data > 0) {
|
||||
int r = baidu::xpu::api::plugin::speculate_get_padding_offset(
|
||||
int r = fastdeploy::plugin::speculate_get_padding_offset(
|
||||
xpu_ctx->x_context(),
|
||||
batch_id_per_token.data<int>(),
|
||||
cum_offsets_out.data<int>(),
|
||||
@@ -70,7 +70,7 @@ std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
|
||||
bsz);
|
||||
PD_CHECK(r == 0, "XPU speculate_get_padding_offset failed");
|
||||
|
||||
r = baidu::xpu::api::plugin::speculate_remove_padding<int64_t>(
|
||||
r = fastdeploy::plugin::speculate_remove_padding<int64_t>(
|
||||
xpu_ctx->x_context(),
|
||||
x_remove_padding.data<int64_t>(),
|
||||
input_ids.data<int64_t>(),
|
||||
|
||||
@@ -38,7 +38,7 @@ std::vector<paddle::Tensor> SpeculateGetSeqLensOutput(
|
||||
auto seq_lens_output = paddle::full(
|
||||
{bsz}, 0, paddle::DataType::INT32, seq_lens_this_time.place());
|
||||
|
||||
int r = baidu::xpu::api::plugin::speculate_get_seq_lens_output(
|
||||
int r = fastdeploy::plugin::speculate_get_seq_lens_output(
|
||||
ctx,
|
||||
seq_lens_output.data<int>(),
|
||||
seq_lens_this_time.data<int>(),
|
||||
|
||||
@@ -46,7 +46,7 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
|
||||
case paddle::DataType::BFLOAT16:
|
||||
using XPUTypeBF16 = typename XPUTypeTrait<bfloat16>::Type;
|
||||
typedef paddle::bfloat16 bf16_data_t;
|
||||
r = api::plugin::speculate_rebuild_append_padding<XPUTypeBF16>(
|
||||
r = fastdeploy::plugin::speculate_rebuild_append_padding<XPUTypeBF16>(
|
||||
ctx,
|
||||
const_cast<XPUTypeBF16*>(reinterpret_cast<const XPUTypeBF16*>(
|
||||
full_hidden_states.data<bf16_data_t>())),
|
||||
@@ -58,12 +58,13 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
|
||||
dim_embed,
|
||||
elem_nums,
|
||||
reinterpret_cast<XPUTypeBF16*>(out.data<bf16_data_t>()));
|
||||
PD_CHECK(r == 0, "xpu::plugin::speculate_rebuild_append_padding failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::speculate_rebuild_append_padding failed.");
|
||||
return {out};
|
||||
case paddle::DataType::FLOAT16:
|
||||
using XPUTypeFP16 = typename XPUTypeTrait<float16>::Type;
|
||||
typedef paddle::float16 fp16_data_t;
|
||||
r = api::plugin::speculate_rebuild_append_padding<XPUTypeFP16>(
|
||||
r = fastdeploy::plugin::speculate_rebuild_append_padding<XPUTypeFP16>(
|
||||
ctx,
|
||||
const_cast<XPUTypeFP16*>(reinterpret_cast<const XPUTypeFP16*>(
|
||||
full_hidden_states.data<fp16_data_t>())),
|
||||
@@ -75,10 +76,11 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
|
||||
dim_embed,
|
||||
elem_nums,
|
||||
reinterpret_cast<XPUTypeFP16*>(out.data<fp16_data_t>()));
|
||||
PD_CHECK(r == 0, "xpu::plugin::speculate_rebuild_append_padding failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::speculate_rebuild_append_padding failed.");
|
||||
return {out};
|
||||
case paddle::DataType::FLOAT32:
|
||||
r = api::plugin::speculate_rebuild_append_padding<float>(
|
||||
r = fastdeploy::plugin::speculate_rebuild_append_padding<float>(
|
||||
ctx,
|
||||
const_cast<float*>(full_hidden_states.data<float>()),
|
||||
const_cast<int*>(cum_offsets.data<int>()),
|
||||
@@ -89,7 +91,8 @@ std::vector<paddle::Tensor> RebuildAppendPadding(
|
||||
dim_embed,
|
||||
elem_nums,
|
||||
out.data<float>());
|
||||
PD_CHECK(r == 0, "xpu::plugin::speculate_rebuild_append_padding failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::speculate_rebuild_append_padding failed.");
|
||||
return {out};
|
||||
default:
|
||||
PD_THROW("Unsupported data type.");
|
||||
|
||||
@@ -62,7 +62,7 @@ void SpeculateScheduleCache(const paddle::Tensor &draft_tokens,
|
||||
}
|
||||
auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
|
||||
|
||||
int r = baidu::xpu::api::plugin::speculate_schedule_cache(
|
||||
int r = fastdeploy::plugin::speculate_schedule_cache(
|
||||
ctx,
|
||||
draft_tokens.data<int64_t>(),
|
||||
const_cast<int *>(block_tables.data<int>()),
|
||||
@@ -87,7 +87,7 @@ void SpeculateScheduleCache(const paddle::Tensor &draft_tokens,
|
||||
block_num_per_seq,
|
||||
prefill_one_step_stop);
|
||||
// kernel launch
|
||||
PD_CHECK(r == 0, "speculate_free_and_reschedule failed.");
|
||||
PD_CHECK(r == 0, "speculate_schedule_cache failed.");
|
||||
|
||||
auto not_need_stop_cpu =
|
||||
not_need_stop_gpu.copy_to(not_need_stop.place(), true);
|
||||
|
||||
@@ -50,7 +50,7 @@ void SpecGetStopFlagsMultiSeqs(const paddle::Tensor &accept_tokens,
|
||||
int pre_ids_len = pre_ids.shape()[1];
|
||||
int accept_tokens_len = accept_tokens.shape()[1];
|
||||
|
||||
int r = baidu::xpu::api::plugin::speculate_set_stop_value_multi_seqs(
|
||||
int r = fastdeploy::plugin::speculate_set_stop_value_multi_seqs(
|
||||
ctx,
|
||||
const_cast<bool *>(stop_flags.data<bool>()),
|
||||
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
|
||||
@@ -67,7 +67,8 @@ void SpecGetStopFlagsMultiSeqs(const paddle::Tensor &accept_tokens,
|
||||
stop_seqs_bs,
|
||||
stop_seqs_max_len,
|
||||
pre_ids_len);
|
||||
PD_CHECK(r == 0, "xpu::plugin::speculate_set_stop_value_multi_seqs failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::speculate_set_stop_value_multi_seqs failed.");
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(speculate_set_stop_value_multi_seqs)
|
||||
|
||||
@@ -42,7 +42,7 @@ void SpeculateSetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
|
||||
int length = pre_ids_all_shape[1];
|
||||
int max_draft_tokens = accept_tokens.shape()[1];
|
||||
|
||||
int r = baidu::xpu::api::plugin::speculate_set_value_by_flag_and_id(
|
||||
int r = fastdeploy::plugin::speculate_set_value_by_flag_and_id(
|
||||
ctx,
|
||||
const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
|
||||
accept_tokens.data<int64_t>(),
|
||||
|
||||
@@ -60,7 +60,7 @@ void SpeculateStepPaddleBase(
|
||||
const int length = input_ids.shape()[1];
|
||||
const int pre_id_length = pre_ids.shape()[1];
|
||||
const int max_decoder_block_num = pre_id_length / block_size;
|
||||
int r = baidu::xpu::api::plugin::speculate_free_and_dispatch_block(
|
||||
int r = fastdeploy::plugin::speculate_free_and_dispatch_block(
|
||||
ctx,
|
||||
const_cast<bool *>(stop_flags.data<bool>()),
|
||||
const_cast<int *>(seq_lens_this_time.data<int>()),
|
||||
@@ -88,7 +88,7 @@ void SpeculateStepPaddleBase(
|
||||
auto recover_lens_cpu = recover_lens.copy_to(paddle::CPUPlace(), false);
|
||||
int recover_lens_cpu_data = recover_lens_cpu.data<int>()[0];
|
||||
if (recover_lens_cpu_data > 0) {
|
||||
r = baidu::xpu::api::plugin::speculate_recover_block(
|
||||
r = fastdeploy::plugin::speculate_recover_block(
|
||||
ctx,
|
||||
const_cast<int *>(recover_block_list.data<int>()),
|
||||
const_cast<int *>(recover_lens.data<int>()),
|
||||
|
||||
@@ -70,7 +70,7 @@ void SpeculateStepSchedule(
|
||||
paddle::full({1}, 0, paddle::DataType::INT32, stop_flags.place());
|
||||
auto step_bs_list =
|
||||
paddle::full({bsz}, 0, paddle::DataType::INT32, stop_flags.place());
|
||||
int r = baidu::xpu::api::plugin::speculate_free_and_reschedule(
|
||||
int r = fastdeploy::plugin::speculate_free_and_reschedule(
|
||||
ctx,
|
||||
const_cast<bool *>(stop_flags.data<bool>()),
|
||||
const_cast<int *>(seq_lens_this_time.data<int>()),
|
||||
|
||||
@@ -59,7 +59,7 @@ void SpeculateTokenPenaltyMultiScores(
|
||||
case paddle::DataType::BFLOAT16: {
|
||||
using XPUType = typename XPUTypeTrait<paddle::bfloat16>::Type;
|
||||
typedef paddle::bfloat16 data_t;
|
||||
int r = baidu::xpu::api::plugin::speculate_token_penalty_multi_scores(
|
||||
int r = fastdeploy::plugin::speculate_token_penalty_multi_scores(
|
||||
ctx,
|
||||
pre_ids.data<int64_t>(),
|
||||
reinterpret_cast<XPUType*>(
|
||||
@@ -81,12 +81,13 @@ void SpeculateTokenPenaltyMultiScores(
|
||||
length_bad_words,
|
||||
token_num,
|
||||
max_seq_len);
|
||||
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::token_penalty_multi_scores failed.");
|
||||
} break;
|
||||
case paddle::DataType::FLOAT16: {
|
||||
using XPUType = typename XPUTypeTrait<float16>::Type;
|
||||
typedef paddle::float16 data_t;
|
||||
int r = baidu::xpu::api::plugin::speculate_token_penalty_multi_scores(
|
||||
int r = fastdeploy::plugin::speculate_token_penalty_multi_scores(
|
||||
ctx,
|
||||
pre_ids.data<int64_t>(),
|
||||
reinterpret_cast<XPUType*>(
|
||||
@@ -108,10 +109,11 @@ void SpeculateTokenPenaltyMultiScores(
|
||||
length_bad_words,
|
||||
token_num,
|
||||
max_seq_len);
|
||||
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::token_penalty_multi_scores failed.");
|
||||
} break;
|
||||
case paddle::DataType::FLOAT32: {
|
||||
int r = baidu::xpu::api::plugin::speculate_token_penalty_multi_scores(
|
||||
int r = fastdeploy::plugin::speculate_token_penalty_multi_scores(
|
||||
ctx,
|
||||
pre_ids.data<int64_t>(),
|
||||
const_cast<float*>(logits.data<float>()),
|
||||
@@ -132,7 +134,8 @@ void SpeculateTokenPenaltyMultiScores(
|
||||
length_bad_words,
|
||||
token_num,
|
||||
max_seq_len);
|
||||
PD_CHECK(r == 0, "xpu::plugin::token_penalty_multi_scores failed.");
|
||||
PD_CHECK(r == 0,
|
||||
"fastdeploy::plugin::token_penalty_multi_scores failed.");
|
||||
} break;
|
||||
default:
|
||||
PD_THROW(
|
||||
|
||||
@@ -46,7 +46,7 @@ void SpeculateUpdate(const paddle::Tensor &seq_lens_encoder,
|
||||
}
|
||||
|
||||
auto not_need_stop_xpu = not_need_stop.copy_to(stop_flags.place(), false);
|
||||
int r = baidu::xpu::api::plugin::speculate_update(
|
||||
int r = fastdeploy::plugin::speculate_update(
|
||||
ctx,
|
||||
const_cast<int *>(seq_lens_encoder.data<int>()),
|
||||
const_cast<int *>(seq_lens_decoder.data<int>()),
|
||||
|
||||
@@ -46,7 +46,7 @@ void SpeculateUpdateV3(const paddle::Tensor &seq_lens_encoder,
|
||||
}
|
||||
|
||||
auto not_need_stop_xpu = not_need_stop.copy_to(stop_flags.place(), false);
|
||||
int r = baidu::xpu::api::plugin::speculate_update_v3(
|
||||
int r = fastdeploy::plugin::speculate_update_v3(
|
||||
ctx,
|
||||
const_cast<int *>(seq_lens_encoder.data<int>()),
|
||||
const_cast<int *>(seq_lens_decoder.data<int>()),
|
||||
|
||||
@@ -107,7 +107,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
|
||||
int ret;
|
||||
if (use_topk) {
|
||||
if (enable_topp) {
|
||||
ret = baidu::xpu::api::plugin::speculate_verify<true, true>(
|
||||
ret = fastdeploy::plugin::speculate_verify<true, true>(
|
||||
ctx,
|
||||
sampled_token_ids.data<int64_t>(),
|
||||
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
|
||||
@@ -140,7 +140,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
|
||||
use_target_sampling);
|
||||
PD_CHECK(ret == 0, "speculate_verify failed.");
|
||||
} else {
|
||||
ret = baidu::xpu::api::plugin::speculate_verify<false, true>(
|
||||
ret = fastdeploy::plugin::speculate_verify<false, true>(
|
||||
ctx,
|
||||
sampled_token_ids.data<int64_t>(),
|
||||
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
|
||||
@@ -175,7 +175,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
|
||||
PD_CHECK(ret == 0, "speculate_verify failed.");
|
||||
} else {
|
||||
if (enable_topp) {
|
||||
ret = baidu::xpu::api::plugin::speculate_verify<true, false>(
|
||||
ret = fastdeploy::plugin::speculate_verify<true, false>(
|
||||
ctx,
|
||||
sampled_token_ids.data<int64_t>(),
|
||||
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
|
||||
@@ -208,7 +208,7 @@ void SpeculateVerify(const paddle::Tensor &sampled_token_ids,
|
||||
use_target_sampling);
|
||||
PD_CHECK(ret == 0, "speculate_verify failed.");
|
||||
} else {
|
||||
ret = baidu::xpu::api::plugin::speculate_verify<false, false>(
|
||||
ret = fastdeploy::plugin::speculate_verify<false, false>(
|
||||
ctx,
|
||||
sampled_token_ids.data<int64_t>(),
|
||||
const_cast<int64_t *>(accept_tokens.data<int64_t>()),
|
||||
|
||||
@@ -71,9 +71,9 @@ std::vector<paddle::Tensor> TopPCandidates(
|
||||
typedef paddle::bfloat16 bf16_data_t;
|
||||
switch (candidates_len) {
|
||||
FIXED_TOPK(
|
||||
r = api::plugin::top_p_candidates<XPUTypeBF16,
|
||||
TopKMaxLength,
|
||||
kTopK>(
|
||||
r = fastdeploy::plugin::top_p_candidates<XPUTypeBF16,
|
||||
TopKMaxLength,
|
||||
kTopK>(
|
||||
ctx,
|
||||
reinterpret_cast<const XPUTypeBF16*>(probs.data<bf16_data_t>()),
|
||||
reinterpret_cast<const XPUTypeBF16*>(top_p.data<bf16_data_t>()),
|
||||
@@ -86,7 +86,7 @@ std::vector<paddle::Tensor> TopPCandidates(
|
||||
token_num,
|
||||
candidates_len,
|
||||
max_seq_len);
|
||||
PD_CHECK(r == 0, "xpu::plugin::top_p_candidates failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::top_p_candidates failed.");
|
||||
return {verify_scores, verify_tokens, actual_candidate_lens});
|
||||
}
|
||||
case paddle::DataType::FLOAT16:
|
||||
@@ -94,9 +94,9 @@ std::vector<paddle::Tensor> TopPCandidates(
|
||||
typedef paddle::float16 fp16_data_t;
|
||||
switch (candidates_len) {
|
||||
FIXED_TOPK(
|
||||
r = api::plugin::top_p_candidates<XPUTypeFP16,
|
||||
TopKMaxLength,
|
||||
kTopK>(
|
||||
r = fastdeploy::plugin::top_p_candidates<XPUTypeFP16,
|
||||
TopKMaxLength,
|
||||
kTopK>(
|
||||
ctx,
|
||||
reinterpret_cast<const XPUTypeFP16*>(probs.data<fp16_data_t>()),
|
||||
reinterpret_cast<const XPUTypeFP16*>(top_p.data<fp16_data_t>()),
|
||||
@@ -109,25 +109,26 @@ std::vector<paddle::Tensor> TopPCandidates(
|
||||
token_num,
|
||||
candidates_len,
|
||||
max_seq_len);
|
||||
PD_CHECK(r == 0, "xpu::plugin::top_p_candidates failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::top_p_candidates failed.");
|
||||
return {verify_scores, verify_tokens, actual_candidate_lens});
|
||||
}
|
||||
case paddle::DataType::FLOAT32:
|
||||
switch (candidates_len) {
|
||||
FIXED_TOPK(
|
||||
r = api::plugin::top_p_candidates<float, TopKMaxLength, kTopK>(
|
||||
ctx,
|
||||
probs.data<float>(),
|
||||
top_p.data<float>(),
|
||||
output_padding_offset.data<int>(),
|
||||
verify_tokens.data<int64_t>(),
|
||||
verify_scores.data<float>(),
|
||||
actual_candidate_lens.data<int>(),
|
||||
vocab_size,
|
||||
token_num,
|
||||
candidates_len,
|
||||
max_seq_len);
|
||||
PD_CHECK(r == 0, "xpu::plugin::top_p_candidates failed.");
|
||||
r = fastdeploy::plugin::
|
||||
top_p_candidates<float, TopKMaxLength, kTopK>(
|
||||
ctx,
|
||||
probs.data<float>(),
|
||||
top_p.data<float>(),
|
||||
output_padding_offset.data<int>(),
|
||||
verify_tokens.data<int64_t>(),
|
||||
verify_scores.data<float>(),
|
||||
actual_candidate_lens.data<int>(),
|
||||
vocab_size,
|
||||
token_num,
|
||||
candidates_len,
|
||||
max_seq_len);
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::top_p_candidates failed.");
|
||||
return {verify_scores, verify_tokens, actual_candidate_lens});
|
||||
}
|
||||
default:
|
||||
|
||||
@@ -53,7 +53,7 @@ std::vector<paddle::Tensor> UpdateAttnMaskOffsets(
|
||||
paddle::DataType::INT32,
|
||||
ids_remove_padding.place());
|
||||
|
||||
baidu::xpu::api::plugin::update_attn_mask_offsets(
|
||||
int r = fastdeploy::plugin::update_attn_mask_offsets(
|
||||
ctx,
|
||||
attn_mask_offsets.data<int>(),
|
||||
seq_lens_this_time.data<int>(),
|
||||
@@ -68,6 +68,7 @@ std::vector<paddle::Tensor> UpdateAttnMaskOffsets(
|
||||
real_bsz,
|
||||
max_model_len,
|
||||
decode_states_len);
|
||||
PADDLE_ENFORCE_XDNN_SUCCESS(r, "update_attn_mask_offsets");
|
||||
|
||||
if (ids_remove_padding.is_cpu()) {
|
||||
delete ctx;
|
||||
|
||||
@@ -66,7 +66,7 @@ std::vector<paddle::Tensor> RecoverBatchSequenceKernel(
|
||||
paddle::Tensor out;
|
||||
out = paddle::empty({token_num, dim}, x.type(), x.place());
|
||||
|
||||
int r = baidu::xpu::api::plugin::eb_recover_batch_sequence<XPUType, XPUType>(
|
||||
int r = fastdeploy::plugin::eb_recover_batch_sequence<XPUType, XPUType>(
|
||||
ctx,
|
||||
reinterpret_cast<const XPUType*>(x.data<data_t>()),
|
||||
reinterpret_cast<XPUType*>(out.data<data_t>()),
|
||||
@@ -75,7 +75,7 @@ std::vector<paddle::Tensor> RecoverBatchSequenceKernel(
|
||||
encoder_batch_map_vp,
|
||||
decoder_batch_map_vp,
|
||||
dim);
|
||||
PD_CHECK(r == 0, "xpu::plugin::eb_recover_batch_sequence failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::eb_recover_batch_sequence failed.");
|
||||
return {out};
|
||||
}
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ void RecoverDecodeTask(
|
||||
int r = 0;
|
||||
if (draft_tokens) {
|
||||
const int draft_tokens_len = draft_tokens.get_ptr()->shape()[1];
|
||||
r = baidu::xpu::api::plugin::recover_spec_decode_task(
|
||||
r = fastdeploy::plugin::recover_spec_decode_task(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<bool *>(stop_flags.data<bool>()),
|
||||
const_cast<int *>(seq_lens_this_time.data<int>()),
|
||||
@@ -60,7 +60,7 @@ void RecoverDecodeTask(
|
||||
draft_tokens_len,
|
||||
max_draft_tokens * 2 + 1);
|
||||
} else {
|
||||
r = baidu::xpu::api::plugin::recover_decode_task(
|
||||
r = fastdeploy::plugin::recover_decode_task(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<bool *>(stop_flags.data<bool>()),
|
||||
const_cast<int *>(seq_lens_this_time.data<int>()),
|
||||
@@ -73,7 +73,7 @@ void RecoverDecodeTask(
|
||||
block_num_per_seq,
|
||||
block_size);
|
||||
}
|
||||
PD_CHECK(r == 0, "baidu::xpu::api::plugin::recover_decode_task failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::recover_decode_task failed.");
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(recover_decode_task)
|
||||
|
||||
@@ -30,7 +30,7 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
|
||||
int bs = seq_lens_this_time.shape()[0];
|
||||
int length = pre_ids_all.shape()[1];
|
||||
int length_input_ids = input_ids.shape()[1];
|
||||
int r = baidu::xpu::api::plugin::set_value_by_flags_and_idx(
|
||||
int r = fastdeploy::plugin::set_value_by_flags_and_idx(
|
||||
xpu_ctx->x_context(),
|
||||
stop_flags.data<bool>(),
|
||||
const_cast<int64_t *>(pre_ids_all.data<int64_t>()),
|
||||
@@ -41,7 +41,7 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all,
|
||||
bs,
|
||||
length,
|
||||
length_input_ids);
|
||||
PD_CHECK(r == 0, "xpu::plugin::set_value_by_flags_and_idx failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::set_value_by_flags_and_idx failed.");
|
||||
}
|
||||
|
||||
PD_BUILD_OP(set_value_by_flags_and_idx)
|
||||
|
||||
@@ -41,26 +41,25 @@ void SpeculateLimitThinkingContentLength(const paddle::Tensor& next_tokens,
|
||||
const int eos_token_id_len = eos_token_ids.shape()[0];
|
||||
const int inject_len = inject_token_ids.shape()[0];
|
||||
|
||||
int r =
|
||||
baidu::xpu::api::plugin::speculate_limit_thinking_content_length_kernel(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<int64_t*>(next_tokens.data<int64_t>()),
|
||||
max_think_lens.data<int>(),
|
||||
const_cast<int*>(max_reply_lens.data<int>()),
|
||||
const_cast<int64_t*>(step_idx.data<int64_t>()),
|
||||
eos_token_ids.data<int64_t>(),
|
||||
const_cast<int*>(limit_status.data<int>()),
|
||||
const_cast<int*>(accept_num.data<int>()),
|
||||
stop_flags.data<bool>(),
|
||||
think_end_id,
|
||||
(inject_len > 0) ? inject_token_ids.data<int64_t>() : nullptr,
|
||||
tokens_per_step,
|
||||
batch_size,
|
||||
eos_token_id_len,
|
||||
inject_len,
|
||||
splitwise_role_is_decode);
|
||||
int r = fastdeploy::plugin::speculate_limit_thinking_content_length_kernel(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<int64_t*>(next_tokens.data<int64_t>()),
|
||||
max_think_lens.data<int>(),
|
||||
const_cast<int*>(max_reply_lens.data<int>()),
|
||||
const_cast<int64_t*>(step_idx.data<int64_t>()),
|
||||
eos_token_ids.data<int64_t>(),
|
||||
const_cast<int*>(limit_status.data<int>()),
|
||||
const_cast<int*>(accept_num.data<int>()),
|
||||
stop_flags.data<bool>(),
|
||||
think_end_id,
|
||||
(inject_len > 0) ? inject_token_ids.data<int64_t>() : nullptr,
|
||||
tokens_per_step,
|
||||
batch_size,
|
||||
eos_token_id_len,
|
||||
inject_len,
|
||||
splitwise_role_is_decode);
|
||||
PD_CHECK(r == 0,
|
||||
"baidu::xpu::api::plugin::"
|
||||
"fastdeploy::plugin::"
|
||||
"speculate_limit_thinking_content_length_kernel failed.");
|
||||
}
|
||||
|
||||
|
||||
@@ -55,7 +55,7 @@ void StepPaddle(const paddle::Tensor &stop_flags,
|
||||
const int length = input_ids.shape()[1];
|
||||
const int pre_id_length = pre_ids.shape()[1];
|
||||
const int max_decoder_block_num = pre_id_length / block_size;
|
||||
int r = baidu::xpu::api::plugin::free_and_dispatch_block(
|
||||
int r = fastdeploy::plugin::free_and_dispatch_block(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<bool *>(stop_flags.data<bool>()),
|
||||
const_cast<int *>(seq_lens_this_time.data<int>()),
|
||||
@@ -81,7 +81,7 @@ void StepPaddle(const paddle::Tensor &stop_flags,
|
||||
auto recover_lens_cpu = recover_lens.copy_to(paddle::CPUPlace(), false);
|
||||
int recover_lens_cpu_data = recover_lens_cpu.data<int>()[0];
|
||||
if (recover_lens_cpu_data > 0) {
|
||||
r = baidu::xpu::api::plugin::recover_block(
|
||||
r = fastdeploy::plugin::recover_block(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<int *>(recover_block_list.data<int>()),
|
||||
const_cast<int *>(recover_lens.data<int>()),
|
||||
|
||||
@@ -39,7 +39,7 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
|
||||
std::vector<int64_t> shape = topk_ids.shape();
|
||||
int64_t bs_now = shape[0];
|
||||
int64_t end_length = end_ids.shape()[0];
|
||||
int r = baidu::xpu::api::plugin::set_stop_value_multi_ends<int64_t>(
|
||||
int r = fastdeploy::plugin::set_stop_value_multi_ends<int64_t>(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<bool *>(stop_flags.data<bool>()),
|
||||
const_cast<int64_t *>(topk_ids.data<int64_t>()),
|
||||
@@ -49,7 +49,7 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids,
|
||||
bs_now,
|
||||
end_length,
|
||||
beam_search);
|
||||
PD_CHECK(r == 0, "xpu::plugin::set_stop_value_multi_ends failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::set_stop_value_multi_ends failed.");
|
||||
}
|
||||
|
||||
PD_BUILD_OP(set_stop_value_multi_ends)
|
||||
|
||||
@@ -42,7 +42,7 @@ std::vector<paddle::Tensor> TextImageGatherScatter(
|
||||
case paddle::DataType::BFLOAT16: {
|
||||
using XPUType = typename XPUTypeTrait<bfloat16>::Type;
|
||||
typedef paddle::bfloat16 data_t;
|
||||
int r = baidu::xpu::api::plugin::text_image_gather_scatter<XPUType>(
|
||||
int r = fastdeploy::plugin::text_image_gather_scatter<XPUType>(
|
||||
xpu_ctx->x_context(),
|
||||
reinterpret_cast<XPUType*>(input.data<data_t>()),
|
||||
reinterpret_cast<XPUType*>(text_input.data<data_t>()),
|
||||
|
||||
@@ -28,7 +28,7 @@ void TextImageIndexOut(const paddle::Tensor& token_type_ids,
|
||||
auto dev_ctx = paddle::experimental::DeviceContextPool::Instance().Get(place);
|
||||
auto xpu_ctx = static_cast<const phi::XPUContext*>(dev_ctx);
|
||||
const int64_t token_num = token_type_ids.shape()[0];
|
||||
int r = baidu::xpu::api::plugin::text_image_index_out(
|
||||
int r = fastdeploy::plugin::text_image_index_out(
|
||||
xpu_ctx->x_context(),
|
||||
token_type_ids.data<int32_t>(),
|
||||
const_cast<int32_t*>(text_index.data<int32_t>()),
|
||||
|
||||
@@ -39,7 +39,7 @@ void UpdateInputs(const paddle::Tensor& stop_flags,
|
||||
const int input_ids_stride = input_ids.shape()[1];
|
||||
auto not_need_stop_xpu = not_need_stop.copy_to(stop_flags.place(), false);
|
||||
|
||||
int r = baidu::xpu::api::plugin::update_inputs(
|
||||
int r = fastdeploy::plugin::update_inputs(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<bool*>(not_need_stop_xpu.data<bool>()),
|
||||
const_cast<int*>(seq_lens_this_time.data<int>()),
|
||||
@@ -52,7 +52,7 @@ void UpdateInputs(const paddle::Tensor& stop_flags,
|
||||
now_bsz,
|
||||
max_bsz,
|
||||
input_ids_stride);
|
||||
PD_CHECK(r == 0, "baidu::xpu::api::plugin::update_inputs failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::update_inputs failed.");
|
||||
auto not_need_stop_cpu =
|
||||
not_need_stop_xpu.copy_to(not_need_stop.place(), false);
|
||||
bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
|
||||
|
||||
@@ -44,7 +44,7 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags,
|
||||
const int input_ids_stride = input_ids.shape()[1];
|
||||
const int block_num_per_seq = block_tables.shape()[1];
|
||||
auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
|
||||
int r = baidu::xpu::api::plugin::update_inputs_v1(
|
||||
int r = fastdeploy::plugin::update_inputs_v1(
|
||||
xpu_ctx->x_context(),
|
||||
const_cast<bool*>(not_need_stop_gpu.data<bool>()),
|
||||
const_cast<int*>(seq_lens_this_time.data<int>()),
|
||||
@@ -63,7 +63,7 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags,
|
||||
input_ids_stride,
|
||||
block_num_per_seq,
|
||||
block_size);
|
||||
PD_CHECK(r == 0, "baidu::xpu::api::plugin::update_inputs_kernel_v1 failed.");
|
||||
PD_CHECK(r == 0, "fastdeploy::plugin::update_inputs_kernel_v1 failed.");
|
||||
auto not_need_stop_cpu =
|
||||
not_need_stop_gpu.copy_to(not_need_stop.place(), false);
|
||||
bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
|
||||
|
||||
@@ -34,15 +34,14 @@ std::vector<paddle::Tensor> WeightQuantizeKernel(const paddle::Tensor &x,
|
||||
if (algo == "weight_only_int8") {
|
||||
paddle::Tensor out =
|
||||
paddle::full({k, n}, 0, paddle::DataType::INT8, x.place());
|
||||
int ret =
|
||||
baidu::xpu::api::plugin::quant2d_per_channel<XPUType, float, int8_t>(
|
||||
xpu_ctx->x_context(),
|
||||
reinterpret_cast<const XPUType *>(x.template data<T>()),
|
||||
nullptr,
|
||||
out.data<int8_t>(),
|
||||
scale.data<float>(),
|
||||
k,
|
||||
n);
|
||||
int ret = fastdeploy::plugin::quant2d_per_channel<XPUType, float, int8_t>(
|
||||
xpu_ctx->x_context(),
|
||||
reinterpret_cast<const XPUType *>(x.template data<T>()),
|
||||
nullptr,
|
||||
out.data<int8_t>(),
|
||||
scale.data<float>(),
|
||||
k,
|
||||
n);
|
||||
PD_CHECK(ret == 0);
|
||||
return {out, scale};
|
||||
} else if (algo == "weight_only_int4") {
|
||||
|
||||
@@ -18,13 +18,17 @@
|
||||
#pragma once
|
||||
#include "xpu/xdnn.h"
|
||||
|
||||
namespace baidu {
|
||||
namespace xpu {
|
||||
namespace api {
|
||||
namespace fd_xpu3 {
|
||||
typedef xpu3::int64_t int64_t;
|
||||
}
|
||||
|
||||
namespace fastdeploy {
|
||||
namespace plugin {
|
||||
|
||||
namespace api = baidu::xpu::api;
|
||||
|
||||
template <typename T>
|
||||
DLL_EXPORT int set_stop_value_multi_ends(Context* ctx,
|
||||
DLL_EXPORT int set_stop_value_multi_ends(api::Context* ctx,
|
||||
bool* stop_flags,
|
||||
T* topk_ids,
|
||||
T* next_tokens,
|
||||
@@ -34,7 +38,7 @@ DLL_EXPORT int set_stop_value_multi_ends(Context* ctx,
|
||||
const int end_length,
|
||||
const bool beam_search);
|
||||
|
||||
DLL_EXPORT int set_value_by_flags_and_idx(Context* ctx,
|
||||
DLL_EXPORT int set_value_by_flags_and_idx(api::Context* ctx,
|
||||
const bool* stop_flags,
|
||||
int64_t* pre_ids_all,
|
||||
const int64_t* input_ids,
|
||||
@@ -46,7 +50,7 @@ DLL_EXPORT int set_value_by_flags_and_idx(Context* ctx,
|
||||
int length_input_ids);
|
||||
|
||||
template <typename T>
|
||||
DLL_EXPORT int token_penalty_multi_scores(Context* ctx,
|
||||
DLL_EXPORT int token_penalty_multi_scores(api::Context* ctx,
|
||||
const int64_t* pre_ids,
|
||||
T* logits,
|
||||
const T* penalty_scores,
|
||||
@@ -63,7 +67,7 @@ DLL_EXPORT int token_penalty_multi_scores(Context* ctx,
|
||||
const int64_t end_length,
|
||||
const int64_t length_bad_words);
|
||||
|
||||
DLL_EXPORT int get_padding_offset(Context* ctx,
|
||||
DLL_EXPORT int get_padding_offset(api::Context* ctx,
|
||||
int* padding_offset,
|
||||
int* cum_offsets_out,
|
||||
int* cu_seqlens_q,
|
||||
@@ -75,7 +79,7 @@ DLL_EXPORT int get_padding_offset(Context* ctx,
|
||||
const int max_seq_len,
|
||||
const int bs);
|
||||
|
||||
DLL_EXPORT int speculate_get_padding_offset(Context* ctx,
|
||||
DLL_EXPORT int speculate_get_padding_offset(api::Context* ctx,
|
||||
int* batch_id_per_token,
|
||||
int* cum_offsets_out,
|
||||
int* cu_seqlens_q,
|
||||
@@ -117,7 +121,7 @@ DLL_EXPORT int draft_model_preprocess(api::Context* ctx,
|
||||
const bool splitwise_prefill,
|
||||
const bool kvcache_scheduler_v1);
|
||||
|
||||
DLL_EXPORT int update_inputs(Context* ctx,
|
||||
DLL_EXPORT int update_inputs(api::Context* ctx,
|
||||
bool* not_need_stop,
|
||||
int* seq_lens_this_time,
|
||||
int* seq_lens_encoder,
|
||||
@@ -130,7 +134,7 @@ DLL_EXPORT int update_inputs(Context* ctx,
|
||||
const int max_bsz,
|
||||
const int input_ids_stride);
|
||||
|
||||
DLL_EXPORT int free_and_dispatch_block(Context* ctx,
|
||||
DLL_EXPORT int free_and_dispatch_block(api::Context* ctx,
|
||||
bool* stop_flags,
|
||||
int* seq_lens_this_time,
|
||||
int* seq_lens_decoder,
|
||||
@@ -153,7 +157,7 @@ DLL_EXPORT int free_and_dispatch_block(Context* ctx,
|
||||
const int max_decoder_block_num);
|
||||
|
||||
DLL_EXPORT int speculate_free_and_dispatch_block(
|
||||
Context* ctx,
|
||||
api::Context* ctx,
|
||||
bool* stop_flags,
|
||||
int* seq_lens_this_time,
|
||||
int* seq_lens_decoder,
|
||||
@@ -177,7 +181,7 @@ DLL_EXPORT int speculate_free_and_dispatch_block(
|
||||
const int max_decoder_block_num,
|
||||
const int max_draft_tokens);
|
||||
|
||||
DLL_EXPORT int recover_block(Context* ctx,
|
||||
DLL_EXPORT int recover_block(api::Context* ctx,
|
||||
int* recover_block_list, // [bsz]
|
||||
int* recover_len,
|
||||
bool* stop_flags,
|
||||
@@ -200,7 +204,7 @@ DLL_EXPORT int recover_block(Context* ctx,
|
||||
const int length,
|
||||
const int pre_id_length);
|
||||
|
||||
DLL_EXPORT int speculate_recover_block(Context* ctx,
|
||||
DLL_EXPORT int speculate_recover_block(api::Context* ctx,
|
||||
int* recover_block_list, // [bsz]
|
||||
int* recover_len,
|
||||
bool* stop_flags,
|
||||
@@ -224,7 +228,7 @@ DLL_EXPORT int speculate_recover_block(Context* ctx,
|
||||
const int length,
|
||||
const int pre_id_length);
|
||||
|
||||
DLL_EXPORT int recover_decode_task(Context* ctx,
|
||||
DLL_EXPORT int recover_decode_task(api::Context* ctx,
|
||||
bool* stop_flags,
|
||||
int* seq_lens_this_time,
|
||||
int* seq_lens_encoder,
|
||||
@@ -236,7 +240,7 @@ DLL_EXPORT int recover_decode_task(Context* ctx,
|
||||
const int block_num_per_seq,
|
||||
const int block_size);
|
||||
|
||||
DLL_EXPORT int recover_spec_decode_task(Context* ctx,
|
||||
DLL_EXPORT int recover_spec_decode_task(api::Context* ctx,
|
||||
bool* stop_flags,
|
||||
int* seq_lens_this_time,
|
||||
int* seq_lens_encoder,
|
||||
@@ -253,7 +257,7 @@ DLL_EXPORT int recover_spec_decode_task(Context* ctx,
|
||||
const int draft_tokens_len,
|
||||
const int num_extra_tokens);
|
||||
|
||||
DLL_EXPORT int update_inputs_v1(Context* ctx,
|
||||
DLL_EXPORT int update_inputs_v1(api::Context* ctx,
|
||||
bool* not_need_stop,
|
||||
int* seq_lens_this_time,
|
||||
int* seq_lens_encoder,
|
||||
@@ -274,45 +278,45 @@ DLL_EXPORT int update_inputs_v1(Context* ctx,
|
||||
|
||||
template <typename TX, typename TY>
|
||||
DLL_EXPORT int eb_adjust_batch(
|
||||
Context* ctx,
|
||||
api::Context* ctx,
|
||||
const TX* x,
|
||||
TY* y,
|
||||
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
|
||||
VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
|
||||
VectorParam<int32_t>& encoder_batch_map, // NOLINT
|
||||
VectorParam<int32_t>& decoder_batch_map, // NOLINT
|
||||
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
|
||||
api::VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
|
||||
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
|
||||
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
|
||||
int64_t hidden_dim);
|
||||
|
||||
template <typename TX, typename TY>
|
||||
DLL_EXPORT int eb_gather_next_token(
|
||||
Context* ctx,
|
||||
api::Context* ctx,
|
||||
const TX* x,
|
||||
TY* y,
|
||||
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
|
||||
VectorParam<int32_t>& encoder_batch_map, // NOLINT
|
||||
VectorParam<int32_t>& decoder_batch_map, // NOLINT
|
||||
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
|
||||
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
|
||||
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
|
||||
int64_t hidden_dim);
|
||||
|
||||
template <typename TX, typename TY>
|
||||
DLL_EXPORT int eb_mtp_gather_next_token(
|
||||
Context* ctx,
|
||||
api::Context* ctx,
|
||||
const TX* x,
|
||||
TY* y,
|
||||
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
|
||||
VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
|
||||
VectorParam<int32_t>& encoder_batch_map, // NOLINT
|
||||
VectorParam<int32_t>& decoder_batch_map, // NOLINT
|
||||
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
|
||||
api::VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
|
||||
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
|
||||
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
|
||||
int64_t hidden_dim);
|
||||
|
||||
template <typename TX, typename TY>
|
||||
DLL_EXPORT int eb_recover_batch_sequence(
|
||||
Context* ctx,
|
||||
api::Context* ctx,
|
||||
const TX* x,
|
||||
TY* y,
|
||||
VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
|
||||
VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
|
||||
VectorParam<int32_t>& encoder_batch_map, // NOLINT
|
||||
VectorParam<int32_t>& decoder_batch_map, // NOLINT
|
||||
api::VectorParam<int32_t>& encoder_seqs_lods, // NOLINT
|
||||
api::VectorParam<int32_t>& decoder_seqs_lods, // NOLINT
|
||||
api::VectorParam<int32_t>& encoder_batch_map, // NOLINT
|
||||
api::VectorParam<int32_t>& decoder_batch_map, // NOLINT
|
||||
int64_t hidden_dim);
|
||||
|
||||
template <typename TX, typename TSCALE = float, typename TY = int8_t>
|
||||
@@ -324,7 +328,7 @@ DLL_EXPORT int quant2d_per_channel(api::Context* ctx,
|
||||
int64_t m,
|
||||
int64_t n);
|
||||
|
||||
DLL_EXPORT int text_image_index_out(Context* ctx,
|
||||
DLL_EXPORT int text_image_index_out(api::Context* ctx,
|
||||
const int* token_type_ids, // x
|
||||
int* text_index, // y1
|
||||
int* image_index, // y2
|
||||
@@ -372,7 +376,7 @@ DLL_EXPORT int limit_thinking_content_length_kernel_v2(
|
||||
|
||||
template <typename T>
|
||||
DLL_EXPORT int speculate_token_penalty_multi_scores(
|
||||
Context* ctx,
|
||||
api::Context* ctx,
|
||||
const int64_t* pre_ids,
|
||||
T* logits,
|
||||
const T* penalty_scores,
|
||||
@@ -392,7 +396,7 @@ DLL_EXPORT int speculate_token_penalty_multi_scores(
|
||||
const int64_t length_bad_words,
|
||||
const int64_t token_num,
|
||||
const int64_t max_seq_len);
|
||||
DLL_EXPORT int mtp_free_and_dispatch_block(Context* ctx,
|
||||
DLL_EXPORT int mtp_free_and_dispatch_block(api::Context* ctx,
|
||||
bool* base_model_stop_flags,
|
||||
bool* stop_flags,
|
||||
bool* batch_drop,
|
||||
@@ -409,7 +413,7 @@ DLL_EXPORT int mtp_free_and_dispatch_block(Context* ctx,
|
||||
const int max_draft_tokens);
|
||||
|
||||
template <bool ENABLE_TOPP, bool USE_TOPK>
|
||||
DLL_EXPORT int speculate_verify(Context* ctx,
|
||||
DLL_EXPORT int speculate_verify(api::Context* ctx,
|
||||
const int64_t* sampled_token_ids,
|
||||
int64_t* accept_tokens,
|
||||
int* accept_num,
|
||||
@@ -440,19 +444,19 @@ DLL_EXPORT int speculate_verify(Context* ctx,
|
||||
const bool accept_all_drafts,
|
||||
const bool use_target_sampling);
|
||||
|
||||
DLL_EXPORT int speculate_clear_accept_nums(Context* ctx,
|
||||
DLL_EXPORT int speculate_clear_accept_nums(api::Context* ctx,
|
||||
int* accept_num,
|
||||
const int* seq_lens_decoder,
|
||||
const int max_bsz);
|
||||
|
||||
DLL_EXPORT int speculate_get_seq_lens_output(Context* ctx,
|
||||
DLL_EXPORT int speculate_get_seq_lens_output(api::Context* ctx,
|
||||
int* seq_lens_output,
|
||||
const int* seq_lens_this_time,
|
||||
const int* seq_lens_encoder,
|
||||
const int* seq_lens_decoder,
|
||||
const int real_bsz);
|
||||
|
||||
DLL_EXPORT int draft_model_update(Context* ctx,
|
||||
DLL_EXPORT int draft_model_update(api::Context* ctx,
|
||||
const int64_t* inter_next_tokens,
|
||||
int64_t* draft_tokens,
|
||||
int64_t* pre_ids,
|
||||
@@ -475,7 +479,7 @@ DLL_EXPORT int draft_model_update(Context* ctx,
|
||||
const int substep,
|
||||
const bool prefill_one_step_stop);
|
||||
|
||||
DLL_EXPORT int speculate_set_stop_value_multi_seqs(Context* ctx,
|
||||
DLL_EXPORT int speculate_set_stop_value_multi_seqs(api::Context* ctx,
|
||||
bool* stop_flags,
|
||||
int64_t* accept_tokens,
|
||||
int* accept_nums,
|
||||
@@ -504,7 +508,7 @@ DLL_EXPORT int speculate_rebuild_append_padding(api::Context* ctx,
|
||||
T* out);
|
||||
|
||||
template <typename T>
|
||||
DLL_EXPORT int speculate_remove_padding(Context* ctx,
|
||||
DLL_EXPORT int speculate_remove_padding(api::Context* ctx,
|
||||
T* x_remove_padding,
|
||||
const T* input_ids,
|
||||
const T* draft_tokens,
|
||||
@@ -536,7 +540,7 @@ DLL_EXPORT int compute_order(api::Context* ctx,
|
||||
const int actual_draft_token_num,
|
||||
const int input_token_num);
|
||||
|
||||
DLL_EXPORT int draft_model_postprocess(Context* ctx,
|
||||
DLL_EXPORT int draft_model_postprocess(api::Context* ctx,
|
||||
const int64_t* base_model_draft_tokens,
|
||||
int* base_model_seq_lens_this_time,
|
||||
const int* base_model_seq_lens_encoder,
|
||||
@@ -544,7 +548,7 @@ DLL_EXPORT int draft_model_postprocess(Context* ctx,
|
||||
int bsz,
|
||||
int base_model_draft_token_len);
|
||||
|
||||
DLL_EXPORT int speculate_set_value_by_flag_and_id(Context* ctx,
|
||||
DLL_EXPORT int speculate_set_value_by_flag_and_id(api::Context* ctx,
|
||||
int64_t* pre_ids_all,
|
||||
const int64_t* accept_tokens,
|
||||
int* accept_num,
|
||||
@@ -557,7 +561,7 @@ DLL_EXPORT int speculate_set_value_by_flag_and_id(Context* ctx,
|
||||
int max_draft_tokens);
|
||||
|
||||
DLL_EXPORT int speculate_get_output_padding_offset(
|
||||
Context* ctx,
|
||||
api::Context* ctx,
|
||||
int* output_padding_offset,
|
||||
int* output_cum_offsets,
|
||||
const int* output_cum_offsets_tmp,
|
||||
@@ -578,7 +582,7 @@ DLL_EXPORT int top_p_candidates(api::Context* ctx,
|
||||
int max_cadidate_len,
|
||||
int max_seq_len);
|
||||
|
||||
DLL_EXPORT int speculate_free_and_reschedule(Context* ctx,
|
||||
DLL_EXPORT int speculate_free_and_reschedule(api::Context* ctx,
|
||||
bool* stop_flags,
|
||||
int* seq_lens_this_time,
|
||||
int* seq_lens_decoder,
|
||||
@@ -601,7 +605,7 @@ DLL_EXPORT int speculate_free_and_reschedule(Context* ctx,
|
||||
const int max_decoder_block_num,
|
||||
const int max_draft_tokens);
|
||||
|
||||
DLL_EXPORT int speculate_schedule_cache(Context* ctx,
|
||||
DLL_EXPORT int speculate_schedule_cache(api::Context* ctx,
|
||||
const int64_t* draft_tokens,
|
||||
int* block_tables,
|
||||
bool* stop_flags,
|
||||
@@ -625,7 +629,7 @@ DLL_EXPORT int speculate_schedule_cache(Context* ctx,
|
||||
const int block_num_per_seq,
|
||||
const bool prefill_one_step_stop);
|
||||
|
||||
DLL_EXPORT int speculate_update_v3(Context* ctx,
|
||||
DLL_EXPORT int speculate_update_v3(api::Context* ctx,
|
||||
int* seq_lens_encoder,
|
||||
int* seq_lens_decoder,
|
||||
bool* not_need_stop,
|
||||
@@ -641,7 +645,7 @@ DLL_EXPORT int speculate_update_v3(Context* ctx,
|
||||
const int max_bsz,
|
||||
const int max_draft_tokens);
|
||||
|
||||
DLL_EXPORT int speculate_update(Context* ctx,
|
||||
DLL_EXPORT int speculate_update(api::Context* ctx,
|
||||
int* seq_lens_encoder,
|
||||
int* seq_lens_decoder,
|
||||
bool* not_need_stop,
|
||||
@@ -674,7 +678,7 @@ DLL_EXPORT int rebuild_self_hidden_states(api::Context* ctx,
|
||||
int dim_embed,
|
||||
int elem_cnt);
|
||||
|
||||
DLL_EXPORT int speculate_get_logits(Context* ctx,
|
||||
DLL_EXPORT int speculate_get_logits(api::Context* ctx,
|
||||
float* draft_logits,
|
||||
int* next_token_num,
|
||||
int* batch_token_num,
|
||||
@@ -687,7 +691,7 @@ DLL_EXPORT int speculate_get_logits(Context* ctx,
|
||||
const int real_bsz,
|
||||
const int vocab_size);
|
||||
|
||||
DLL_EXPORT int update_attn_mask_offsets(Context* ctx,
|
||||
DLL_EXPORT int update_attn_mask_offsets(api::Context* ctx,
|
||||
int* attn_mask_offsets,
|
||||
const int* seq_lens_this_time,
|
||||
const int* seq_lens_encoder,
|
||||
@@ -723,6 +727,4 @@ DLL_EXPORT int speculate_limit_thinking_content_length_kernel(
|
||||
* --------------------------------------------*/
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace api
|
||||
} // namespace xpu
|
||||
} // namespace baidu
|
||||
} // namespace fastdeploy
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T>
|
||||
inline __device__ void update_bad_words_logit(_global_ptr_ T* logits) {
|
||||
@@ -54,5 +53,4 @@ __global__ void ban_bad_words(T* logits,
|
||||
_XPU_DEF__BAN_BAD_WORDS_(float);
|
||||
_XPU_DEF__BAN_BAD_WORDS_(float16);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#include "xpu/kernel/cluster.h"
|
||||
#include "xpu/kernel/cluster_debug.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
#define MAX_LM_SIZE 28672
|
||||
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
|
||||
// the stack space
|
||||
@@ -134,5 +133,4 @@ _XPU_DEF__EB_ADJUST_BATCH_(bfloat16, float);
|
||||
_XPU_DEF__EB_ADJUST_BATCH_(float, bfloat16);
|
||||
_XPU_DEF__EB_ADJUST_BATCH_(int32_t, int32_t);
|
||||
_XPU_DEF__EB_ADJUST_BATCH_(int64_t, int64_t);
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -1,8 +1,7 @@
|
||||
#include "xpu/kernel/cluster.h"
|
||||
#include "xpu/kernel/cluster_debug.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
#define MAX_LM_SIZE 28672
|
||||
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
|
||||
// the stack space
|
||||
@@ -98,5 +97,4 @@ _XPU_DEF__EB_GATHER_NEXT_TOKEN(bfloat16, float16);
|
||||
_XPU_DEF__EB_GATHER_NEXT_TOKEN(float16, bfloat16);
|
||||
_XPU_DEF__EB_GATHER_NEXT_TOKEN(bfloat16, float);
|
||||
_XPU_DEF__EB_GATHER_NEXT_TOKEN(float, bfloat16);
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
|
||||
int ret;
|
||||
@@ -322,5 +321,4 @@ __global__ void free_and_dispatch_block(bool *stop_flags,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void get_padding_offset(int *batch_id_per_token,
|
||||
int *cum_offsets_out,
|
||||
@@ -49,5 +48,4 @@ __global__ void get_padding_offset(int *batch_id_per_token,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -7,8 +7,7 @@
|
||||
#include "xpu/kernel/xtdk.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T>
|
||||
static inline __device__ bool is_in_end(const T id,
|
||||
@@ -94,5 +93,4 @@ __global__ void limit_thinking_content_length_kernel_v1(
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -7,8 +7,7 @@
|
||||
#include "xpu/kernel/xtdk.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void limit_thinking_content_length_kernel_v2(
|
||||
int64_t* next_tokens,
|
||||
@@ -89,5 +88,4 @@ __global__ void limit_thinking_content_length_kernel_v2(
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T>
|
||||
__global__ void min_length_logits_process(T* logits,
|
||||
@@ -64,5 +63,4 @@ __global__ void min_length_logits_process(T* logits,
|
||||
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float);
|
||||
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float16);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void ComputeOrderKernel(const int* seq_lens_this_time,
|
||||
const int* seq_lens_encoder,
|
||||
@@ -112,5 +111,4 @@ __global__ void ComputeOrderKernel(const int* seq_lens_this_time,
|
||||
LM2GM(&out_offset, output_token_num, sizeof(int));
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void ComputeSelfOrderKernel(const int* last_seq_lens_this_time,
|
||||
const int* seq_lens_this_time,
|
||||
@@ -69,5 +68,4 @@ __global__ void ComputeSelfOrderKernel(const int* last_seq_lens_this_time,
|
||||
LM2GM(&out_offset, output_token_num, sizeof(int));
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -4,8 +4,7 @@
|
||||
#include "xpu/kernel/cluster_primitive_template.h"
|
||||
#include "xpu/kernel/cluster_simd.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static inline __device__ int v_reduce(int32x16_t v) {
|
||||
auto v0 = vsrlp_int32x16(256, v);
|
||||
@@ -185,5 +184,4 @@ __global__ void draft_model_postprocess(const int64_t* base_model_draft_tokens,
|
||||
sync_cluster();
|
||||
}
|
||||
}
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -4,8 +4,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/cluster_simd.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
__global__ void draft_model_preprocess(int64_t* draft_tokens,
|
||||
int64_t* input_ids,
|
||||
bool* stop_flags,
|
||||
@@ -235,5 +234,4 @@ __global__ void draft_model_preprocess(int64_t* draft_tokens,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -3,8 +3,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/cluster_primitive_template.h"
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
inline __device__ bool is_in_end(const int64_t id,
|
||||
const __global_ptr__ int64_t* end_ids,
|
||||
int length) {
|
||||
@@ -108,5 +107,4 @@ __global__ void draft_model_update(const int64_t* inter_next_tokens,
|
||||
}
|
||||
mfence();
|
||||
}
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+3
-4
@@ -1,8 +1,7 @@
|
||||
#include "xpu/kernel/cluster.h"
|
||||
#include "xpu/kernel/cluster_debug.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
#define MAX_LM_SIZE 28672
|
||||
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
|
||||
// the stack space
|
||||
@@ -125,5 +124,5 @@ _XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(bfloat16, float16);
|
||||
_XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(float16, bfloat16);
|
||||
_XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(bfloat16, float);
|
||||
_XPU_DEF__EB_MTP_GATHER_NEXT_TOKEN(float, bfloat16);
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+3
-4
@@ -1,8 +1,7 @@
|
||||
#include "xpu/kernel/cluster.h"
|
||||
#include "xpu/kernel/cluster_debug.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
#define MAX_LM_SIZE 28672
|
||||
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
|
||||
// the stack space
|
||||
@@ -125,5 +124,5 @@ _XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(bfloat16, float16);
|
||||
_XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(float16, bfloat16);
|
||||
_XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(bfloat16, float);
|
||||
_XPU_DEF__EB_RECOVER_BATCH_SEQUENCE(float, bfloat16);
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
|
||||
int ret;
|
||||
@@ -205,5 +204,4 @@ __global__ void mtp_free_and_dispatch_block(bool *base_model_stop_flags,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T>
|
||||
__global__ void RebuildAppendPaddingKernel(const T *full_hidden_states,
|
||||
@@ -86,5 +85,4 @@ _XPU_DEF_REBUILD_APPEND_PADDING_KERNEL(bfloat16);
|
||||
_XPU_DEF_REBUILD_APPEND_PADDING_KERNEL(float16);
|
||||
_XPU_DEF_REBUILD_APPEND_PADDING_KERNEL(float);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T>
|
||||
__global__ void rebuildHiddenStatesKernel(const T* input,
|
||||
@@ -61,5 +60,4 @@ _XPU_DEF_REBUILD_HIDDEN_STATES_KERNEL(bfloat16);
|
||||
_XPU_DEF_REBUILD_HIDDEN_STATES_KERNEL(float);
|
||||
_XPU_DEF_REBUILD_HIDDEN_STATES_KERNEL(float16);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T>
|
||||
__global__ void rebuildSelfHiddenStatesKernel(
|
||||
@@ -52,5 +51,4 @@ _XPU_DEF_REBUILD_SELF_HIDDEN_STATES_KERNEL(bfloat16);
|
||||
_XPU_DEF_REBUILD_SELF_HIDDEN_STATES_KERNEL(float);
|
||||
_XPU_DEF_REBUILD_SELF_HIDDEN_STATES_KERNEL(float16);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__attribute__((global)) void recover_spec_decode_task(
|
||||
bool *stop_flags,
|
||||
@@ -71,5 +70,4 @@ __attribute__((global)) void recover_spec_decode_task(
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T>
|
||||
inline __device__ void update_bad_words_logit(_global_ptr_ T* logits) {
|
||||
@@ -74,5 +73,4 @@ _XPU_DEF__BAN_BAD_WORDS_(float);
|
||||
_XPU_DEF__BAN_BAD_WORDS_(float16);
|
||||
_XPU_DEF__BAN_BAD_WORDS_(bfloat16);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -20,8 +20,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void speculate_clear_accept_nums(int* accept_num,
|
||||
const int* seq_lens_decoder,
|
||||
@@ -40,5 +39,4 @@ __global__ void speculate_clear_accept_nums(int* accept_num,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
|
||||
int ret;
|
||||
@@ -333,5 +332,4 @@ __global__ void speculate_free_and_dispatch_block(
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static __device__ inline int loada_float(_shared_ptr_ const int *ptr) {
|
||||
int ret;
|
||||
@@ -284,5 +283,4 @@ __global__ void speculate_free_and_reschedule(bool *stop_flags,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__device__ void prefix_sum(__shared_ptr__ int* sm_seq_lens_encoder,
|
||||
__shared_ptr__ int* sm_seq_lens_this_time,
|
||||
@@ -127,5 +126,4 @@ __global__ void speculate_get_logits(float* draft_logits,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -20,8 +20,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void speculate_get_output_padding_offset(
|
||||
int* output_padding_offset,
|
||||
@@ -59,5 +58,4 @@ __global__ void speculate_get_output_padding_offset(
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -21,8 +21,7 @@
|
||||
#include "xpu/kernel/cluster_simd.h"
|
||||
#include "xpu/kernel/xtdk.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T>
|
||||
__global__ void speculate_remove_padding(T* output_data,
|
||||
@@ -118,5 +117,4 @@ _XPU_DEF_SPECULATE_KERNELS_(float16);
|
||||
_XPU_DEF_SPECULATE_KERNELS_(bfloat16);
|
||||
_XPU_DEF_SPECULATE_KERNELS_(int64_t);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -20,8 +20,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void speculate_get_seq_lens_output(int* seq_lens_output,
|
||||
const int* seq_lens_this_time,
|
||||
@@ -54,5 +53,4 @@ __global__ void speculate_get_seq_lens_output(int* seq_lens_output,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -3,8 +3,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T>
|
||||
__global__ void speculate_min_length_logits_process(
|
||||
@@ -87,5 +86,4 @@ _XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float);
|
||||
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(float16);
|
||||
_XPU_DEF__UPDATE_LOGITS_REPEAT_TIMES_(bfloat16);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static __device__ inline int loada_float(_shared_ptr_ const int* ptr) {
|
||||
int ret;
|
||||
@@ -160,5 +159,4 @@ __global__ void speculate_recover_block(int* recover_block_list, // [bsz]
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -3,8 +3,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
|
||||
int res;
|
||||
@@ -175,5 +174,4 @@ __global__ void speculate_schedule_cache(const int64_t *draft_tokens,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+3
-4
@@ -4,8 +4,7 @@
|
||||
#include "xpu/kernel/xtdk_math.h"
|
||||
#include "xpu/kernel/xtdk_simd.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void speculate_set_stop_value_multi_seqs(bool *stop_flags,
|
||||
int64_t *accept_tokens,
|
||||
@@ -99,5 +98,5 @@ __global__ void speculate_set_stop_value_multi_seqs(bool *stop_flags,
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -20,8 +20,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
|
||||
const int64_t *accept_tokens,
|
||||
@@ -84,5 +83,4 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -20,8 +20,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/cluster_primitive_template.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
|
||||
int res;
|
||||
@@ -198,5 +197,4 @@ template __global__ void speculate_update<512>(int *seq_lens_encoder,
|
||||
const int max_bsz,
|
||||
const int max_draft_tokens);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -3,8 +3,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/cluster_primitive_template.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static __device__ void atomic_add(_shared_ptr_ int *ptr, int v) {
|
||||
bool fail = true;
|
||||
@@ -264,5 +263,4 @@ __global__ void speculate_update_repeat_times(const int64_t *pre_ids,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -20,8 +20,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/cluster_primitive_template.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
|
||||
int res;
|
||||
@@ -198,5 +197,4 @@ template __global__ void speculate_update_v3<512>(int *seq_lens_encoder,
|
||||
const int max_bsz,
|
||||
const int max_draft_tokens);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -3,8 +3,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__device__ void do_cast(const int *xlm, float *ylm, int64_t len) {
|
||||
for (int64_t i = 0; i < len; i += 32) {
|
||||
@@ -279,5 +278,4 @@ _XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float);
|
||||
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float16);
|
||||
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(bfloat16);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -6,8 +6,8 @@
|
||||
// #include "xpu/internal/aten/xrand_philox4x32_10.h"
|
||||
// #include "xpu/internal/aten/xrand_uniform.h"
|
||||
// #include "xpu/internal/aten/xrand_global.h"
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static inline __device__ int v_reduce(int32x16_t &v0, int32x16_t &v1) {
|
||||
int res;
|
||||
v1 = vvadd_int32x16(v0, v1);
|
||||
@@ -380,5 +380,5 @@ SPECULATE_VERIFY_INSTANTIATE(true, true)
|
||||
SPECULATE_VERIFY_INSTANTIATE(true, false)
|
||||
SPECULATE_VERIFY_INSTANTIATE(false, true)
|
||||
SPECULATE_VERIFY_INSTANTIATE(false, false)
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -3,8 +3,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/cluster_primitive_template.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T, int MaxLength, int TopPBeamTopK>
|
||||
__device__ void top_p_candidates_big_n(
|
||||
@@ -345,5 +344,4 @@ _XPU_DEF_TOP_P_CANDIDATES_KERNEL(float, 2, 5);
|
||||
_XPU_DEF_TOP_P_CANDIDATES_KERNEL(float, 2, 8);
|
||||
_XPU_DEF_TOP_P_CANDIDATES_KERNEL(float, 2, 10);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -3,8 +3,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void update_attn_mask_offsets(int* attn_mask_offsets,
|
||||
const int* seq_lens_this_time,
|
||||
@@ -73,5 +72,4 @@ __global__ void update_attn_mask_offsets(int* attn_mask_offsets,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -5,8 +5,7 @@
|
||||
// TODO()
|
||||
// #include "xpu/quant_xpu.h"
|
||||
// #include "xpu_plugin.h"
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
#define MAX_SM_SIZE 32768
|
||||
// One core has 32KB LM(group LM), MAX_LM_SIZE = (32 - 4)KB / 2 = 30720, 4KB is
|
||||
// the stack space
|
||||
@@ -1065,5 +1064,4 @@ _XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(float, float, int8_t, 64);
|
||||
_XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(float16, float, int8_t, 128);
|
||||
_XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(bfloat16, float, int8_t, 128);
|
||||
_XPU_DEF__QUANT2d_PER_CHANNEL_CACHED(float, float, int8_t, 128);
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static __device__ inline int loada_float(_shared_ptr_ const int* ptr) {
|
||||
int ret;
|
||||
@@ -150,5 +149,4 @@ __global__ void recover_block(int* recover_block_list, // [bsz]
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void recover_decode_task(bool *stop_flags,
|
||||
int *seq_lens_this_time,
|
||||
@@ -39,5 +38,4 @@ __global__ void recover_decode_task(bool *stop_flags,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void remove_padding(int64_t *x_remove_padding,
|
||||
const int64_t *input_data,
|
||||
@@ -36,5 +35,4 @@ __global__ void remove_padding(int64_t *x_remove_padding,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -3,8 +3,7 @@
|
||||
#include "xpu/kernel/xtdk_math.h"
|
||||
#include "xpu/kernel/xtdk_simd.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T>
|
||||
static inline __device__ bool is_in_end(const T id,
|
||||
@@ -97,5 +96,4 @@ __global__ void set_stop_value_multi_ends(bool* stop_flags,
|
||||
const bool prefill_one_step_stop);
|
||||
_XPU_DEF__SET_VALUE_BY_FLAGS_BOTH_(int64_t);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
#include "xpu/kernel/cluster.h"
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void set_value_by_flags_and_idx(const bool* stop_flags,
|
||||
int64_t* pre_ids_all,
|
||||
@@ -46,5 +45,4 @@ __global__ void set_value_by_flags_and_idx(const bool* stop_flags,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -3,8 +3,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
template <typename T>
|
||||
static __device__ inline void text_image_gather(
|
||||
@@ -215,5 +214,4 @@ __global__ void text_image_gather_scatter(T* input,
|
||||
|
||||
_XPU_DEF_TEXT_IMAGE_GATHER_SCATTER(bfloat16);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -20,8 +20,7 @@
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
#include "xpu/kernel/cluster_primitive_template.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static __device__ void do_calc(const _shared_ptr_ int* lm_x,
|
||||
int* lm_y1,
|
||||
@@ -110,5 +109,4 @@ __global__ void text_image_index_out_kernel(const int* token_type_ids, // x
|
||||
buffer_ptr_y2.gm_store(image_index + i, read_size);
|
||||
}
|
||||
}
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void update_inputs(bool *not_need_stop,
|
||||
int *seq_lens_this_time,
|
||||
@@ -71,5 +70,4 @@ __global__ void update_inputs(bool *not_need_stop,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -7,8 +7,7 @@
|
||||
#include "xpu/kernel/xtdk.h"
|
||||
#include "xpu/kernel/xtdk_io.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__global__ void update_inputs_v1(bool* not_need_stop,
|
||||
int* seq_lens_this_time,
|
||||
@@ -148,5 +147,4 @@ __global__ void update_inputs_v1(bool* not_need_stop,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -2,8 +2,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
static __device__ void atomic_add(_shared_ptr_ int *ptr, int v) {
|
||||
bool fail = true;
|
||||
@@ -71,5 +70,4 @@ __global__ void update_repeat_times(const int64_t *pre_ids,
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
+2
-4
@@ -3,8 +3,7 @@
|
||||
#include "xpu/kernel/cluster_partition.h"
|
||||
#include "xpu/kernel/cluster_primitive.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__device__ void do_cast(const int *xlm, float *ylm, int64_t len) {
|
||||
for (int64_t i = 0; i < len; i += 32) {
|
||||
@@ -222,5 +221,4 @@ __global__ void update_value_by_repeat_times_simd(
|
||||
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float);
|
||||
_XPU_DEF__UPDATE_VALUE_BY_REPEAT_TIMES_SIMD(float16);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
@@ -17,8 +17,7 @@
|
||||
#include "xpu/refactor/impl_public/wrapper_check.h"
|
||||
#include "xpu/xdnn.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
template <typename TX, typename TY>
|
||||
__attribute__((global)) void eb_adjust_batch(TX *src,
|
||||
TY *dst,
|
||||
@@ -29,12 +28,9 @@ __attribute__((global)) void eb_adjust_batch(TX *src,
|
||||
int en_batch,
|
||||
int de_batch,
|
||||
int64_t copy_size);
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
namespace baidu {
|
||||
namespace xpu {
|
||||
namespace api {
|
||||
namespace fastdeploy {
|
||||
namespace plugin {
|
||||
|
||||
template <typename TX, typename TY>
|
||||
@@ -93,10 +89,10 @@ static int xpu3_wrapper(api::Context *ctx,
|
||||
int en_batch,
|
||||
int de_batch,
|
||||
int64_t hidden_dim) {
|
||||
using XPU_INDEX_TYPE_TX = typename XPUIndexType<TX>::type;
|
||||
using XPU_INDEX_TYPE_TY = typename XPUIndexType<TY>::type;
|
||||
using XPU_INDEX_TYPE_TX = typename api::XPUIndexType<TX>::type;
|
||||
using XPU_INDEX_TYPE_TY = typename api::XPUIndexType<TY>::type;
|
||||
auto eb_adjust_batch_kernel =
|
||||
xpu3::plugin::eb_adjust_batch<XPU_INDEX_TYPE_TX, XPU_INDEX_TYPE_TY>;
|
||||
fd_xpu3::eb_adjust_batch<XPU_INDEX_TYPE_TX, XPU_INDEX_TYPE_TY>;
|
||||
// NOTE: Don't change 16 to 64, because kernel use gsm
|
||||
int32_t ret_xre =
|
||||
eb_adjust_batch_kernel<<<ctx->ncluster(), 16, ctx->xpu_stream>>>(
|
||||
@@ -226,6 +222,4 @@ INSTANTIATION_EB_ADJUST_BATCH(float, bfloat16);
|
||||
INSTANTIATION_EB_ADJUST_BATCH(int32_t, int32_t);
|
||||
INSTANTIATION_EB_ADJUST_BATCH(int64_t, int64_t);
|
||||
} // namespace plugin
|
||||
} // namespace api
|
||||
} // namespace xpu
|
||||
} // namespace baidu
|
||||
} // namespace fastdeploy
|
||||
|
||||
@@ -17,8 +17,7 @@
|
||||
#include "xpu/refactor/impl_public/wrapper_check.h"
|
||||
#include "xpu/xdnn.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
template <typename TX, typename TY>
|
||||
__attribute__((global)) void eb_gather_next_token(TX *src,
|
||||
TY *dst,
|
||||
@@ -28,12 +27,9 @@ __attribute__((global)) void eb_gather_next_token(TX *src,
|
||||
int en_batch,
|
||||
int de_batch,
|
||||
int64_t copy_size);
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
namespace baidu {
|
||||
namespace xpu {
|
||||
namespace api {
|
||||
namespace fastdeploy {
|
||||
namespace plugin {
|
||||
template <typename TX, typename TY>
|
||||
static int cpu_wrapper(api::Context *ctx,
|
||||
@@ -74,7 +70,7 @@ static int xpu3_wrapper(api::Context *ctx,
|
||||
int en_batch,
|
||||
int de_batch,
|
||||
int64_t hidden_dim) {
|
||||
auto eb_gather_next_token_kernel = xpu3::plugin::eb_gather_next_token<TX, TY>;
|
||||
auto eb_gather_next_token_kernel = fd_xpu3::eb_gather_next_token<TX, TY>;
|
||||
// NOTE: Don't change 16 to 64, because kernel use gsm
|
||||
int32_t ret_xre =
|
||||
eb_gather_next_token_kernel<<<ctx->ncluster(), 16, ctx->xpu_stream>>>(
|
||||
@@ -187,6 +183,4 @@ INSTANTIATION_EB_GATHER_NEXT_TOKEN(float16, bfloat16);
|
||||
INSTANTIATION_EB_GATHER_NEXT_TOKEN(bfloat16, float);
|
||||
INSTANTIATION_EB_GATHER_NEXT_TOKEN(float, bfloat16);
|
||||
} // namespace plugin
|
||||
} // namespace api
|
||||
} // namespace xpu
|
||||
} // namespace baidu
|
||||
} // namespace fastdeploy
|
||||
|
||||
@@ -17,8 +17,7 @@
|
||||
#include "xpu/plugin.h"
|
||||
#include "xpu/refactor/impl_public/wrapper_check.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__attribute__((global)) void free_and_dispatch_block(
|
||||
bool *stop_flags,
|
||||
@@ -42,15 +41,12 @@ __attribute__((global)) void free_and_dispatch_block(
|
||||
const int block_num_per_seq,
|
||||
const int max_decoder_block_num);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
namespace baidu {
|
||||
namespace xpu {
|
||||
namespace api {
|
||||
namespace fastdeploy {
|
||||
namespace plugin {
|
||||
|
||||
static int cpu_wrapper(Context *ctx,
|
||||
static int cpu_wrapper(api::Context *ctx,
|
||||
bool *stop_flags,
|
||||
int *seq_lens_this_time,
|
||||
int *seq_lens_decoder,
|
||||
@@ -171,7 +167,7 @@ static int cpu_wrapper(Context *ctx,
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
static int xpu3_wrapper(Context *ctx,
|
||||
static int xpu3_wrapper(api::Context *ctx,
|
||||
bool *stop_flags,
|
||||
int *seq_lens_this_time,
|
||||
int *seq_lens_decoder,
|
||||
@@ -192,8 +188,8 @@ static int xpu3_wrapper(Context *ctx,
|
||||
const int block_size,
|
||||
const int block_num_per_seq,
|
||||
const int max_decoder_block_num) {
|
||||
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
|
||||
auto free_and_dispatch_block_kernel = xpu3::plugin::free_and_dispatch_block;
|
||||
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
|
||||
auto free_and_dispatch_block_kernel = fd_xpu3::free_and_dispatch_block;
|
||||
int32_t ret_xre =
|
||||
free_and_dispatch_block_kernel<<<ctx->ncluster(), 64, ctx->xpu_stream>>>(
|
||||
stop_flags,
|
||||
@@ -220,7 +216,7 @@ static int xpu3_wrapper(Context *ctx,
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
int free_and_dispatch_block(Context *ctx,
|
||||
int free_and_dispatch_block(api::Context *ctx,
|
||||
bool *stop_flags,
|
||||
int *seq_lens_this_time,
|
||||
int *seq_lens_decoder,
|
||||
@@ -285,7 +281,7 @@ int free_and_dispatch_block(Context *ctx,
|
||||
block_num_per_seq,
|
||||
max_decoder_block_num);
|
||||
}
|
||||
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
|
||||
if (ctx->dev().type() == api::kXPU3) {
|
||||
return xpu3_wrapper(ctx,
|
||||
stop_flags,
|
||||
seq_lens_this_time,
|
||||
@@ -312,6 +308,4 @@ int free_and_dispatch_block(Context *ctx,
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace api
|
||||
} // namespace xpu
|
||||
} // namespace baidu
|
||||
} // namespace fastdeploy
|
||||
|
||||
@@ -17,8 +17,7 @@
|
||||
#include "xpu/plugin.h"
|
||||
#include "xpu/refactor/impl_public/wrapper_check.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__attribute__((global)) void get_padding_offset(int *padding_offset,
|
||||
int *cum_offsets_out,
|
||||
@@ -35,12 +34,9 @@ __attribute__((global)) void remove_padding(int64_t *x_remove_padding,
|
||||
const int sequence_length,
|
||||
const int bs);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
namespace baidu {
|
||||
namespace xpu {
|
||||
namespace api {
|
||||
namespace fastdeploy {
|
||||
namespace plugin {
|
||||
|
||||
static int get_padding_offset_cpu(int *padding_offset,
|
||||
@@ -80,7 +76,7 @@ static int remove_padding_cpu(int64_t *x_remove_padding,
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
static int cpu_wrapper(Context *ctx,
|
||||
static int cpu_wrapper(api::Context *ctx,
|
||||
int *padding_offset,
|
||||
int *cum_offsets_out,
|
||||
int *cu_seqlens_q,
|
||||
@@ -104,7 +100,7 @@ static int cpu_wrapper(Context *ctx,
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
static int xpu3_wrapper(Context *ctx,
|
||||
static int xpu3_wrapper(api::Context *ctx,
|
||||
int *padding_offset,
|
||||
int *cum_offsets_out,
|
||||
int *cu_seqlens_q,
|
||||
@@ -115,9 +111,9 @@ static int xpu3_wrapper(Context *ctx,
|
||||
const int *seq_lens,
|
||||
const int max_seq_len,
|
||||
const int bs) {
|
||||
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
|
||||
auto get_padding_offset = xpu3::plugin::get_padding_offset;
|
||||
auto remove_padding = xpu3::plugin::remove_padding;
|
||||
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
|
||||
auto get_padding_offset = fd_xpu3::get_padding_offset;
|
||||
auto remove_padding = fd_xpu3::remove_padding;
|
||||
int32_t ret_xre =
|
||||
get_padding_offset<<<ctx->ncluster(), 64, ctx->xpu_stream>>>(
|
||||
padding_offset,
|
||||
@@ -140,7 +136,7 @@ static int xpu3_wrapper(Context *ctx,
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
int get_padding_offset(Context *ctx,
|
||||
int get_padding_offset(api::Context *ctx,
|
||||
int *padding_offset,
|
||||
int *cum_offsets_out,
|
||||
int *cu_seqlens_q,
|
||||
@@ -171,7 +167,7 @@ int get_padding_offset(Context *ctx,
|
||||
max_seq_len,
|
||||
bs);
|
||||
}
|
||||
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
|
||||
if (ctx->dev().type() == api::kXPU3) {
|
||||
return xpu3_wrapper(ctx,
|
||||
padding_offset,
|
||||
cum_offsets_out,
|
||||
@@ -188,6 +184,4 @@ int get_padding_offset(Context *ctx,
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace api
|
||||
} // namespace xpu
|
||||
} // namespace baidu
|
||||
} // namespace fastdeploy
|
||||
|
||||
@@ -17,8 +17,7 @@
|
||||
#include "xpu/plugin.h"
|
||||
#include "xpu/refactor/impl_public/wrapper_check.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__attribute__((global)) void limit_thinking_content_length_kernel_v1(
|
||||
int64_t* next_tokens,
|
||||
@@ -30,15 +29,12 @@ __attribute__((global)) void limit_thinking_content_length_kernel_v1(
|
||||
const int64_t think_end_id,
|
||||
const int bs,
|
||||
const int eos_token_id_len);
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
namespace baidu {
|
||||
namespace xpu {
|
||||
namespace api {
|
||||
namespace fastdeploy {
|
||||
namespace plugin {
|
||||
|
||||
static int cpu_wrapper(Context* ctx,
|
||||
static int cpu_wrapper(api::Context* ctx,
|
||||
int64_t* next_tokens,
|
||||
const int* max_think_lens,
|
||||
const int64_t* step_idx,
|
||||
@@ -80,7 +76,7 @@ static int cpu_wrapper(Context* ctx,
|
||||
}
|
||||
return api::SUCCESS;
|
||||
}
|
||||
static int xpu3_wrapper(Context* ctx,
|
||||
static int xpu3_wrapper(api::Context* ctx,
|
||||
int64_t* next_tokens,
|
||||
const int* max_think_lens,
|
||||
const int64_t* step_idx,
|
||||
@@ -90,9 +86,9 @@ static int xpu3_wrapper(Context* ctx,
|
||||
const int64_t think_end_id,
|
||||
const int bs,
|
||||
const int eos_token_id_len) {
|
||||
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
|
||||
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
|
||||
auto limit_thinking_content_length_kernel_v1 =
|
||||
xpu3::plugin::limit_thinking_content_length_kernel_v1;
|
||||
fd_xpu3::limit_thinking_content_length_kernel_v1;
|
||||
int32_t ret_xre =
|
||||
limit_thinking_content_length_kernel_v1<<<1, 64, ctx->xpu_stream>>>(
|
||||
reinterpret_cast<XPU_INT64*>(next_tokens),
|
||||
@@ -108,7 +104,7 @@ static int xpu3_wrapper(Context* ctx,
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
int limit_thinking_content_length_kernel_v1(Context* ctx,
|
||||
int limit_thinking_content_length_kernel_v1(api::Context* ctx,
|
||||
int64_t* next_tokens,
|
||||
const int* max_think_lens,
|
||||
const int64_t* step_idx,
|
||||
@@ -141,7 +137,7 @@ int limit_thinking_content_length_kernel_v1(Context* ctx,
|
||||
bs,
|
||||
eos_token_id_len);
|
||||
}
|
||||
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
|
||||
if (ctx->dev().type() == api::kXPU3) {
|
||||
return xpu3_wrapper(ctx,
|
||||
next_tokens,
|
||||
max_think_lens,
|
||||
@@ -157,6 +153,4 @@ int limit_thinking_content_length_kernel_v1(Context* ctx,
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace api
|
||||
} // namespace xpu
|
||||
} // namespace baidu
|
||||
} // namespace fastdeploy
|
||||
|
||||
@@ -17,8 +17,7 @@
|
||||
#include "xpu/plugin.h"
|
||||
#include "xpu/refactor/impl_public/wrapper_check.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
|
||||
__attribute__((global)) void limit_thinking_content_length_kernel_v2(
|
||||
int64_t* next_tokens,
|
||||
@@ -30,15 +29,12 @@ __attribute__((global)) void limit_thinking_content_length_kernel_v2(
|
||||
const int64_t line_break_id,
|
||||
const int bs);
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
namespace baidu {
|
||||
namespace xpu {
|
||||
namespace api {
|
||||
namespace fastdeploy {
|
||||
namespace plugin {
|
||||
|
||||
static int cpu_wrapper(Context* ctx,
|
||||
static int cpu_wrapper(api::Context* ctx,
|
||||
int64_t* next_tokens,
|
||||
const int* max_think_lens,
|
||||
const int64_t* step_idx,
|
||||
@@ -86,7 +82,7 @@ static int cpu_wrapper(Context* ctx,
|
||||
}
|
||||
return api::SUCCESS;
|
||||
}
|
||||
static int xpu3_wrapper(Context* ctx,
|
||||
static int xpu3_wrapper(api::Context* ctx,
|
||||
int64_t* next_tokens,
|
||||
const int* max_think_lens,
|
||||
const int64_t* step_idx,
|
||||
@@ -95,9 +91,9 @@ static int xpu3_wrapper(Context* ctx,
|
||||
const int64_t think_end_id,
|
||||
const int64_t line_break_id,
|
||||
const int bs) {
|
||||
using XPU_INT64 = typename XPUIndexType<int64_t>::type;
|
||||
using XPU_INT64 = typename api::XPUIndexType<int64_t>::type;
|
||||
auto limit_thinking_content_length_kernel_v2 =
|
||||
xpu3::plugin::limit_thinking_content_length_kernel_v2;
|
||||
fd_xpu3::limit_thinking_content_length_kernel_v2;
|
||||
int32_t ret_xre =
|
||||
limit_thinking_content_length_kernel_v2<<<1, 64, ctx->xpu_stream>>>(
|
||||
reinterpret_cast<XPU_INT64*>(next_tokens),
|
||||
@@ -112,7 +108,7 @@ static int xpu3_wrapper(Context* ctx,
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
int limit_thinking_content_length_kernel_v2(Context* ctx,
|
||||
int limit_thinking_content_length_kernel_v2(api::Context* ctx,
|
||||
int64_t* next_tokens,
|
||||
const int* max_think_lens,
|
||||
const int64_t* step_idx,
|
||||
@@ -142,7 +138,7 @@ int limit_thinking_content_length_kernel_v2(Context* ctx,
|
||||
line_break_id,
|
||||
bs);
|
||||
}
|
||||
if (ctx->dev().type() == api::kXPU2 || ctx->dev().type() == api::kXPU3) {
|
||||
if (ctx->dev().type() == api::kXPU3) {
|
||||
return xpu3_wrapper(ctx,
|
||||
next_tokens,
|
||||
max_think_lens,
|
||||
@@ -157,6 +153,4 @@ int limit_thinking_content_length_kernel_v2(Context* ctx,
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace api
|
||||
} // namespace xpu
|
||||
} // namespace baidu
|
||||
} // namespace fastdeploy
|
||||
|
||||
@@ -15,8 +15,7 @@
|
||||
#include "xpu/plugin.h"
|
||||
#include "xpu/refactor/impl_public/wrapper_check.h"
|
||||
|
||||
namespace xpu3 {
|
||||
namespace plugin {
|
||||
namespace fd_xpu3 {
|
||||
__attribute__((global)) void ComputeOrderKernel(
|
||||
const int* seq_lens_this_time,
|
||||
const int* seq_lens_encoder,
|
||||
@@ -28,15 +27,12 @@ __attribute__((global)) void ComputeOrderKernel(
|
||||
const int bsz,
|
||||
const int actual_draft_token_num,
|
||||
const int input_token_num);
|
||||
} // namespace plugin
|
||||
} // namespace xpu3
|
||||
} // namespace fd_xpu3
|
||||
|
||||
namespace baidu {
|
||||
namespace xpu {
|
||||
namespace api {
|
||||
namespace fastdeploy {
|
||||
namespace plugin {
|
||||
|
||||
static int cpu_wrapper(Context* ctx,
|
||||
static int cpu_wrapper(api::Context* ctx,
|
||||
const int* seq_lens_this_time,
|
||||
const int* seq_lens_encoder,
|
||||
const int* base_model_seq_lens_this_time,
|
||||
@@ -97,7 +93,7 @@ static int cpu_wrapper(Context* ctx,
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
static int xpu3_wrapper(Context* ctx,
|
||||
static int xpu3_wrapper(api::Context* ctx,
|
||||
const int* seq_lens_this_time,
|
||||
const int* seq_lens_encoder,
|
||||
const int* base_model_seq_lens_this_time,
|
||||
@@ -108,7 +104,7 @@ static int xpu3_wrapper(Context* ctx,
|
||||
const int bsz,
|
||||
const int actual_draft_token_num,
|
||||
const int input_token_num) {
|
||||
int32_t ret_xre = xpu3::plugin::ComputeOrderKernel<<<1, 1, ctx->xpu_stream>>>(
|
||||
int32_t ret_xre = fd_xpu3::ComputeOrderKernel<<<1, 1, ctx->xpu_stream>>>(
|
||||
seq_lens_this_time,
|
||||
seq_lens_encoder,
|
||||
base_model_seq_lens_this_time,
|
||||
@@ -123,7 +119,7 @@ static int xpu3_wrapper(Context* ctx,
|
||||
return api::SUCCESS;
|
||||
}
|
||||
|
||||
int compute_order(Context* ctx,
|
||||
int compute_order(api::Context* ctx,
|
||||
const int* seq_lens_this_time,
|
||||
const int* seq_lens_encoder,
|
||||
const int* base_model_seq_lens_this_time,
|
||||
@@ -187,6 +183,4 @@ int compute_order(Context* ctx,
|
||||
}
|
||||
|
||||
} // namespace plugin
|
||||
} // namespace api
|
||||
} // namespace xpu
|
||||
} // namespace baidu
|
||||
} // namespace fastdeploy
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user