mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
【Hackathon 10th Spring No.45】FastDeploy 支持在 T4/V100 硬件的编译 -part (#6488)
* fix(custom_ops): gate unsupported ops for sm70/sm75 build
* fix(custom_ops): gate deepgemm exports to sm75+ only
* [BugFix][OP] deduplicate CUDA sources to avoid moe_deepgemm multiple definition
* revert two custom_ops files to 352f922f9
This commit is contained in:
@@ -1244,6 +1244,7 @@ void PerTokenGroupQuantFp8(const paddle::Tensor& input,
|
||||
bool scale_ue8m0);
|
||||
|
||||
PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
#ifdef ENABLE_SM80_EXT_OPS
|
||||
m.def("get_expert_token_num",
|
||||
&GetExpertTokenNum,
|
||||
py::arg("topk_ids"),
|
||||
@@ -1266,6 +1267,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
py::arg("enable_softmax_top_k_fused"),
|
||||
py::arg("redundant_ep_rank_num_plus_one"),
|
||||
"moe export RedundantTopKSelect function");
|
||||
#endif
|
||||
|
||||
/**
|
||||
* open_shm_and_get_meta_signal.cc
|
||||
@@ -1291,9 +1293,11 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
py::arg("wait_flag"),
|
||||
"get_output_kv_signal function");
|
||||
|
||||
#ifdef ENABLE_SM75_EXT_OPS
|
||||
m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute");
|
||||
m.def(
|
||||
"moe_deepgemm_depermute", &MoEDeepGEMMDePermute, "MoEDeepGEMMDePermute");
|
||||
#endif
|
||||
/**
|
||||
* alloc_cache_pinned.cc
|
||||
* cuda_host_alloc
|
||||
@@ -1307,6 +1311,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
m.def(
|
||||
"cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr"));
|
||||
py::register_exception<CudaError>(m, "CudaError");
|
||||
#ifdef ENABLE_SM80_EXT_OPS
|
||||
/**
|
||||
* append_attention.cu
|
||||
* append_attention
|
||||
@@ -1315,11 +1320,13 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
m.def("append_attention_with_output",
|
||||
&AppendAttentionWithOutput,
|
||||
"append attention with output function");
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_FLASH_MASK_ATTENTION
|
||||
m.def("flash_mask_attention", &FlashAttentionMask, "flash_mask_attention");
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SM80_EXT_OPS
|
||||
/**
|
||||
* gqa_rope_write_cache.cu
|
||||
* gqa_rope_write_cache
|
||||
@@ -1334,6 +1341,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
m.def("pre_cache_len_concat",
|
||||
&PreCacheLenConcat,
|
||||
"pre_cache len concat function");
|
||||
|
||||
/**
|
||||
* moe/fused_moe/fused_moe.cu
|
||||
* fused_moe
|
||||
@@ -1389,6 +1397,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
py::arg("norm_topk_prob"),
|
||||
py::arg("routed_scaling_factor"),
|
||||
"ep moe export combine function");
|
||||
#endif
|
||||
|
||||
m.def("per_token_quant",
|
||||
&PerTokenQuant,
|
||||
@@ -1445,6 +1454,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
"machete supported schedules function");
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SM80_EXT_OPS
|
||||
/**
|
||||
* moe/fused_moe/moe_topk_select.cu
|
||||
* moe_topk_select
|
||||
@@ -1486,6 +1496,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
py::arg("norm_topk_prob"),
|
||||
py::arg("routed_scaling_factor"),
|
||||
"moe export reduce function");
|
||||
#endif
|
||||
|
||||
/**
|
||||
* dequant_int8.cu
|
||||
@@ -1509,6 +1520,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
&OpenShmAndGetMetaSignalFunc,
|
||||
"open_shm_and_get_meta_signal function");
|
||||
|
||||
#ifdef ENABLE_SM80_EXT_OPS
|
||||
/**
|
||||
* append_attn/get_block_shape_and_split_kv_block.cu
|
||||
* get_block_shape_and_split_kv_block
|
||||
@@ -1516,6 +1528,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
m.def("get_block_shape_and_split_kv_block",
|
||||
&GetBlockShapeAndSplitKVBlock,
|
||||
"get_block_shape_and_split_kv_block function");
|
||||
#endif
|
||||
|
||||
/**
|
||||
* get_padding_offset.cu
|
||||
@@ -1567,9 +1580,11 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
&RecoverDecodeTask,
|
||||
"recover decode task for scheduler v1 function");
|
||||
|
||||
#ifdef ENABLE_SM80_EXT_OPS
|
||||
m.def("group_swiglu_with_masked",
|
||||
&GroupSwigluWithMasked,
|
||||
"group_swiglu_with_masked function");
|
||||
#endif
|
||||
|
||||
m.def("text_image_index_out",
|
||||
&TextImageIndexOut,
|
||||
@@ -1579,7 +1594,9 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
&TextImageGatherScatter,
|
||||
"text_image_gather_scatter function");
|
||||
|
||||
#ifdef ENABLE_SM80_EXT_OPS
|
||||
m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func);
|
||||
|
||||
m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel);
|
||||
|
||||
m.def("MoeWna16MarlinGemmApi",
|
||||
@@ -1609,6 +1626,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
py::arg("use_atomic_add"),
|
||||
py::arg("use_fp32_reduce"),
|
||||
py::arg("is_zp_float"));
|
||||
#endif
|
||||
|
||||
m.def("get_position_ids_and_mask_encoder_batch",
|
||||
&GetPositionIdsAndMaskEncoderBatch,
|
||||
@@ -1651,6 +1669,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
py::arg("input"),
|
||||
py::arg("scales"),
|
||||
py::arg("scale_ub"));
|
||||
#ifdef ENABLE_SM80_EXT_OPS
|
||||
m.def("decode_mla_write_cache",
|
||||
&DecodeMLAWriteCacheKernel,
|
||||
"decode_mla_write_cache function");
|
||||
@@ -1658,14 +1677,17 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
m.def("prefill_mla_write_cache",
|
||||
&PrefillMLAWriteCacheKernel,
|
||||
"prefill_mla_write_cache function");
|
||||
#endif
|
||||
|
||||
m.def("fused_rotary_position_encoding",
|
||||
&FusedRotaryPositionEncoding,
|
||||
"fused_rotary_position_encoding function");
|
||||
|
||||
#ifdef ENABLE_SM80_EXT_OPS
|
||||
m.def("multi_head_latent_attention",
|
||||
&MultiHeadLatentAttention,
|
||||
"multi_head_latent_attention function");
|
||||
#endif
|
||||
|
||||
m.def("noaux_tc", &NoauxTc, "noaux_tc for Deepseekv3 MoE compute");
|
||||
|
||||
@@ -1731,6 +1753,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
&get_graph_buffer_ipc_meta,
|
||||
"get_graph_buffer_ipc_meta");
|
||||
|
||||
#ifdef ENABLE_SM80_EXT_OPS
|
||||
m.def("speculate_get_seq_lens_output",
|
||||
&SpeculateGetSeqLensOutput,
|
||||
"speculate_get_seq_lens_output function");
|
||||
@@ -1839,6 +1862,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
m.def("speculate_get_target_logits",
|
||||
&SpeculateGetTargetLogits,
|
||||
"speculate_get_target_logits function");
|
||||
#endif
|
||||
|
||||
m.def("update_attn_mask_offsets",
|
||||
&UpdateAttnMaskOffsets,
|
||||
@@ -1848,7 +1872,9 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
&FusedNeoxRopeEmbedding,
|
||||
"fused_neox_rope_embedding function");
|
||||
|
||||
#ifndef DISABLE_GELU_TANH_OP
|
||||
m.def("gelu_tanh", &GeluTanh, "gelu_tanh function");
|
||||
#endif
|
||||
|
||||
m.def("reasoning_phase_token_constraint",
|
||||
&ReasoningPhaseTokenConstraint,
|
||||
|
||||
Reference in New Issue
Block a user