diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 7076e17629..46fdcff740 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -1244,6 +1244,7 @@ void PerTokenGroupQuantFp8(const paddle::Tensor& input, bool scale_ue8m0); PYBIND11_MODULE(fastdeploy_ops, m) { +#ifdef ENABLE_SM80_EXT_OPS m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"), @@ -1266,6 +1267,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("enable_softmax_top_k_fused"), py::arg("redundant_ep_rank_num_plus_one"), "moe export RedundantTopKSelect function"); +#endif /** * open_shm_and_get_meta_signal.cc @@ -1291,9 +1293,11 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("wait_flag"), "get_output_kv_signal function"); +#ifdef ENABLE_SM75_EXT_OPS m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute"); m.def( "moe_deepgemm_depermute", &MoEDeepGEMMDePermute, "MoEDeepGEMMDePermute"); +#endif /** * alloc_cache_pinned.cc * cuda_host_alloc @@ -1307,6 +1311,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def( "cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr")); py::register_exception(m, "CudaError"); +#ifdef ENABLE_SM80_EXT_OPS /** * append_attention.cu * append_attention @@ -1315,11 +1320,13 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("append_attention_with_output", &AppendAttentionWithOutput, "append attention with output function"); +#endif #ifdef ENABLE_FLASH_MASK_ATTENTION m.def("flash_mask_attention", &FlashAttentionMask, "flash_mask_attention"); #endif +#ifdef ENABLE_SM80_EXT_OPS /** * gqa_rope_write_cache.cu * gqa_rope_write_cache @@ -1334,6 +1341,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("pre_cache_len_concat", &PreCacheLenConcat, "pre_cache len concat function"); + /** * moe/fused_moe/fused_moe.cu * fused_moe @@ -1389,6 +1397,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("norm_topk_prob"), py::arg("routed_scaling_factor"), "ep moe export combine 
function"); +#endif m.def("per_token_quant", &PerTokenQuant, @@ -1445,6 +1454,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { "machete supported schedules function"); #endif +#ifdef ENABLE_SM80_EXT_OPS /** * moe/fused_moe/moe_topk_select.cu * moe_topk_select @@ -1486,6 +1496,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("norm_topk_prob"), py::arg("routed_scaling_factor"), "moe export reduce function"); +#endif /** * dequant_int8.cu @@ -1509,6 +1520,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { &OpenShmAndGetMetaSignalFunc, "open_shm_and_get_meta_signal function"); +#ifdef ENABLE_SM80_EXT_OPS /** * append_attn/get_block_shape_and_split_kv_block.cu * get_block_shape_and_split_kv_block @@ -1516,6 +1528,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("get_block_shape_and_split_kv_block", &GetBlockShapeAndSplitKVBlock, "get_block_shape_and_split_kv_block function"); +#endif /** * get_padding_offset.cu @@ -1567,9 +1580,11 @@ PYBIND11_MODULE(fastdeploy_ops, m) { &RecoverDecodeTask, "recover decode task for scheduler v1 function"); +#ifdef ENABLE_SM80_EXT_OPS m.def("group_swiglu_with_masked", &GroupSwigluWithMasked, "group_swiglu_with_masked function"); +#endif m.def("text_image_index_out", &TextImageIndexOut, @@ -1579,7 +1594,9 @@ PYBIND11_MODULE(fastdeploy_ops, m) { &TextImageGatherScatter, "text_image_gather_scatter function"); +#ifdef ENABLE_SM80_EXT_OPS m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func); + m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel); m.def("MoeWna16MarlinGemmApi", @@ -1609,6 +1626,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("use_atomic_add"), py::arg("use_fp32_reduce"), py::arg("is_zp_float")); +#endif m.def("get_position_ids_and_mask_encoder_batch", &GetPositionIdsAndMaskEncoderBatch, @@ -1651,6 +1669,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("input"), py::arg("scales"), py::arg("scale_ub")); +#ifdef ENABLE_SM80_EXT_OPS m.def("decode_mla_write_cache", &DecodeMLAWriteCacheKernel, "decode_mla_write_cache 
function"); @@ -1658,14 +1677,17 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("prefill_mla_write_cache", &PrefillMLAWriteCacheKernel, "prefill_mla_write_cache function"); +#endif m.def("fused_rotary_position_encoding", &FusedRotaryPositionEncoding, "fused_rotary_position_encoding function"); +#ifdef ENABLE_SM80_EXT_OPS m.def("multi_head_latent_attention", &MultiHeadLatentAttention, "multi_head_latent_attention function"); +#endif m.def("noaux_tc", &NoauxTc, "noaux_tc for Deepseekv3 MoE compute"); @@ -1731,6 +1753,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { &get_graph_buffer_ipc_meta, "get_graph_buffer_ipc_meta"); +#ifdef ENABLE_SM80_EXT_OPS m.def("speculate_get_seq_lens_output", &SpeculateGetSeqLensOutput, "speculate_get_seq_lens_output function"); @@ -1839,6 +1862,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("speculate_get_target_logits", &SpeculateGetTargetLogits, "speculate_get_target_logits function"); +#endif m.def("update_attn_mask_offsets", &UpdateAttnMaskOffsets, @@ -1848,7 +1872,9 @@ PYBIND11_MODULE(fastdeploy_ops, m) { &FusedNeoxRopeEmbedding, "fused_neox_rope_embedding function"); +#ifndef DISABLE_GELU_TANH_OP m.def("gelu_tanh", &GeluTanh, "gelu_tanh function"); +#endif m.def("reasoning_phase_token_constraint", &ReasoningPhaseTokenConstraint, diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py index 2e53012bb3..109028af6e 100644 --- a/custom_ops/setup_ops.py +++ b/custom_ops/setup_ops.py @@ -179,6 +179,32 @@ def get_gencode_flags(archs): return flags +def get_compile_parallelism(): + """ + Decide safe compile parallelism for both build workers and nvcc threads. + """ + cpu_count = os.cpu_count() or 1 + + max_jobs_env = os.getenv("MAX_JOBS") + if max_jobs_env is not None: + try: + max_jobs = int(max_jobs_env) + if max_jobs < 1: + raise ValueError + except ValueError as exc: + raise ValueError(f"Invalid MAX_JOBS={max_jobs_env!r}, expected a positive integer.") from exc + else: + # Cap default build workers to avoid OOM in high-core CI runners. 
+ max_jobs = min(cpu_count, 32) + os.environ["MAX_JOBS"] = str(max_jobs) + + # Limit nvcc internal threads to avoid resource exhaustion when Paddle's + # ThreadPoolExecutor also launches many parallel compilations. + # Total threads ~= (number of parallel compile jobs) * nvcc_threads. + nvcc_threads = min(max_jobs, 4) + return max_jobs, nvcc_threads + + def find_end_files(directory, end_str): """ Find files with end str in directory. @@ -313,6 +339,11 @@ elif paddle.is_compiled_with_cuda(): "gpu_ops/reasoning_phase_token_constraint.cu", "gpu_ops/get_attn_mask_q.cu", ] + sm_versions = get_sm_version(archs) + # Some kernels in this file require SM75+ instructions. Exclude them when building SM70 (V100). + disable_gelu_tanh = 70 in sm_versions + if disable_gelu_tanh: + sources = [s for s in sources if s != "gpu_ops/gelu_tanh.cu"] # pd_disaggregation sources += [ @@ -352,6 +383,9 @@ elif paddle.is_compiled_with_cuda(): cc_compile_args = [] nvcc_compile_args = get_gencode_flags(archs) + if disable_gelu_tanh: + cc_compile_args += ["-DDISABLE_GELU_TANH_OP"] + nvcc_compile_args += ["-DDISABLE_GELU_TANH_OP"] nvcc_compile_args += ["-DPADDLE_DEV"] nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"] nvcc_compile_args += ["-DPy_LIMITED_API=0x03090000"] @@ -363,10 +397,8 @@ elif paddle.is_compiled_with_cuda(): "-Igpu_ops", "-Ithird_party/nlohmann_json/include", ] - # Limit nvcc internal threads to avoid resource exhaustion when Paddle's - # ThreadPoolExecutor also launches many parallel compilations. - # Total threads ≈ (number of parallel compile jobs) × nvcc_threads, so cap nvcc_threads at 4. 
- nvcc_threads = min(os.cpu_count() or 1, 4) + max_jobs, nvcc_threads = get_compile_parallelism() + print(f"MAX_JOBS = {max_jobs}, nvcc -t = {nvcc_threads}") nvcc_compile_args += ["-t", str(nvcc_threads)] nvcc_version = get_nvcc_version() @@ -379,14 +411,16 @@ elif paddle.is_compiled_with_cuda(): if nvcc_version >= 12.0: sources += ["gpu_ops/sample_kernels/air_top_p_sampling.cu"] - cc = max(get_sm_version(archs)) + cc = max(sm_versions) print(f"cc = {cc}") fp8_auto_gen_directory = "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen" if os.path.isdir(fp8_auto_gen_directory): shutil.rmtree(fp8_auto_gen_directory) if cc >= 75: + cc_compile_args += ["-DENABLE_SM75_EXT_OPS"] nvcc_compile_args += [ + "-DENABLE_SM75_EXT_OPS", "-DENABLE_SCALED_MM_C2X=1", "-Igpu_ops/cutlass_kernels/w8a8", ] @@ -394,9 +428,14 @@ elif paddle.is_compiled_with_cuda(): "gpu_ops/cutlass_kernels/w8a8/scaled_mm_entry.cu", "gpu_ops/cutlass_kernels/w8a8/scaled_mm_c2x.cu", "gpu_ops/quantization/common.cu", + # cpp_extensions.cc registers these two ops only under ENABLE_SM75_EXT_OPS (cc >= 75); build their kernels for SM75+ as well. + "gpu_ops/moe/moe_deepgemm_permute.cu", + "gpu_ops/moe/moe_deepgemm_depermute.cu", ] if cc >= 80: + cc_compile_args += ["-DENABLE_SM80_EXT_OPS"] + nvcc_compile_args += ["-DENABLE_SM80_EXT_OPS"] # append_attention os.system( "python utils/auto_gen_template_instantiation.py --config gpu_ops/append_attn/template_config.json --output gpu_ops/append_attn/template_instantiation/autogen" ) @@ -519,6 +558,10 @@ elif paddle.is_compiled_with_cuda(): sources += find_end_files("gpu_ops/machete", ".cu") cc_compile_args += ["-DENABLE_MACHETE"] + # Deduplicate translation units while preserving order. Some files are + # appended explicitly for SM75 and also discovered by later directory globs. + sources = list(dict.fromkeys(sources)) + setup( name="fastdeploy_ops", ext_modules=CUDAExtension(