diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 7076e17629..46fdcff740 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -1244,6 +1244,7 @@ void PerTokenGroupQuantFp8(const paddle::Tensor& input, bool scale_ue8m0); PYBIND11_MODULE(fastdeploy_ops, m) { +#ifdef ENABLE_SM80_EXT_OPS m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"), @@ -1266,6 +1267,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("enable_softmax_top_k_fused"), py::arg("redundant_ep_rank_num_plus_one"), "moe export RedundantTopKSelect function"); +#endif /** * open_shm_and_get_meta_signal.cc @@ -1291,9 +1293,11 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("wait_flag"), "get_output_kv_signal function"); +#ifdef ENABLE_SM75_EXT_OPS m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute"); m.def( "moe_deepgemm_depermute", &MoEDeepGEMMDePermute, "MoEDeepGEMMDePermute"); +#endif /** * alloc_cache_pinned.cc * cuda_host_alloc @@ -1307,6 +1311,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def( "cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr")); py::register_exception(m, "CudaError"); +#ifdef ENABLE_SM80_EXT_OPS /** * append_attention.cu * append_attention @@ -1315,11 +1320,13 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("append_attention_with_output", &AppendAttentionWithOutput, "append attention with output function"); +#endif #ifdef ENABLE_FLASH_MASK_ATTENTION m.def("flash_mask_attention", &FlashAttentionMask, "flash_mask_attention"); #endif +#ifdef ENABLE_SM80_EXT_OPS /** * gqa_rope_write_cache.cu * gqa_rope_write_cache @@ -1334,6 +1341,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("pre_cache_len_concat", &PreCacheLenConcat, "pre_cache len concat function"); + /** * moe/fused_moe/fused_moe.cu * fused_moe @@ -1389,6 +1397,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("norm_topk_prob"), py::arg("routed_scaling_factor"), "ep moe export combine 
function"); +#endif m.def("per_token_quant", &PerTokenQuant, @@ -1445,6 +1454,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { "machete supported schedules function"); #endif +#ifdef ENABLE_SM80_EXT_OPS /** * moe/fused_moe/moe_topk_select.cu * moe_topk_select @@ -1486,6 +1496,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("norm_topk_prob"), py::arg("routed_scaling_factor"), "moe export reduce function"); +#endif /** * dequant_int8.cu @@ -1509,6 +1520,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { &OpenShmAndGetMetaSignalFunc, "open_shm_and_get_meta_signal function"); +#ifdef ENABLE_SM80_EXT_OPS /** * append_attn/get_block_shape_and_split_kv_block.cu * get_block_shape_and_split_kv_block @@ -1516,6 +1528,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("get_block_shape_and_split_kv_block", &GetBlockShapeAndSplitKVBlock, "get_block_shape_and_split_kv_block function"); +#endif /** * get_padding_offset.cu @@ -1567,9 +1580,11 @@ PYBIND11_MODULE(fastdeploy_ops, m) { &RecoverDecodeTask, "recover decode task for scheduler v1 function"); +#ifdef ENABLE_SM80_EXT_OPS m.def("group_swiglu_with_masked", &GroupSwigluWithMasked, "group_swiglu_with_masked function"); +#endif m.def("text_image_index_out", &TextImageIndexOut, @@ -1579,7 +1594,9 @@ PYBIND11_MODULE(fastdeploy_ops, m) { &TextImageGatherScatter, "text_image_gather_scatter function"); +#ifdef ENABLE_SM80_EXT_OPS m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func); + m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel); m.def("MoeWna16MarlinGemmApi", @@ -1609,6 +1626,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("use_atomic_add"), py::arg("use_fp32_reduce"), py::arg("is_zp_float")); +#endif m.def("get_position_ids_and_mask_encoder_batch", &GetPositionIdsAndMaskEncoderBatch, @@ -1651,6 +1669,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { py::arg("input"), py::arg("scales"), py::arg("scale_ub")); +#ifdef ENABLE_SM80_EXT_OPS m.def("decode_mla_write_cache", &DecodeMLAWriteCacheKernel, "decode_mla_write_cache 
function"); @@ -1658,14 +1677,17 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("prefill_mla_write_cache", &PrefillMLAWriteCacheKernel, "prefill_mla_write_cache function"); +#endif m.def("fused_rotary_position_encoding", &FusedRotaryPositionEncoding, "fused_rotary_position_encoding function"); +#ifdef ENABLE_SM80_EXT_OPS m.def("multi_head_latent_attention", &MultiHeadLatentAttention, "multi_head_latent_attention function"); +#endif m.def("noaux_tc", &NoauxTc, "noaux_tc for Deepseekv3 MoE compute"); @@ -1731,6 +1753,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { &get_graph_buffer_ipc_meta, "get_graph_buffer_ipc_meta"); +#ifdef ENABLE_SM80_EXT_OPS m.def("speculate_get_seq_lens_output", &SpeculateGetSeqLensOutput, "speculate_get_seq_lens_output function"); @@ -1839,6 +1862,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("speculate_get_target_logits", &SpeculateGetTargetLogits, "speculate_get_target_logits function"); +#endif m.def("update_attn_mask_offsets", &UpdateAttnMaskOffsets, @@ -1848,7 +1872,9 @@ PYBIND11_MODULE(fastdeploy_ops, m) { &FusedNeoxRopeEmbedding, "fused_neox_rope_embedding function"); +#ifndef DISABLE_GELU_TANH_OP m.def("gelu_tanh", &GeluTanh, "gelu_tanh function"); +#endif m.def("reasoning_phase_token_constraint", &ReasoningPhaseTokenConstraint, diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py index 2e53012bb3..109028af6e 100644 --- a/custom_ops/setup_ops.py +++ b/custom_ops/setup_ops.py @@ -179,6 +179,32 @@ def get_gencode_flags(archs): return flags +def get_compile_parallelism(): + """ + Decide safe compile parallelism for both build workers and nvcc threads. + """ + cpu_count = os.cpu_count() or 1 + + max_jobs_env = os.getenv("MAX_JOBS") + if max_jobs_env is not None: + try: + max_jobs = int(max_jobs_env) + if max_jobs < 1: + raise ValueError + except ValueError as exc: + raise ValueError(f"Invalid MAX_JOBS={max_jobs_env!r}, expected a positive integer.") from exc + else: + # Cap default build workers to avoid OOM in high-core CI runners. 
+ max_jobs = min(cpu_count, 32) + os.environ["MAX_JOBS"] = str(max_jobs) + + # Limit nvcc internal threads to avoid resource exhaustion when Paddle's + # ThreadPoolExecutor also launches many parallel compilations. + # Total threads ~= (number of parallel compile jobs) * nvcc_threads. + nvcc_threads = min(max_jobs, 4) + return max_jobs, nvcc_threads + + def find_end_files(directory, end_str): """ Find files with end str in directory. @@ -313,6 +339,11 @@ elif paddle.is_compiled_with_cuda(): "gpu_ops/reasoning_phase_token_constraint.cu", "gpu_ops/get_attn_mask_q.cu", ] + sm_versions = get_sm_version(archs) + # Some kernels in this file require SM75+ instructions. Exclude them when building SM70 (V100). + disable_gelu_tanh = 70 in sm_versions + if disable_gelu_tanh: + sources = [s for s in sources if s != "gpu_ops/gelu_tanh.cu"] # pd_disaggregation sources += [ @@ -352,6 +383,9 @@ elif paddle.is_compiled_with_cuda(): cc_compile_args = [] nvcc_compile_args = get_gencode_flags(archs) + if disable_gelu_tanh: + cc_compile_args += ["-DDISABLE_GELU_TANH_OP"] + nvcc_compile_args += ["-DDISABLE_GELU_TANH_OP"] nvcc_compile_args += ["-DPADDLE_DEV"] nvcc_compile_args += ["-DPADDLE_ON_INFERENCE"] nvcc_compile_args += ["-DPy_LIMITED_API=0x03090000"] @@ -363,10 +397,8 @@ elif paddle.is_compiled_with_cuda(): "-Igpu_ops", "-Ithird_party/nlohmann_json/include", ] - # Limit nvcc internal threads to avoid resource exhaustion when Paddle's - # ThreadPoolExecutor also launches many parallel compilations. - # Total threads ≈ (number of parallel compile jobs) × nvcc_threads, so cap nvcc_threads at 4. 
- nvcc_threads = min(os.cpu_count() or 1, 4) + max_jobs, nvcc_threads = get_compile_parallelism() + print(f"MAX_JOBS = {max_jobs}, nvcc -t = {nvcc_threads}") nvcc_compile_args += ["-t", str(nvcc_threads)] nvcc_version = get_nvcc_version() @@ -379,14 +411,16 @@ elif paddle.is_compiled_with_cuda(): if nvcc_version >= 12.0: sources += ["gpu_ops/sample_kernels/air_top_p_sampling.cu"] - cc = max(get_sm_version(archs)) + cc = max(sm_versions) print(f"cc = {cc}") fp8_auto_gen_directory = "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen" if os.path.isdir(fp8_auto_gen_directory): shutil.rmtree(fp8_auto_gen_directory) if cc >= 75: + cc_compile_args += ["-DENABLE_SM75_EXT_OPS"] nvcc_compile_args += [ + "-DENABLE_SM75_EXT_OPS", "-DENABLE_SCALED_MM_C2X=1", "-Igpu_ops/cutlass_kernels/w8a8", ] @@ -394,9 +428,14 @@ elif paddle.is_compiled_with_cuda(): "gpu_ops/cutlass_kernels/w8a8/scaled_mm_entry.cu", "gpu_ops/cutlass_kernels/w8a8/scaled_mm_c2x.cu", "gpu_ops/quantization/common.cu", + # cpp_extensions.cc registers these two ops only under ENABLE_SM75_EXT_OPS (cc >= 75); build their kernels for SM75+ as well. + "gpu_ops/moe/moe_deepgemm_permute.cu", + "gpu_ops/moe/moe_deepgemm_depermute.cu", ] if cc >= 80: + cc_compile_args += ["-DENABLE_SM80_EXT_OPS"] + nvcc_compile_args += ["-DENABLE_SM80_EXT_OPS"] # append_attention os.system( "python utils/auto_gen_template_instantiation.py --config gpu_ops/append_attn/template_config.json --output gpu_ops/append_attn/template_instantiation/autogen" ) @@ -519,6 +558,10 @@ elif paddle.is_compiled_with_cuda(): sources += find_end_files("gpu_ops/machete", ".cu") cc_compile_args += ["-DENABLE_MACHETE"] + # Deduplicate translation units while preserving order. Some files are + # appended explicitly for SM75 and also discovered by later directory globs. + sources = list(dict.fromkeys(sources)) + setup( name="fastdeploy_ops", ext_modules=CUDAExtension(