CUDA 13.0: adapt to CCCL 3.0 API changes (#6751)

2026-04-23 17:11:21 +08:00 · 2026-03-10 16:47:02 +08:00
parent 54581b8653
commit b57c960837
13 changed files with 211 additions and 27 deletions
@@ -209,7 +209,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStorage;
  float const block_absmax_val_maybe =
-      BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
+      BlockReduce(reduceStorage)
+          .Reduce(absmax_val, fd_cub_compat::Max{}, blockDim.x);
  __shared__ float token_scale;
  if (tid == 0) {
    if (scale_ub > 0) {
@@ -9,6 +9,8 @@
 #include <cub/cub.cuh>
 #include <cuda_runtime.h>

+#include "../cccl_compat.h"  // CCCL 3.0 compatibility
+
 namespace fastdeploy {

 // Vectorization containers