cuda13.0, implement changes to CCCL (#6751)

2026-04-24 09:44:10 +08:00 · 2026-03-10 16:47:02 +08:00
parent 54581b8653
commit b57c960837
13 changed files with 211 additions and 27 deletions
@@ -209,7 +209,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
  using BlockReduce = cub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStorage;
  float const block_absmax_val_maybe =
-      BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
+      BlockReduce(reduceStorage)
+          .Reduce(absmax_val, fd_cub_compat::Max{}, blockDim.x);
  __shared__ float token_scale;
  if (tid == 0) {
    if (scale_ub > 0) {