Support CUDA 13.0: adapt code to CCCL API changes (#6751)

This commit is contained in:
wangyifei
2026-03-10 16:47:02 +08:00
committed by GitHub
parent 54581b8653
commit b57c960837
13 changed files with 211 additions and 27 deletions
+2 -1
View File
@@ -209,7 +209,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
using BlockReduce = cub::BlockReduce<float, 1024>;
__shared__ typename BlockReduce::TempStorage reduceStorage;
float const block_absmax_val_maybe =
-BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
+BlockReduce(reduceStorage)
+.Reduce(absmax_val, fd_cub_compat::Max{}, blockDim.x);
__shared__ float token_scale;
if (tid == 0) {
if (scale_ub > 0) {
@@ -9,6 +9,8 @@
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include "../cccl_compat.h" // CCCL 3.0 compatibility
namespace fastdeploy {
// Vectorization containers