mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 17:11:21 +08:00
cuda13.0, implement changes to CCCL (#6751)
This commit is contained in:
@@ -209,7 +209,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStorage;
|
||||
float const block_absmax_val_maybe =
|
||||
-      BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
+      BlockReduce(reduceStorage)
+          .Reduce(absmax_val, fd_cub_compat::Max{}, blockDim.x);
|
||||
__shared__ float token_scale;
|
||||
if (tid == 0) {
|
||||
if (scale_ub > 0) {
|
||||
|
||||
@@ -9,6 +9,8 @@
|
||||
#include <cub/cub.cuh>
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
+#include "../cccl_compat.h"  // CCCL 3.0 compatibility
|
||||
|
||||
namespace fastdeploy {
|
||||
|
||||
// Vectorization containers
|
||||
|
||||
Reference in New Issue
Block a user