mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-24 09:44:10 +08:00
cuda13.0, implement changes to CCCL (#6751)
This commit is contained in:
@@ -209,7 +209,8 @@ __global__ void dynamic_per_token_scaled_fp8_quant_kernel(
|
||||
using BlockReduce = cub::BlockReduce<float, 1024>;
|
||||
__shared__ typename BlockReduce::TempStorage reduceStorage;
|
||||
float const block_absmax_val_maybe =
|
||||
BlockReduce(reduceStorage).Reduce(absmax_val, cub::Max{}, blockDim.x);
|
||||
BlockReduce(reduceStorage)
|
||||
.Reduce(absmax_val, fd_cub_compat::Max{}, blockDim.x);
|
||||
__shared__ float token_scale;
|
||||
if (tid == 0) {
|
||||
if (scale_ub > 0) {
|
||||
|
||||
Reference in New Issue
Block a user