mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
@@ -220,7 +220,7 @@ std::vector<paddle::DataType> CutlassFp8Fp8HalfBlockGemmFusedInferDtype(
|
||||
}
|
||||
|
||||
PD_BUILD_STATIC_OP(cutlass_fp8_fp8_half_block_gemm_fused)
|
||||
.Inputs({"x", "y", "x_sacle", "y_scale", paddle::Optional("bias")})
|
||||
.Inputs({"x", "y", "x_scale", "y_scale", paddle::Optional("bias")})
|
||||
.Attrs({"transpose_x: bool",
|
||||
"transpose_y: bool",
|
||||
"output_dtype: std::string",
|
||||
|
||||
@@ -694,9 +694,9 @@ __global__ void quant2d_per_channel_cached(
|
||||
constexpr int PINGPONG_LM_LEN = 512;
|
||||
constexpr int LM_LEN = PINGPONG_LM_LEN * 2;
|
||||
constexpr int INPUT_BUF_LEN = 4 * 1024;
|
||||
constexpr int OUPUT_BUF_LEN = 2 * 1024;
|
||||
constexpr int OUTPUT_BUF_LEN = 2 * 1024;
|
||||
constexpr int INPUT_MAX_LEN = INPUT_BUF_LEN / sizeof(TX);
|
||||
constexpr int OUTPUT_MAX_LEN = OUPUT_BUF_LEN / sizeof(TY);
|
||||
constexpr int OUTPUT_MAX_LEN = OUTPUT_BUF_LEN / sizeof(TY);
|
||||
constexpr int INPUT_SIZE =
|
||||
INPUT_MAX_LEN < OUTPUT_MAX_LEN ? INPUT_MAX_LEN : OUTPUT_MAX_LEN;
|
||||
__simd__ __shared__ float
|
||||
|
||||
Reference in New Issue
Block a user