supports dynamic Cfp8 (#3767)

* supports dynamic Cfp8
* add unit test
2026-04-23 17:11:21 +08:00 · 2025-09-08 11:41:29 +08:00
parent b5e20e3015
commit af49b81ffd
20 changed files with 1417 additions and 225 deletions
@@ -56,6 +56,7 @@ CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, false>(
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);

@@ -103,5 +104,6 @@ CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::bfloat16, true>(
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);
@@ -54,6 +54,7 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);

@@ -98,5 +99,6 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, paddle::float8_e4
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);
@@ -54,6 +54,7 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, false>(
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);

@@ -100,5 +101,6 @@ template void CascadeAppendAttentionC8Kernel<paddle::bfloat16, int8_t, true>(
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);
@@ -54,6 +54,7 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, f
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);

@@ -100,5 +101,6 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float16, t
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);
@@ -54,6 +54,7 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);

@@ -99,5 +100,6 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, paddle::float8_e4m
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);
@@ -54,6 +54,7 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, false>(
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);

@@ -99,5 +100,6 @@ template void CascadeAppendAttentionC8Kernel<paddle::float16, int8_t, true>(
    const bool causal,
    const bool is_decoder,
    const bool enable_prefill,
+    const std::string& cache_quant_type_str,
    cudaStream_t& stream,
    paddle::Tensor* out);