Revert "【New Feature】W4afp8 supports per group quantization (#4272)" (#4854)

This reverts commit 93fcf7e4ec.
This commit is contained in:
YuBaoku
2025-11-06 17:48:28 +08:00
committed by GitHub
parent 3478d20262
commit 819b2dbbae
26 changed files with 1718 additions and 4378 deletions
+24 -24
View File
@@ -18,30 +18,30 @@
#include <vector>
#include "helper.h"
// NOTE(review): this span is a rendered diff hunk from a revert commit with the
// +/- markers stripped, NOT a single valid declaration — two parameter lists
// for W4AFp8Gemm follow one shared opening line. The first list (through the
// first ");") appears to be the per-group-quantization signature being REMOVED
// (it carries the optional input_dequant_scale); the second list appears to be
// the signature RESTORED by the revert (it carries input_row_sum instead).
// Confirm against the actual header file before reusing either form.
std::vector<paddle::Tensor> W4AFp8Gemm(
// -- variant removed by the revert (per-group quantization, #4272) --
const paddle::Tensor& input,
const paddle::Tensor& weight,
const paddle::Tensor&
tokens, // If tokenpadding=0, this tensor represents the prefix sum of
// tensors, otherwise it represents the number of tokens in
// each group
const paddle::Tensor& weight_scale,
const paddle::optional<paddle::Tensor>& input_dequant_scale,
const int64_t token_padding_size,
const int64_t max_tokens,
const bool is_bfloat16);
// -- variant restored by the revert (pre-#4272 signature) --
const paddle::Tensor& input,
const paddle::Tensor& weight,
const paddle::Tensor& tokens, // If tokenpadding=0, this tensor represents the prefix sum of tensors, otherwise it represents the number of tokens in each group
const paddle::Tensor& input_row_sum,
const paddle::Tensor& weight_scale,
const int64_t token_padding_size,
const int64_t max_tokens,
const bool is_bfloat16);
// NOTE(review): as above, this is a diff hunk with markers stripped — the
// template header is followed by TWO complete declarations of
// DisPatchW4AFp8GemmWrapper. The first (with input_dequant_scale and the
// WeightScaleGroup parameter) appears to be the per-group-quant variant being
// REMOVED; the second (with input_row_sum and row_scale, no group parameter)
// appears to be the variant RESTORED by the revert. Only one can exist in the
// real header — verify before editing.
template <typename InputType, typename OutputType>
// -- variant removed by the revert (per-group quantization, #4272) --
void DisPatchW4AFp8GemmWrapper(const InputType* input,
const InputType* weight,
const int64_t* tokens,
const float* input_dequant_scale,
const float* weight_scale,
OutputType* out,
const int64_t token_padding_size,
const int64_t max_tokens,
const int num_experts,
const int64_t M,
const int64_t K,
const int WeightScaleGroup,
cudaStream_t stream);
// -- variant restored by the revert (pre-#4272 signature) --
void DisPatchW4AFp8GemmWrapper(
const InputType* input,
const InputType* weight,
const int64_t * tokens,
const float * input_row_sum,
const float * row_scale,
const float * weight_scale,
OutputType * out,
const int64_t token_padding_size,
const int64_t max_tokens,
const int num_experts,
const int64_t M,
const int64_t K,
cudaStream_t stream);