Revert "【New Feature】W4afp8 supports per group quantization (#4272)" (#4854)

This reverts commit 93fcf7e4ec.
This commit is contained in:
YuBaoku
2025-11-06 17:48:28 +08:00
committed by GitHub
parent 3478d20262
commit 819b2dbbae
26 changed files with 1718 additions and 4378 deletions
+24 -24
View File
@@ -18,30 +18,30 @@
#include <vector>
#include "helper.h"
// NOTE(review): this span is a rendered diff hunk from a revert commit with the
// +/- markers stripped, NOT a single valid declaration — two parameter lists
// for W4AFp8Gemm follow one shared opening line. The first list (through the
// first ");") appears to be the per-group-quantization signature being REMOVED
// (it carries the optional input_dequant_scale); the second list appears to be
// the signature RESTORED by the revert (it carries input_row_sum instead).
// Confirm against the actual header file before reusing either form.
std::vector<paddle::Tensor> W4AFp8Gemm(
// -- variant removed by the revert (per-group quantization, #4272) --
const paddle::Tensor& input,
const paddle::Tensor& weight,
const paddle::Tensor&
tokens, // If tokenpadding=0, this tensor represents the prefix sum of
// tensors, otherwise it represents the number of tokens in
// each group
const paddle::Tensor& weight_scale,
const paddle::optional<paddle::Tensor>& input_dequant_scale,
const int64_t token_padding_size,
const int64_t max_tokens,
const bool is_bfloat16);
// -- variant restored by the revert (pre-#4272 signature) --
const paddle::Tensor& input,
const paddle::Tensor& weight,
const paddle::Tensor& tokens, // If tokenpadding=0, this tensor represents the prefix sum of tensors, otherwise it represents the number of tokens in each group
const paddle::Tensor& input_row_sum,
const paddle::Tensor& weight_scale,
const int64_t token_padding_size,
const int64_t max_tokens,
const bool is_bfloat16);
// NOTE(review): as above, this is a diff hunk with markers stripped — the
// template header is followed by TWO complete declarations of
// DisPatchW4AFp8GemmWrapper. The first (with input_dequant_scale and the
// WeightScaleGroup parameter) appears to be the per-group-quant variant being
// REMOVED; the second (with input_row_sum and row_scale, no group parameter)
// appears to be the variant RESTORED by the revert. Only one can exist in the
// real header — verify before editing.
template <typename InputType, typename OutputType>
// -- variant removed by the revert (per-group quantization, #4272) --
void DisPatchW4AFp8GemmWrapper(const InputType* input,
const InputType* weight,
const int64_t* tokens,
const float* input_dequant_scale,
const float* weight_scale,
OutputType* out,
const int64_t token_padding_size,
const int64_t max_tokens,
const int num_experts,
const int64_t M,
const int64_t K,
const int WeightScaleGroup,
cudaStream_t stream);
// -- variant restored by the revert (pre-#4272 signature) --
void DisPatchW4AFp8GemmWrapper(
const InputType* input,
const InputType* weight,
const int64_t * tokens,
const float * input_row_sum,
const float * row_scale,
const float * weight_scale,
OutputType * out,
const int64_t token_padding_size,
const int64_t max_tokens,
const int num_experts,
const int64_t M,
const int64_t K,
cudaStream_t stream);