[Optimize] optimize mask_quant & swiglu (#6222)

* optimize mask_quant op, speed up 1.5x

* fix calculate sequence

* add fused

* rm log

* push kernel code

* add ut

* accuracy ok

* add ue8m0

* add ut

* add merge develop

* rm ut of mask_per_token_quant
This commit is contained in:
fxyfxy777
2026-02-02 13:52:38 +08:00
committed by GitHub
parent 25656455ee
commit 2ada119a38
7 changed files with 555 additions and 452 deletions
+10 -7
View File
@@ -303,10 +303,12 @@ std::vector<paddle::Tensor> PerTokenQuant(paddle::Tensor& input,
const int block_size);
std::vector<paddle::Tensor> PerTokenQuantPadding(paddle::Tensor& input,
const int block_size);
std::vector<paddle::Tensor> MaskedPerTokenQuant(
std::vector<paddle::Tensor> FusedMaskSwigluFP8Quant(
paddle::Tensor& input,
paddle::Tensor& recv_expert_count,
const int block_size);
paddle::Tensor& token_nums_per_expert,
const int block_size,
const bool use_ue8m0);
std::vector<paddle::Tensor> EPMoeExpertCombine(
const paddle::Tensor& ffn_out,
@@ -1267,12 +1269,13 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
py::arg("routed_scaling_factor"),
"ep moe export combine function");
m.def("masked_per_token_quant",
&MaskedPerTokenQuant,
m.def("fused_mask_swiglu_fp8_quant",
&FusedMaskSwigluFP8Quant,
py::arg("input"),
py::arg("recv_expert_count"),
py::arg("token_nums_per_expert"),
py::arg("block_size"),
"per token per block quant");
py::arg("use_ue8m0") = false,
"fused mask swiglu and fp8 quant");
#ifdef ENABLE_MACHETE
/*machete/machete_mm.cu