mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 08:21:53 +08:00
[Optimize] optimize mask_quant & swiglu (#6222)
* optimize mask_quant op (~1.5x speed-up) * fix sequence calculation * add fused kernel * remove logging * push kernel code * add unit tests * accuracy verified * add ue8m0 support * add unit tests * merge develop * remove unit tests for mask_per_token_quant
This commit is contained in:
@@ -303,10 +303,12 @@ std::vector<paddle::Tensor> PerTokenQuant(paddle::Tensor& input,
                                           const int block_size);
 
 std::vector<paddle::Tensor> PerTokenQuantPadding(paddle::Tensor& input,
                                                  const int block_size);
 
-std::vector<paddle::Tensor> MaskedPerTokenQuant(
-    paddle::Tensor& input,
-    paddle::Tensor& recv_expert_count,
-    const int block_size);
+std::vector<paddle::Tensor> FusedMaskSwigluFP8Quant(
+    paddle::Tensor& input,
+    paddle::Tensor& recv_expert_count,
+    paddle::Tensor& token_nums_per_expert,
+    const int block_size,
+    const bool use_ue8m0);
 
 std::vector<paddle::Tensor> EPMoeExpertCombine(
     const paddle::Tensor& ffn_out,
@@ -1267,12 +1269,13 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
         py::arg("routed_scaling_factor"),
         "ep moe export combine function");
 
-  m.def("masked_per_token_quant",
-        &MaskedPerTokenQuant,
+  m.def("fused_mask_swiglu_fp8_quant",
+        &FusedMaskSwigluFP8Quant,
         py::arg("input"),
         py::arg("recv_expert_count"),
+        py::arg("token_nums_per_expert"),
         py::arg("block_size"),
-        "per token per block quant");
+        py::arg("use_ue8m0") = false,
+        "fused mask swiglu and fp8 quant");
 
 #ifdef ENABLE_MACHETE
   /*machete/machete_mm.cu
Reference in New Issue
Block a user