mirror of https://github.com/PaddlePaddle/FastDeploy.git
[BugFix] fix fused_mask_swiglu_fp8_quant bug (#6316)
* optimize mask_quant op, ~1.5x speedup
* fix calculation sequence
* add fused
* rm log
* push kernel code
* add ut
* accuracy ok
* add ue8m0
* add ut
* merge develop
* rm ut of mask_per_token_quant
* Revert "[Optimize] optimize mask_quant & swiglu (#6222)"
This reverts commit 2ada119a38.
* add block_size
* pre-commit
@@ -414,7 +414,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
             )

         act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.fused_mask_swiglu_fp8_quant(
-            up_gate_proj_out, token_nums_per_expert, use_ue8m0=False
+            up_gate_proj_out, token_nums_per_expert, block_size=128, use_ue8m0=False
         )

         deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked(
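For context, below is a minimal NumPy sketch of what a fused mask + SwiGLU + block-wise FP8 quant op is generally expected to compute, and where the new `block_size` argument fits. The layout assumptions here (gate/up concatenated along the last dimension, per-token scales over `block_size`-column blocks, UE8M0 meaning power-of-two scales, padded rows zeroed) are illustrative only and are not taken from the kernel source.

```python
# Hypothetical NumPy reference for the fused op's semantics; not the actual
# CUDA kernel. All layout details are assumptions made for illustration.
import numpy as np

FP8_E4M3_MAX = 448.0  # largest finite value representable in float8_e4m3


def ref_mask_swiglu_fp8_quant(up_gate_proj_out, token_nums_per_expert,
                              block_size=128, use_ue8m0=False):
    # up_gate_proj_out: [num_experts, max_tokens_per_expert, 2 * hidden]
    num_experts, max_tokens, twice_h = up_gate_proj_out.shape
    h = twice_h // 2

    # Assumed split: first half is the gate projection, second half is up.
    gate, up = up_gate_proj_out[..., :h], up_gate_proj_out[..., h:]
    act = gate / (1.0 + np.exp(-gate)) * up  # SwiGLU: silu(gate) * up

    # One quantization scale per contiguous block of `block_size` columns.
    n_blocks = h // block_size
    blocks = act.reshape(num_experts, max_tokens, n_blocks, block_size)
    amax = np.abs(blocks).max(axis=-1, keepdims=True)
    scale = np.maximum(amax, 1e-4) / FP8_E4M3_MAX  # floor avoids divide-by-zero
    if use_ue8m0:
        # UE8M0: round each scale up to the nearest power of two so it can
        # be stored as a bare 8-bit exponent.
        scale = np.exp2(np.ceil(np.log2(scale)))

    q = np.clip(blocks / scale, -FP8_E4M3_MAX, FP8_E4M3_MAX)  # cast to fp8 on GPU

    # Mask: rows past each expert's real token count are padding; zero them.
    for e, n in enumerate(token_nums_per_expert):
        q[e, n:] = 0
        scale[e, n:] = 0

    return q.reshape(num_experts, max_tokens, h), scale.squeeze(-1)
```

Exposing `block_size` as an explicit parameter (here defaulting to 128, matching the diff) keeps the quantization granularity aligned with what `deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked` expects for its per-block scales, rather than hard-coding it inside the kernel.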