[Optimization][OP]support per_token_group_fp8_quant cuda kernel (#6865)

* support per_token_group_fp8_quant cuda kernel

* Potential fix for a code-review finding on the pull request

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>

* update code

---------

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
AIbin
2026-03-17 19:17:51 +08:00
committed by GitHub
parent b61731bb96
commit cb6819d086
3 changed files with 733 additions and 25 deletions
+13
View File
@@ -1232,6 +1232,15 @@ std::vector<paddle::Tensor> CpGatherIndexerKQuantCacheKernel(
const paddle::Tensor& block_table,
const paddle::Tensor& cu_seq_lens);
// Forward declaration of the per-token-group FP8 quantization entry point
// (implemented in a separate CUDA source; exposed to Python below as
// "per_token_group_fp8_quant").
//
// Parameters:
//   input      - source tensor to be quantized (read-only).
//   output_q   - destination tensor receiving the FP8-quantized values.
//   output_s   - destination tensor receiving the per-group scale factors.
//   group_size - number of elements quantized together under one scale.
//   eps        - small epsilon, presumably to avoid division by zero when a
//                group's max is 0 — confirm against the kernel implementation.
//   fp8_min    - lower clamp bound of the target FP8 representable range.
//   fp8_max    - upper clamp bound of the target FP8 representable range.
//   scale_ue8m0 - NOTE(review): presumably selects UE8M0-encoded scales;
//                 verify against the kernel implementation.
void PerTokenGroupQuantFp8(const paddle::Tensor& input,
paddle::Tensor& output_q,
paddle::Tensor& output_s,
int64_t group_size,
double eps,
double fp8_min,
double fp8_max,
bool scale_ue8m0);
PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("get_expert_token_num",
&GetExpertTokenNum,
@@ -1878,4 +1887,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
// Expose the CP gather-indexer K-quant-cache kernel to Python.
m.def("cp_gather_indexer_k_quant_cache", &CpGatherIndexerKQuantCacheKernel,
      "cp_gather_indexer_k_quant_cache");
// Expose the per-token-group FP8 quantization kernel to Python.
// Fix: the help string previously read "per_token_group_quant_fp8",
// which did not match the bound name; make them consistent. The bound
// name itself is unchanged, so existing Python callers are unaffected.
m.def("per_token_group_fp8_quant",
      &PerTokenGroupQuantFp8,
      "per_token_group_fp8_quant");
}