[Optimization][OP]support per_token_group_fp8_quant cuda kernel (#6865)

* support per_token_group_fp8_quant cuda kernel

* Potential fix for a code-review finding on the pull request

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>

* update code

---------

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
AIbin
2026-03-17 19:17:51 +08:00
committed by GitHub
parent b61731bb96
commit cb6819d086
3 changed files with 733 additions and 25 deletions
+13
View File
@@ -1232,6 +1232,15 @@ std::vector<paddle::Tensor> CpGatherIndexerKQuantCacheKernel(
const paddle::Tensor& block_table,
const paddle::Tensor& cu_seq_lens);
// Forward declaration of the per-token-group FP8 quantization entry point
// (implemented in a separate CUDA source; exposed to Python below as
// "per_token_group_fp8_quant").
//
// Parameters:
//   input      - source tensor to be quantized (read-only).
//   output_q   - destination tensor receiving the FP8-quantized values.
//   output_s   - destination tensor receiving the per-group scale factors.
//   group_size - number of elements quantized together under one scale.
//   eps        - small epsilon, presumably to avoid division by zero when a
//                group's max is 0 — confirm against the kernel implementation.
//   fp8_min    - lower clamp bound of the target FP8 representable range.
//   fp8_max    - upper clamp bound of the target FP8 representable range.
//   scale_ue8m0 - NOTE(review): presumably selects UE8M0-encoded scales;
//                 verify against the kernel implementation.
void PerTokenGroupQuantFp8(const paddle::Tensor& input,
paddle::Tensor& output_q,
paddle::Tensor& output_s,
int64_t group_size,
double eps,
double fp8_min,
double fp8_max,
bool scale_ue8m0);
PYBIND11_MODULE(fastdeploy_ops, m) {
m.def("get_expert_token_num",
&GetExpertTokenNum,
@@ -1878,4 +1887,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
// Expose the CP gather-indexer K-quant-cache kernel to Python.
m.def("cp_gather_indexer_k_quant_cache", &CpGatherIndexerKQuantCacheKernel,
      "cp_gather_indexer_k_quant_cache");
// Expose the per-token-group FP8 quantization kernel to Python.
// Fix: the help string previously read "per_token_group_quant_fp8",
// which did not match the bound name; make them consistent. The bound
// name itself is unchanged, so existing Python callers are unaffected.
m.def("per_token_group_fp8_quant",
      &PerTokenGroupQuantFp8,
      "per_token_group_fp8_quant");
}