Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2026-04-24 01:29:57 +08:00.
[Optimization][OP]support per_token_group_fp8_quant cuda kernel (#6865)
* support per_token_group_fp8_quant cuda kernel
* Potential fix for pull request finding
* update code

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -1232,6 +1232,15 @@ std::vector<paddle::Tensor> CpGatherIndexerKQuantCacheKernel(
|
||||
const paddle::Tensor& block_table,
|
||||
const paddle::Tensor& cu_seq_lens);
|
||||
|
||||
// Per-token-group FP8 quantization (CUDA kernel entry, declared here for the
// pybind11 binding "per_token_group_fp8_quant").
//
// NOTE(review): semantics below are inferred from parameter names — the kernel
// body is not visible in this file; confirm against the .cu implementation.
//
// @param input       Source tensor to quantize. Presumably quantized in groups
//                    of `group_size` elements along the last dimension — TODO
//                    confirm.
// @param output_q    [out] Quantized FP8 values (written in place).
// @param output_s    [out] Per-group scale factors (written in place).
// @param group_size  Number of elements sharing one scale factor.
// @param eps         Small constant, presumably a lower bound on the group
//                    max to avoid division by zero when computing scales.
// @param fp8_min     Minimum representable FP8 value used for clamping.
// @param fp8_max     Maximum representable FP8 value used for clamping.
// @param scale_ue8m0 If true, presumably rounds scales to the UE8M0
//                    (power-of-two exponent-only) format — verify in kernel.
void PerTokenGroupQuantFp8(const paddle::Tensor& input,
                           paddle::Tensor& output_q,
                           paddle::Tensor& output_s,
                           int64_t group_size,
                           double eps,
                           double fp8_min,
                           double fp8_max,
                           bool scale_ue8m0);
PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
m.def("get_expert_token_num",
|
||||
&GetExpertTokenNum,
|
||||
@@ -1878,4 +1887,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
m.def("cp_gather_indexer_k_quant_cache",
|
||||
&CpGatherIndexerKQuantCacheKernel,
|
||||
"cp_gather_indexer_k_quant_cache");
|
||||
|
||||
m.def("per_token_group_fp8_quant",
|
||||
&PerTokenGroupQuantFp8,
|
||||
"per_token_group_quant_fp8");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user