mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Feature] Support Ernie FP8 on sm100 (#5593)
* Deepgemm暂时可用版本 * dense部分 e8m0 ok * EB模型E8M0跑通的版本 * code check * support 21b-tp2, dev_paddle * 单机4.5T ep OK的版本 * 修复删除的代码,单机4.5T ep(非cudagraph) * eb tp * Support SM100 block-wise FP8 inference * refine codes, support deepgemm on sm100 * add thirdparty PFCC/DeepGEMM * fix ep decode * 使用deepep ue8m0, 解决精度问题 * 修复FP8 TP精度 * Deepgemm升级适配Hopper逻辑 * add ue8m0 kernel * add ue8m0 kernel * fix custom_ops/gpu_ops/cpp_extensions.cc * eb 输出正常 * eb5 text is right * 目测精度一致 * 自测精度对齐 * 替换masked_per_token_quant, ep精度OK * 性能提升约30% * 暂时跑通ep但是有问题 * 自测一致 * rm test fun * fix ep event * 图优化算子更新Deepgemm * fix build * 暂时绕过deepgemm CI编译问题 * 根据SM区分deepgemm版本 * remove useless code --------- Co-authored-by: ckl117 <ckl117@163.com> Co-authored-by: K11OntheBoat <ruianmaidanglao@163.com> Co-authored-by: fxyfxy777 <fxyfxy777@163.com>
This commit is contained in:
@@ -254,7 +254,10 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten
|
||||
Only used in deep_gemm block wise quant weight.
|
||||
copy from FastDeploy/custom_ops/gpu_ops/fp8_deep_gemm/tests/test_core.py.
|
||||
"""
|
||||
from fastdeploy.model_executor.ops.gpu.deep_gemm import ceil_div
|
||||
try:
|
||||
from deep_gemm import ceil_div
|
||||
except ModuleNotFoundError:
|
||||
from fastdeploy.model_executor.ops.gpu.deep_gemm import ceil_div
|
||||
|
||||
assert x.dim() == 2
|
||||
m, n = x.shape
|
||||
@@ -551,6 +554,12 @@ def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_
|
||||
return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, offset=offset)
|
||||
|
||||
|
||||
def get_sm_version():
    """Return the CUDA compute capability (SM version) of the current device.

    The value is encoded as ``major * 10 + minor`` (e.g. 90 for Hopper
    sm90, 100 for Blackwell sm100), matching the convention used to pick
    the appropriate DeepGEMM code path.
    """
    device_props = paddle.device.cuda.get_device_properties()
    return device_props.major * 10 + device_props.minor
|
||||
|
||||
|
||||
def modules_to_convert(prefix: str, fd_config: FDConfig):
|
||||
import fnmatch
|
||||
|
||||
|
||||
Reference in New Issue
Block a user