[Others] support importing deepgemm/deepep from fleet ops (#6351)

* update paddleformers to v1.0

* only change the import path for fleet ops (a sketch of the resulting fallback pattern follows the commit metadata below)
JYChen
2026-02-09 11:53:13 +08:00
committed by GitHub
parent 74762b0fb2
commit 9bcd863902
4 changed files with 46 additions and 33 deletions
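The diff below replaces per-file `if get_sm_version() == 100:` import branching with attribute lookups on a single shared deep_gemm module, falling back from the new kernel names to the older PFCC ones. A minimal standalone sketch of that resolution pattern; the module and kernel objects here are illustrative stand-ins, not FastDeploy's API:

from types import SimpleNamespace


def resolve_kernel(mod, new_name, old_name):
    # Prefer the new symbol name; fall back to the legacy alias if the
    # loaded backend only exposes the old one.
    try:
        return getattr(mod, new_name)
    except AttributeError:
        return getattr(mod, old_name)


# Stand-in backend that only exposes the legacy PFCC-style name.
backend = SimpleNamespace(gemm_fp8_fp8_bf16_nt=lambda a, b: "legacy kernel")
fp8_gemm_nt = resolve_kernel(backend, "fp8_gemm_nt", "gemm_fp8_fp8_bf16_nt")
print(fp8_gemm_nt(None, None))  # -> "legacy kernel"

Note that the hunks below use a bare `except:`; catching `AttributeError` as above is narrower and avoids masking unrelated failures.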
@@ -28,24 +28,23 @@ from fastdeploy.platforms import current_platform
 from fastdeploy.utils import register_custom_python_op
 from fastdeploy.worker.tbo import let_another_thread_run
 from ..utils import get_sm_version
 from .fused_moe_backend_base import MoEMethodBase
 from .fused_moe_triton_backend import BlockWiseFP8MoEMethod
 
 if current_platform.is_cuda():
-    if get_sm_version() == 100:
-        logger.info("Detected sm100, use PFCC DeepGEMM")
-        paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
-        from deep_gemm import (
-            m_grouped_fp8_gemm_nt_contiguous,
-            m_grouped_fp8_gemm_nt_masked,
-        )
-    else:
-        from fastdeploy.model_executor.ops.gpu.deep_gemm import (
-            m_grouped_gemm_fp8_fp8_bf16_nt_contiguous as m_grouped_fp8_gemm_nt_contiguous,
-        )
-        from fastdeploy.model_executor.ops.gpu.deep_gemm import (
-            m_grouped_gemm_fp8_fp8_bf16_nt_masked as m_grouped_fp8_gemm_nt_masked,
-        )
+    try:
+        m_grouped_fp8_gemm_nt_contiguous = (
+            fastdeploy.model_executor.layers.quantization.fp8_utils.deep_gemm.m_grouped_fp8_gemm_nt_contiguous
+        )
+        m_grouped_fp8_gemm_nt_masked = (
+            fastdeploy.model_executor.layers.quantization.fp8_utils.deep_gemm.m_grouped_fp8_gemm_nt_masked
+        )
+    except:
+        m_grouped_fp8_gemm_nt_contiguous = (
+            fastdeploy.model_executor.layers.quantization.fp8_utils.deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous
+        )
+        m_grouped_fp8_gemm_nt_masked = (
+            fastdeploy.model_executor.layers.quantization.fp8_utils.deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked
+        )
 else:
     m_grouped_fp8_gemm_nt_contiguous = None
@@ -42,14 +42,10 @@ from ..utils import get_sm_version, get_tensor, per_block_cast_to_fp8
 from .quant_base import QuantConfigBase, QuantMethodBase
 
 if current_platform.is_cuda():
-    if get_sm_version() == 100:
-        # SM100 should use PFCC DeepGemm
-        paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
-        from deep_gemm import fp8_gemm_nt
-    else:
-        from fastdeploy.model_executor.ops.gpu.deep_gemm import (
-            gemm_fp8_fp8_bf16_nt as fp8_gemm_nt,
-        )
+    try:
+        fp8_gemm_nt = fastdeploy.model_executor.layers.quantization.fp8_utils.deep_gemm.fp8_gemm_nt
+    except:
+        fp8_gemm_nt = fastdeploy.model_executor.layers.quantization.fp8_utils.deep_gemm.gemm_fp8_fp8_bf16_nt
 else:
     fp8_gemm_nt = None
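Both hunks above dereference the kernel through fastdeploy.model_executor.layers.quantization.fp8_utils.deep_gemm, i.e. through the one module object that fp8_utils binds at import time, so the backend choice lives in a single place and every call site automatically agrees on it. A hedged stub illustration (the stub module names are hypothetical, not FastDeploy's):

import types

# Stub for the module that owns the backend choice (fp8_utils in the diff).
fp8_utils_stub = types.ModuleType("fp8_utils_stub")
fp8_utils_stub.deep_gemm = types.ModuleType("deep_gemm_stub")
fp8_utils_stub.deep_gemm.m_grouped_fp8_gemm_nt_masked = lambda *a: "masked gemm"

# Call sites read the kernel off the shared module attribute instead of
# re-importing and re-selecting a backend themselves.
kernel = fp8_utils_stub.deep_gemm.m_grouped_fp8_gemm_nt_masked
print(kernel())  # -> "masked gemm"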
@@ -21,16 +21,36 @@ from fastdeploy.platforms import current_platform
 from ..utils import get_sm_version
 
-if current_platform.is_cuda():
-    if get_sm_version() == 100:
-        # SM100 should use PFCC DeepGemm
-        logger.info("Detected sm100, use PFCC DeepGEMM")
-        paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
-        import deep_gemm
-    else:
-        from fastdeploy.model_executor.ops.gpu import deep_gemm
-else:
-    deep_gemm = None
+def load_deep_gemm():
+    """
+    Load DeepGemm module according to FastDeploy env switch.
+
+    Returns:
+        Imported deep_gemm module object.
+    """
+    if current_platform.is_cuda():
+        if get_sm_version() == 100:
+            # SM100 should use PFCC DeepGemm
+            paddle.compat.enable_torch_proxy(scope={"deep_gemm"})
+            try:
+                from paddlefleet.ops import deep_gemm
+
+                logger.info("Detected sm100, use PaddleFleet DeepGEMM")
+            except:
+                import deep_gemm
+
+                logger.info("Detected sm100, use PFCC DeepGEMM")
+        else:
+            logger.info("use FastDeploy DeepGEMM")
+            from fastdeploy.model_executor.ops.gpu import deep_gemm
+    else:
+        deep_gemm = None
+    return deep_gemm
+
+
+deep_gemm = load_deep_gemm()
+
 
 def ceil_div(x: int, y: int) -> int:
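load_deep_gemm() now prefers PaddleFleet's bundled DeepGEMM on sm100 and falls back to the standalone PFCC deep_gemm package. A generic standard-library version of that "first importable candidate wins" chain; the helper is not part of FastDeploy, and it assumes deep_gemm is importable as a paddlefleet.ops submodule:

import importlib


def first_importable(*candidates):
    # Return the first module in `candidates` that imports cleanly.
    for name in candidates:
        try:
            return importlib.import_module(name)
        except ImportError:
            continue
    return None


# Mirrors the fallback order on sm100: PaddleFleet first, then PFCC.
deep_gemm = first_importable("paddlefleet.ops.deep_gemm", "deep_gemm")
print("loaded:", getattr(deep_gemm, "__name__", None))

As elsewhere in this commit, the bare `except:` around the paddlefleet import would be safer as `except ImportError:`.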
@@ -23,6 +23,7 @@ from paddle import Tensor, nn
 from paddle.framework import in_dynamic_mode
 from scipy.linalg import block_diag
 
+import fastdeploy
 from fastdeploy.config import FDConfig
 from fastdeploy.platforms import current_platform
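The bare `import fastdeploy` added here only makes the long dotted chain in the next hunk resolvable if the quantization submodules have been imported somewhere by the time it runs; on a plain package, attribute access fails until the submodule is actually imported. A standard-library illustration of that rule:

import importlib

import email  # does not import email.mime.text

try:
    email.mime.text
except AttributeError:
    print("submodule not bound yet")

importlib.import_module("email.mime.text")
print(email.mime.text.MIMEText)  # attribute chain now resolves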
@@ -254,10 +255,7 @@ def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Ten
     Only used in deep_gemm block wise quant weight.
     copy from FastDeploy/custom_ops/gpu_ops/fp8_deep_gemm/tests/test_core.py.
     """
-    try:
-        from deep_gemm import ceil_div
-    except ModuleNotFoundError:
-        from fastdeploy.model_executor.ops.gpu.deep_gemm import ceil_div
+    ceil_div = fastdeploy.model_executor.layers.quantization.fp8_utils.deep_gemm.ceil_div
     assert x.dim() == 2
     m, n = x.shape
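per_block_cast_to_fp8 uses ceil_div to round the tensor dimensions up to whole quantization blocks, following the [128, 128] block_size default in the signature above. The arithmetic is plain integer ceiling division; a self-contained illustration:

def ceil_div(x: int, y: int) -> int:
    # Ceiling division without floats: ceil(x / y) for positive ints.
    return (x + y - 1) // y


# Padding an (m, n) weight up to whole 128x128 blocks; block-wise FP8
# quantization keeps one scale per block.
m, n = 300, 1000
block = 128
padded_m = ceil_div(m, block) * block  # 384
padded_n = ceil_div(n, block) * block  # 1024
num_blocks = ceil_div(m, block) * ceil_div(n, block)  # 3 * 8 = 24
print(padded_m, padded_n, num_blocks)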