[Feature] support blackwell gemm in ht (#7053)

* [Feature] support blackwell gemm in ht

* [Feature] support conversion ops

* Fix CUDA error 716 (misaligned address)

* Fix CUDA error

* Optimize memory usage

* Remove unused code
This commit is contained in:
lizhenyun01
2026-04-07 19:52:51 +08:00
committed by GitHub
parent 334b02c12b
commit 446b26bbc0
5 changed files with 1031 additions and 2 deletions
@@ -66,6 +66,7 @@ class BlockWiseFP8Config(QuantConfigBase):
self.quant_min_bound = -448
self.quant_round_type = 1
self.use_deep_gemm = bool(envs.FD_USE_DEEP_GEMM)
self.use_blackwell_gemm = bool(envs.FD_USE_BLACKWELL_GEMM)
self.is_checkpoint_bf16 = is_checkpoint_bf16
self.deepgemm_scale_ue8m0 = True if get_sm_version() >= 100 else False
@@ -83,7 +84,16 @@ class BlockWiseFP8Config(QuantConfigBase):
Get quantization method.
"""
if isinstance(layer, FusedMoE):
if layer.ep_size > 1 or self.use_deep_gemm:
if self.use_blackwell_gemm:
assert (
self.use_deep_gemm
), "Blackwell gemm is supported only for prefill moe, please set FD_USE_DEEP_GEMM=1 as well"
from fastdeploy.model_executor.layers.moe.fused_moe_blackwell_backend import (
BlackwellGemmFusedMoeMethod,
)
return BlackwellGemmFusedMoeMethod(self)
elif layer.ep_size > 1 or self.use_deep_gemm:
from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
DeepGemmFusedMoeMethod,
)