Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2026-04-23 08:21:53 +08:00.
[Feature] Add deepgemm bias epilogue for SM100 (#6857)
* [Feature] Add deepgemm bias epilogue for SM100
* fix
This commit is contained in:
@@ -123,13 +123,23 @@ def deep_gemm_fp8_gemm_nt(
|
||||
layer_weight_scale_inv: paddle.Tensor,
|
||||
linear_out: paddle.Tensor,
|
||||
layer_output_size: int,
|
||||
bias: paddle.Tensor = None,
|
||||
):
|
||||
# disable_ue8m0_cast is default False for SM100
|
||||
fp8_gemm_nt(
|
||||
(x, x_scale_tensor),
|
||||
(layer_weight, layer_weight_scale_inv),
|
||||
linear_out,
|
||||
)
|
||||
if get_sm_version() == 100 and current_platform.is_cuda():
|
||||
# disable_ue8m0_cast is default False for SM100
|
||||
fp8_gemm_nt(
|
||||
(x, x_scale_tensor),
|
||||
(layer_weight, layer_weight_scale_inv),
|
||||
linear_out,
|
||||
bias=bias,
|
||||
)
|
||||
else:
|
||||
# disable_ue8m0_cast is default False for SM100
|
||||
fp8_gemm_nt(
|
||||
(x, x_scale_tensor),
|
||||
(layer_weight, layer_weight_scale_inv),
|
||||
linear_out,
|
||||
)
|
||||
return linear_out
|
||||
|
||||
|
||||
@@ -337,14 +347,27 @@ class BlockWiseFP8LinearMethod(QuantMethodBase):
|
||||
using_ue8m0_scale=self.quant_config.deepgemm_scale_ue8m0,
|
||||
)
|
||||
x_scale_tensor = x_scale_tensor.T[: x.shape[0], ...]
|
||||
deep_gemm_fp8_gemm_nt(
|
||||
x,
|
||||
x_scale_tensor,
|
||||
layer.weight,
|
||||
layer.weight_scale_inv,
|
||||
linear_out,
|
||||
layer_output_size=layer.output_size,
|
||||
)
|
||||
if layer.with_bias:
|
||||
linear_out = paddle.add(linear_out, layer.bias)
|
||||
|
||||
if get_sm_version() == 100 and current_platform.is_cuda():
|
||||
deep_gemm_fp8_gemm_nt(
|
||||
x,
|
||||
x_scale_tensor,
|
||||
layer.weight,
|
||||
layer.weight_scale_inv,
|
||||
linear_out,
|
||||
layer_output_size=layer.output_size,
|
||||
bias=layer.bias if layer.with_bias else None,
|
||||
)
|
||||
else:
|
||||
deep_gemm_fp8_gemm_nt(
|
||||
x,
|
||||
x_scale_tensor,
|
||||
layer.weight,
|
||||
layer.weight_scale_inv,
|
||||
linear_out,
|
||||
layer_output_size=layer.output_size,
|
||||
)
|
||||
if layer.with_bias:
|
||||
linear_out = paddle.add(linear_out, layer.bias)
|
||||
|
||||
return linear_out
|
||||
|
||||
Reference in New Issue
Block a user