init (#6642)

Co-authored-by: gongweibao <gognweibao@baidu.com>
2026-04-25 09:57:51 +08:00 · 2026-03-04 21:55:31 +08:00
parent 5c8f5184d9
commit ddb06ff83f
306 changed files with 40627 additions and 34418 deletions
@@ -30,7 +30,8 @@ struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum

 // n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp
 // specialized dynamic schedule For FP8 kernels with Block Scaling
-template <int Stages_, class ClusterShape_ = Shape<_1, _1, _1>,
+template <int Stages_,
+          class ClusterShape_ = Shape<_1, _1, _1>,
          class KernelSchedule = KernelTmaWarpSpecialized,
          int ScaleGranularityM =
              0  // `ScaleGranularityM` specifies scaling granularity along M,
@@ -38,7 +39,8 @@ template <int Stages_, class ClusterShape_ = Shape<_1, _1, _1>,
                 // granularity is `size<0>(TileShape_MNK{})` along M.
          >
 struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8
-    : MainloopSm90TmaGmmaWarpSpecialized<Stages_, ClusterShape_,
+    : MainloopSm90TmaGmmaWarpSpecialized<Stages_,
+                                         ClusterShape_,
                                         KernelSchedule> {
  static_assert(
      cute::is_same_v<