mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2026-04-22 16:07:51 +08:00
[OP] Add flashmla baseline implementation and precision test (#7477)
@@ -0,0 +1,78 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import unittest

import paddle

paddle.set_default_dtype("bfloat16")

from fastdeploy.model_executor.layers.attention.mla_attention_backend import (
    MLAAttentionBackend,
)


class TestFlashMLA(unittest.TestCase):
    def setUp(self):
        pass

    def test_flashmla(self):
        bsz = 128
        kv_len = 1000
        # Decode-step query: [batch, q_len=1, num_heads=128, head_dim=576]
        decoder_q = paddle.randn([bsz, 1, 128, 576], dtype="bfloat16")
        cache_seqlens = paddle.zeros([bsz], dtype="int32") + kv_len
        # Page table per request; the cache uses 64-token blocks
        block_tables = paddle.arange((kv_len // 64 + 1) * bsz, dtype="int32").reshape([bsz, -1])
        # Paged latent KV cache: [num_blocks, num_kv_heads=1, block_size=64, head_dim=576]
        latent_cache = paddle.randn([10000, 1, 64, 576], dtype="bfloat16")
        # Copied from DeepSeek-V3
        attn_softmax_scale = 0.1352337788608801

        baseline_out = MLAAttentionBackend.flashmla_baseline(
            decoder_q, latent_cache, block_tables, cache_seqlens, attn_softmax_scale
        )

        paddle.enable_compat(scope={"flash_mla"})  # must be enabled before importing flash_mla
        try:
            import flash_mla
        except ImportError:
            print("Please install flash_mla first")
            return

        # NOTE: this argument list assumes the upstream FlashMLA interface,
        # get_mla_metadata(cache_seqlens, s_q * num_heads_q // num_heads_kv, num_heads_kv)
        tile_scheduler_metadata, num_splits = flash_mla.get_mla_metadata(cache_seqlens, 1 * 128 // 1, 1)

        new_cache_shape = latent_cache.shape
        assert new_cache_shape[1] == 1
        # flash_mla expects [num_blocks, block_size, num_kv_heads, head_dim], so swap the head and block axes
        new_cache_shape[1], new_cache_shape[2] = new_cache_shape[2], new_cache_shape[1]

        decoder_res, _ = flash_mla.flash_mla_with_kvcache(
            decoder_q,
            # The external open-source repo stores its KV cache in a different layout than FastDeploy.
            # Fortunately the cache has only one head here, so a plain view suffices;
            # otherwise a lot of code up and down the stack would have to change!
            latent_cache.view(new_cache_shape),
            block_tables,
            cache_seqlens,
            512,  # head_dim_v (dv in the FlashMLA tests)
            tile_scheduler_metadata,
            num_splits,
            softmax_scale=attn_softmax_scale,
            causal=True,
        )

        max_diff = (decoder_res - baseline_out).abs().max().item()
        self.assertLessEqual(max_diff, 0.1)


if __name__ == "__main__":
    unittest.main()
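
For readers without the FastDeploy source at hand, the baseline under test can be pictured as a naive paged-attention decode over the latent cache. The sketch below is an assumption inferred from the tensor shapes in the test (head_dim 576, values taken from the first 512 latent dims, 64-token blocks); `naive_mla_decode` is a hypothetical name, not the actual `MLAAttentionBackend.flashmla_baseline` implementation:

import paddle


def naive_mla_decode(q, latent_cache, block_tables, cache_seqlens, softmax_scale, dv=512, block_size=64):
    """Naive MLA decode: gather each request's latent KV pages, run softmax attention.

    q:            [bsz, 1, num_heads, 576]
    latent_cache: [num_blocks, 1, block_size, 576]
    Returns:      [bsz, 1, num_heads, dv]
    """
    bsz, _, num_heads, dim = q.shape
    outs = []
    for b in range(bsz):
        seq_len = int(cache_seqlens[b])
        num_pages = (seq_len + block_size - 1) // block_size
        pages = block_tables[b, :num_pages]
        # Gather this request's pages and flatten to [seq_len, 576]
        kv = latent_cache[pages].reshape([-1, dim])[:seq_len].astype("float32")
        qb = q[b, 0].astype("float32")  # [num_heads, 576]
        scores = paddle.matmul(qb, kv, transpose_y=True) * softmax_scale  # [num_heads, seq_len]
        probs = paddle.nn.functional.softmax(scores, axis=-1)
        # Values are the first dv dims of each latent vector
        outs.append(paddle.matmul(probs, kv[:, :dv]))
    return paddle.stack(outs).unsqueeze(1).astype(q.dtype)

Under these assumptions, `naive_mla_decode(decoder_q, latent_cache, block_tables, cache_seqlens, attn_softmax_scale)` would play the role of `baseline_out` in the test above.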