[Optimization] Support FA2/FA3/FA4 with attn_mask_q (#6354)

* support FA4 sm100

* flash attn backend supports mask

* flash attn backend runs flashmask correctly

* add test for flash_attn_backend and flash_attn_func

* check

* add test for fa4

* requirements.txt add fa4 whl

* check test on sm100

* fix CI conflict

* add enable_torch_proxy for flash_mask

* lazy import fa4

* check

* fix tests import

* check test_load_mpt import
Author: chen
Date: 2026-02-05 14:39:00 +08:00
Committed by: GitHub
Parent commit: 72edd394d9
Commit: 29a313a402
22 changed files with 999 additions and 101 deletions
@@ -19,6 +19,7 @@ import re
 from collections.abc import Mapping
 from contextlib import contextmanager
 from dataclasses import dataclass, field
+from functools import cache
 from typing import Any, List, Optional, Union
 import paddle
@@ -547,3 +548,11 @@ def rename_offline_ckpt_suffix_to_fd_suffix(
         return loaded_weight_name

     return fn
+
+
+@cache
+def get_sm_version():
+    # Note: Paddle has no paddle.cuda.is_available(); use the compile-time check.
+    if paddle.is_compiled_with_cuda():
+        prop = paddle.device.cuda.get_device_properties()
+        return prop.major * 10 + prop.minor
+    return 0
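The cached `get_sm_version()` encodes the GPU compute capability as `major * 10 + minor` (e.g. sm90 for Hopper, sm100 for Blackwell), which the backend can then use to choose a FlashAttention version. The selector below is a hypothetical sketch, not code from this PR: the function name and the exact cutoffs (FA4 on sm100+, FA3 on sm90+, FA2 otherwise) are assumptions inferred from the commit title "Support FA2/FA3/FA4" and the "support FA4 sm100" entry.

```python
# Hypothetical dispatch sketch (NOT from the PR): map the SM value that
# get_sm_version() returns to a FlashAttention major version.
def select_flash_attn_version(sm_version: int) -> int:
    """Pick a FlashAttention major version for a compute capability."""
    if sm_version >= 100:  # e.g. Blackwell sm100 -> FA4 (per "support FA4 sm100")
        return 4
    if sm_version >= 90:   # e.g. Hopper sm90 -> FA3 (assumed cutoff)
        return 3
    return 2               # older architectures fall back to FA2

print(select_flash_attn_version(100))  # prints 4
```

Because `get_sm_version()` is wrapped in `functools.cache`, the device-property query runs once per process and later dispatch calls are effectively free.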