mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
407 lines
12 KiB
Python
407 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
DSMLAWriteCacheKernel 算子单元测试
|
|
|
|
测试 cpp_extensions.cc 中定义的 dsk_attn_write_cache 算子:
|
|
- 参数: kv_nope, kv_pe, kv_cache, slot_mapping, seq_lens, seq_lens_decoder,
|
|
batch_id_per_token, cu_seqlens_q, block_tables, kv_signal_data(optional),
|
|
scale(optional), cache_quant_type_str, max_seq_len, is_prefill
|
|
"""
|
|
|
|
|
|
import time
|
|
import unittest
|
|
|
|
import paddle
|
|
|
|
from fastdeploy.model_executor.layers.attention.dsa_helper import dsk_attn_write_cache
|
|
|
|
# from fastdeploy.model_executor.ops.gpu import dsk_attn_write_cache
|
|
|
|
|
|
# DS MLA layout constants.
KV_LORA_RANK = 512  # width of the NoPE (latent) portion
PE_DIM = 64  # width of the RoPE portion
BLOCK_SIZE = 64  # tokens stored per cache block
# FP8 entry size: 512 (fp8 nope) + 16 (scales) + 128 (rope bf16) = 656 bytes
FP8_ENTRY_SIZE = 656
|
|
|
|
|
|
def create_test_tensors(
    batch_size: int = 2,
    num_tokens: int = 16,
    max_num_blocks: int = 100,
    is_prefill: bool = True,
    dtype: str = "bfloat16",
):
    """Build every input tensor needed by a dsk_attn_write_cache call.

    Args:
        batch_size: number of requests in the batch.
        num_tokens: total token count across all requests.
        max_num_blocks: number of blocks allocated in the KV cache.
        is_prefill: whether the tensors model the prefill stage.
        dtype: dtype of the kv_nope / kv_pe activation inputs.

    Returns:
        dict: all tensor/scalar arguments keyed by parameter name.
    """
    # Activation inputs: latent (NoPE) part and RoPE part.
    kv_nope = paddle.randn([num_tokens, KV_LORA_RANK], dtype=dtype)
    kv_pe = paddle.randn([num_tokens, PE_DIM], dtype=dtype).unsqueeze(1)

    # KV cache layout: [num_blocks, num_kv_heads=1, block_size, entry_size].
    kv_cache = paddle.zeros([max_num_blocks, 1, BLOCK_SIZE, FP8_ENTRY_SIZE], dtype="uint8")

    # slot_mapping: destination cache slot for each token.
    slot_mapping = paddle.arange(num_tokens, dtype="int64")

    # seq_lens: per-request sequence length (tokens split evenly).
    tokens_per_req = num_tokens // batch_size
    seq_lens = paddle.full([batch_size], tokens_per_req, dtype="int32")

    # seq_lens_decoder: already-decoded length; zero during prefill.
    if is_prefill:
        seq_lens_decoder = paddle.zeros([batch_size], dtype="int32")
    else:
        seq_lens_decoder = paddle.full([batch_size], 1, dtype="int32")

    # batch_id_per_token: owning request index for each token.
    batch_id_per_token = paddle.concat(
        [paddle.full([tokens_per_req], req_id, dtype="int32") for req_id in range(batch_size)]
    )

    # cu_seqlens_q: cumulative sequence lengths [0, len0, len0+len1, ...].
    # With equal-sized requests this is simply multiples of tokens_per_req.
    cu_seqlens_q = paddle.to_tensor(
        [req_id * tokens_per_req for req_id in range(batch_size + 1)], dtype="int32"
    )

    # block_tables: [batch_size, max_blocks_per_seq] of random block ids.
    max_blocks_per_seq = 10
    block_tables = paddle.randint(0, max_num_blocks, [batch_size, max_blocks_per_seq], dtype="int32")

    # scale: optional per-token quantization scale factor.
    scale = paddle.ones([num_tokens, 1], dtype="float32")

    return {
        "kv_nope": kv_nope,
        "kv_pe": kv_pe,
        "kv_cache": kv_cache,
        "slot_mapping": slot_mapping,
        "seq_lens": seq_lens,
        "seq_lens_decoder": seq_lens_decoder,
        "batch_id_per_token": batch_id_per_token,
        "cu_seqlens_q": cu_seqlens_q,
        "block_tables": block_tables,
        "scale": scale,
        "max_seq_len": 4096,
        "is_prefill": is_prefill,
    }
|
|
|
|
|
|
class BaseDSMLAWriteCacheTest(unittest.TestCase):
    """Shared setup for all DSMLAWriteCacheKernel test cases."""

    @classmethod
    def setUpClass(cls):
        """Run the whole suite on the GPU device."""
        paddle.set_device("gpu")

    def setUp(self):
        """Per-test hook; nothing to prepare at the moment."""
        pass
|
|
|
|
|
|
# ==================== Basic functionality tests ====================
|
|
|
|
|
|
class TestBasicPrefill(BaseDSMLAWriteCacheTest):
    """Basic prefill-mode smoke test."""

    def test_basic_prefill(self):
        """Write the cache in prefill mode; the op mutates kv_cache in place."""
        t = create_test_tensors(batch_size=2, num_tokens=16, is_prefill=True)

        args = (
            t["kv_nope"],
            t["kv_pe"],
            t["kv_cache"],
            t["slot_mapping"],
            t["scale"],
            "fp8_ds_mla",
        )
        dsk_attn_write_cache(*args)

        # dsk_attn_write_cache is an in-place op (returns an empty list);
        # sanity-check that the cache buffer kept its raw uint8 dtype.
        self.assertEqual(t["kv_cache"].dtype, paddle.uint8)
|
|
|
|
|
|
class TestBasicDecode(BaseDSMLAWriteCacheTest):
    """Basic decode-mode smoke test."""

    def test_basic_decode(self):
        """Write the cache in decode mode (one token per request)."""
        t = create_test_tensors(batch_size=2, num_tokens=2, is_prefill=False)

        dsk_attn_write_cache(
            t["kv_nope"],
            t["kv_pe"],
            t["kv_cache"],
            t["slot_mapping"],
            t["scale"],
            "fp8_ds_mla",
        )

        # In-place write; verify the cache buffer is still raw uint8 bytes.
        self.assertEqual(t["kv_cache"].dtype, paddle.uint8)
|
|
|
|
|
|
# ==================== Boundary-condition tests ====================
|
|
|
|
|
|
class TestSingleToken(BaseDSMLAWriteCacheTest):
    """Edge case: a single token."""

    def test_single_token(self):
        """The kernel must accept the minimal input of one token."""
        t = create_test_tensors(batch_size=1, num_tokens=1)

        dsk_attn_write_cache(
            t["kv_nope"], t["kv_pe"], t["kv_cache"], t["slot_mapping"], t["scale"], "fp8_ds_mla"
        )
|
|
|
|
|
|
class TestLargeBatch(BaseDSMLAWriteCacheTest):
    """Stress case: a large batch."""

    def test_large_batch(self):
        """32 requests (512 tokens total) handled in a single call."""
        t = create_test_tensors(batch_size=32, num_tokens=512)

        inputs = (t["kv_nope"], t["kv_pe"], t["kv_cache"], t["slot_mapping"], t["scale"])
        dsk_attn_write_cache(*inputs, "fp8_ds_mla")

        # In-place write; the cache buffer must remain raw uint8 bytes.
        self.assertEqual(t["kv_cache"].dtype, paddle.uint8)
|
|
|
|
|
|
class TestUnalignedTokens(BaseDSMLAWriteCacheTest):
    """Edge case: token count not aligned to the block size."""

    def test_unaligned_tokens(self):
        """17 tokens is not a multiple of BLOCK_SIZE (64)."""
        t = create_test_tensors(batch_size=1, num_tokens=17)

        dsk_attn_write_cache(
            t["kv_nope"], t["kv_pe"], t["kv_cache"], t["slot_mapping"], t["scale"], "fp8_ds_mla"
        )
|
|
|
|
|
|
# ==================== Quantization-type tests ====================
|
|
|
|
|
|
class TestQuantTypeFp8DsMla(BaseDSMLAWriteCacheTest):
    """Quantization type: fp8_ds_mla."""

    def test_quant_type_fp8_ds_mla(self):
        """Exercise the primary cache_quant_type_str value, "fp8_ds_mla"."""
        t = create_test_tensors(batch_size=2, num_tokens=16)

        dsk_attn_write_cache(
            t["kv_nope"], t["kv_pe"], t["kv_cache"], t["slot_mapping"], t["scale"], "fp8_ds_mla"
        )
|
|
|
|
|
|
class TestQuantTypeNone(BaseDSMLAWriteCacheTest):
    """Test the un-quantized cache mode."""

    def test_quant_type_none(self):
        """Un-quantized mode: scale is None and cache_quant_type_str is "none"."""
        tensors = create_test_tensors(batch_size=2, num_tokens=16)
        # Un-quantized cache has a different layout:
        # [num_blocks, 1, block_size, kv_lora_rank + pe_dim] in bf16.
        tensors["kv_cache"] = paddle.zeros([100, 1, BLOCK_SIZE, KV_LORA_RANK + PE_DIM], dtype="bfloat16")

        try:
            dsk_attn_write_cache(
                tensors["kv_nope"],
                tensors["kv_pe"],
                tensors["kv_cache"],
                tensors["slot_mapping"],
                None,  # scale may be None when no quantization is applied
                # BUG FIX: this positional argument is cache_quant_type_str
                # (a string in every other call); the original passed the
                # boolean True here instead of the "none" type string.
                "none",
            )

        except Exception as e:
            # Skip when the 'none' quant type is not implemented by the kernel.
            self.skipTest(f"'none' quant type 可能未实现: {e}")
|
|
|
|
|
|
# ==================== Optional-argument tests ====================
|
|
|
|
|
|
class TestWithoutScale(BaseDSMLAWriteCacheTest):
    """Optional argument: scale omitted."""

    def test_without_scale(self):
        """scale is optional and may be passed as None."""
        t = create_test_tensors(batch_size=2, num_tokens=16)

        dsk_attn_write_cache(
            t["kv_nope"], t["kv_pe"], t["kv_cache"], t["slot_mapping"], None, "fp8_ds_mla"
        )
|
|
|
|
|
|
class TestWithoutKvSignalData(BaseDSMLAWriteCacheTest):
    """Optional argument: kv_signal_data omitted."""

    def test_without_kv_signal_data(self):
        """Call the op without supplying kv_signal_data at all."""
        t = create_test_tensors(batch_size=2, num_tokens=16)

        dsk_attn_write_cache(
            t["kv_nope"], t["kv_pe"], t["kv_cache"], t["slot_mapping"], t["scale"], "fp8_ds_mla"
        )
|
|
|
|
|
|
# ==================== Data-type tests ====================
|
|
|
|
|
|
class TestBfloat16Input(BaseDSMLAWriteCacheTest):
    """Input dtype: bfloat16."""

    def test_bfloat16_input(self):
        """bfloat16 kv_nope / kv_pe inputs should be accepted."""
        t = create_test_tensors(dtype="bfloat16")

        dsk_attn_write_cache(
            t["kv_nope"], t["kv_pe"], t["kv_cache"], t["slot_mapping"], t["scale"], "fp8_ds_mla"
        )
|
|
|
|
|
|
class TestFloat16Input(BaseDSMLAWriteCacheTest):
    """Input dtype: float16."""

    def test_float16_input(self):
        """float16 inputs may be unsupported by the kernel; skip if they raise."""
        t = create_test_tensors(dtype="float16")

        try:
            dsk_attn_write_cache(
                t["kv_nope"],
                t["kv_pe"],
                t["kv_cache"],
                t["slot_mapping"],
                t["scale"],
                "fp8_ds_mla",
            )

        except Exception as e:
            self.skipTest(f"float16 输入可能不支持: {e}")
|
|
|
|
|
|
# ==================== Performance tests ====================
|
|
|
|
|
|
class TestDSMLAWriteCachePerformance(BaseDSMLAWriteCacheTest):
    """DSMLAWriteCacheKernel performance test."""

    def test_warmup_and_benchmark(self):
        """Warm up the kernel, then measure the average per-call latency."""
        tensors = create_test_tensors(batch_size=16, num_tokens=256)
        args = (
            tensors["kv_nope"],
            tensors["kv_pe"],
            tensors["kv_cache"],
            tensors["slot_mapping"],
            tensors["scale"],
            "fp8_ds_mla",
        )

        # Warmup so one-time compilation/caching does not skew the timing.
        for _ in range(5):
            _ = dsk_attn_write_cache(*args)

        paddle.device.synchronize()

        # Benchmark: average wall-clock time over num_iters launches.
        num_iters = 100
        start = time.perf_counter()

        for _ in range(num_iters):
            _ = dsk_attn_write_cache(*args)

        paddle.device.synchronize()
        end = time.perf_counter()

        avg_time_ms = (end - start) / num_iters * 1000
        print(f"\n[Benchmark] 256 tokens, avg time: {avg_time_ms:.3f} ms")

        # Loose performance threshold (tune for the actual hardware).
        # BUG FIX: the failure message previously said "10ms" while the
        # assertion checks against 100.0 ms; the message now matches.
        self.assertLess(avg_time_ms, 100.0, "性能应在 100ms 内")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
print("=" * 70)
|
|
print("DSMLAWriteCacheKernel 单元测试")
|
|
print("=" * 70)
|
|
|
|
# 运行测试
|
|
unittest.main(verbosity=2)
|