# Mirror of https://github.com/PaddlePaddle/FastDeploy.git
# Synced 2026-04-23 00:17:25 +08:00, commit 7da5f54fb3
# "Add unit test for swap_layout; remove splitwise_scheduler test"
# (98 lines, 3.2 KiB, Python)
# Standard library.
import random
import time
import unittest

# Third-party.
import paddle

# Project-local GPU cache ops under test.
from fastdeploy.cache_manager.ops import cuda_host_alloc, cuda_host_free
from fastdeploy.model_executor.ops.gpu import swap_cache_layout
class Test(unittest.TestCase):
    """Unit test for the ``swap_cache_layout`` GPU op.

    Round-trips KV-cache blocks from per-layer GPU cache tensors into a
    pinned host buffer and back, then verifies every swapped block still
    holds its original per-layer fill value.
    """

    def setUp(self):
        """Configure cache geometry, allocate the pinned host buffer, pick block ids."""
        self.layer_num = 30
        self.block_num = 3000
        self.head_num = 4
        self.block_size = 64
        self.head_dim = 128

        self.swap_block_num = 100
        self.cache_shape = [self.block_num, self.head_num, self.block_size, self.head_dim]
        # Use a unittest assertion instead of a bare `assert`, which is
        # stripped when Python runs with -O.
        self.assertLessEqual(self.swap_block_num, self.block_num)

        # cache layout: layer_num * [block_num, head_num, block_size, head_dim]
        # buffer layout: [block_num, layer_num, head_num, block_size, head_dim]

        # 2 bytes per element (float16 caches).
        self.block_bytes = self.head_num * self.block_size * self.head_dim * 2
        buffer_total_bytes = self.swap_block_num * self.layer_num * self.block_bytes
        # Pinned (page-locked) host memory; released in tearDown.
        self.cpu_buffer = cuda_host_alloc(buffer_total_bytes)

        # Random distinct GPU block ids map onto the first swap_block_num host slots.
        self.gpu_block_ids = random.sample(list(range(self.block_num)), self.swap_block_num)
        self.cpu_block_ids = list(range(self.swap_block_num))

    def tearDown(self) -> None:
        """Free the pinned host buffer allocated in setUp."""
        cuda_host_free(self.cpu_buffer)

    def _init_gpu_cache(self, fill_value=None):
        """Create one GPU cache tensor per layer.

        When ``fill_value`` is None, layer ``i`` is filled with the value
        ``i`` so the round-trip check can identify the originating layer;
        otherwise every layer is filled with ``fill_value``.
        """
        gpu_cache_tensors = []
        for i in range(self.layer_num):
            value = i if fill_value is None else float(fill_value)
            gpu_cache_tensors.append(paddle.full(self.cache_shape, fill_value=value, dtype=paddle.float16))
        # Make sure the fills have completed before timing the swap.
        paddle.device.synchronize()
        return gpu_cache_tensors

    def _swap_cache_layout(self):
        """Swap device->host then host->device, timing each leg, and verify the round trip."""
        self.gpu_cache_tensors = self._init_gpu_cache()

        # perf_counter is the monotonic high-resolution clock recommended
        # for measuring short intervals (time.time can be too coarse).
        ss = time.perf_counter()
        swap_cache_layout(
            self.gpu_cache_tensors,
            self.cpu_buffer,
            self.cache_shape,
            self.gpu_block_ids,
            self.cpu_block_ids,
            0,
            # NOTE(review): last arg presumably selects direction
            # (0 = device->host) — confirm against the op's signature.
            0,
        )
        cost_time = time.perf_counter() - ss
        total_gb = self.block_bytes * self.swap_block_num * self.layer_num / 1073741824
        speed = total_gb / cost_time
        print(
            f"swap cache layout (device to host), total_gb: {total_gb:.6f}GB, cost_time: {cost_time:.6f}s, speed: {speed:.6f}GB/s"
        )

        # Wipe the GPU cache so a passing check proves the data really
        # came back from the host buffer, not leftover GPU contents.
        self.gpu_cache_tensors = self._init_gpu_cache(-1)

        ss = time.perf_counter()
        swap_cache_layout(
            self.gpu_cache_tensors,
            self.cpu_buffer,
            self.cache_shape,
            self.gpu_block_ids,
            self.cpu_block_ids,
            0,
            # NOTE(review): 1 presumably means host->device — confirm.
            1,
        )
        cost_time = time.perf_counter() - ss
        speed = total_gb / cost_time
        print(
            f"swap cache layout (host to device), total_gb: {total_gb:.6f}GB, cost_time: {cost_time:.6f}s, speed: {speed:.6f}GB/s"
        )

        # Every swapped block in layer i must hold the value i again.
        for i in range(self.layer_num):
            gpu_cache = self.gpu_cache_tensors[i][self.gpu_block_ids]
            self.assertTrue(paddle.allclose(gpu_cache, paddle.ones_like(gpu_cache) * i))

    def test_swap_cache_layout(self):
        """Run the swap round trip several times with fresh random block ids each run."""
        for _ in range(5):
            self._swap_cache_layout()
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()