mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
9148562ed0
* [CI]【Hackathon 10th Spring No.35】resource_manager 单测补充 * [CI]【Hackathon 10th Spring No.35】resource_manager 单测补充 * [CI]【Hackathon 10th Spring No.35】add __main__ block --------- Co-authored-by: cloudforge1 <cloudforge1@users.noreply.github.com> Co-authored-by: CSWYF3634076 <wangyafeng@baidu.com>
254 lines
8.8 KiB
Python
254 lines
8.8 KiB
Python
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
from types import SimpleNamespace
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
|
|
# -- Stubs ------------------------------------------------------------------
|
|
|
|
|
|
class _StubCacheManager:
|
|
"""Minimal PrefixCacheManager surface for unit-testing ResourceManager."""
|
|
|
|
def __init__(self, *args, num_blocks=100, **kwargs):
|
|
self.num_gpu_blocks = num_blocks
|
|
self.gpu_free_block_list = list(range(num_blocks))
|
|
self._recycled = []
|
|
self._released = []
|
|
|
|
def allocate_gpu_blocks(self, n):
|
|
out = self.gpu_free_block_list[:n]
|
|
self.gpu_free_block_list = self.gpu_free_block_list[n:]
|
|
return out
|
|
|
|
def recycle_gpu_blocks(self, blocks):
|
|
self._recycled.extend(blocks)
|
|
self.gpu_free_block_list.extend(blocks)
|
|
|
|
def release_block_ids_async(self, task):
|
|
self._released.append(task)
|
|
|
|
def free_block_ids_async(self, n):
|
|
return n
|
|
|
|
def update_cache_config(self, cfg):
|
|
pass
|
|
|
|
def request_block_ids(self, task, block_size, dec_token_num):
|
|
total = (len(task.prompt_token_ids) + block_size - 1) // block_size
|
|
common = list(range(total // 2))
|
|
unique = list(range(100, 100 + total - total // 2))
|
|
return common, unique, {"gpu_cache_blocks": len(common), "cpu_cache_blocks": 0}
|
|
|
|
|
|
class _Task:
|
|
"""Real task object with all fields ResourceManager touches."""
|
|
|
|
def __init__(self, request_id="req-1", prompt_len=128, disaggregate_info=None):
|
|
self.request_id = request_id
|
|
self.prompt_token_ids = list(range(prompt_len))
|
|
self.prompt_token_ids_len = prompt_len
|
|
self.block_tables = []
|
|
self.need_block_tables = []
|
|
self.disaggregate_info = disaggregate_info
|
|
self.seq_lens_decoder = 0
|
|
self.inference_time_cost = -1.0
|
|
self.tokens_all_num = 0
|
|
self.idx = 0
|
|
self.num_cached_tokens = 0
|
|
self.gpu_cache_token_num = 0
|
|
self.cpu_cache_token_num = 0
|
|
self.cache_info = None
|
|
self.cache_prepare_time = 0.0
|
|
self._seed = None
|
|
|
|
def get(self, k):
|
|
return self._seed if k == "seed" else None
|
|
|
|
def set(self, k, v):
|
|
if k == "seed":
|
|
self._seed = v
|
|
|
|
|
|
def _cache_cfg(block_size=64, dec_token_num=128, max_block_num_per_seq=16, enable_prefix_caching=False):
|
|
return SimpleNamespace(
|
|
block_size=block_size,
|
|
dec_token_num=dec_token_num,
|
|
max_block_num_per_seq=max_block_num_per_seq,
|
|
enable_prefix_caching=enable_prefix_caching,
|
|
)
|
|
|
|
|
|
def _config(cache_config=None):
|
|
return SimpleNamespace(cache_config=cache_config or _cache_cfg())
|
|
|
|
|
|
def _noop_logger():
|
|
return SimpleNamespace(
|
|
info=lambda *a, **kw: None,
|
|
debug=lambda *a, **kw: None,
|
|
error=lambda *a, **kw: None,
|
|
warning=lambda *a, **kw: None,
|
|
)
|
|
|
|
|
|
def _stub_metrics():
|
|
m = SimpleNamespace()
|
|
for n in (
|
|
"max_batch_size",
|
|
"batch_size",
|
|
"available_gpu_block_num",
|
|
"gpu_cache_usage_perc",
|
|
"prefix_cache_token_num",
|
|
"prefix_gpu_cache_token_num",
|
|
"prefix_cpu_cache_token_num",
|
|
):
|
|
setattr(m, n, SimpleNamespace(set=lambda v: None, inc=lambda v: None))
|
|
return m
|
|
|
|
|
|
@pytest.fixture()
def rm_factory():
    """Yield a factory that creates ResourceManagers with stubbed deps."""
    # Patch the module-level collaborators BEFORE importing ResourceManager so
    # that any names bound at import time resolve to the stubs. The import
    # must stay inside this `with` block for the patches to take effect.
    with (
        patch("fastdeploy.engine.resource_manager.PrefixCacheManager", _StubCacheManager),
        patch("fastdeploy.engine.resource_manager.main_process_metrics", _stub_metrics()),
        patch("fastdeploy.engine.resource_manager.llm_logger", _noop_logger()),
    ):
        from fastdeploy.engine.resource_manager import ResourceManager

        def make(max_seqs=4, block_size=64, dec_token=128, enable_prefix=False, num_free=100):
            # Build a ResourceManager wired to a fresh stub cache manager so
            # each test controls exactly how many GPU blocks are free.
            cc = _cache_cfg(block_size, dec_token, 16, enable_prefix)
            rm = ResourceManager(max_seqs, _config(cc), 1, "mixed")
            rm.cache_manager = _StubCacheManager(num_blocks=num_free)
            return rm

        # Yield (not return) so the patches stay active for the test's duration.
        yield make
|
|
|
|
|
|
# -- Tests ------------------------------------------------------------------
|
|
|
|
|
|
def test_init_block_math_and_config(rm_factory):
    """Constructor fields, block-count arithmetic, and reset_cache_config."""
    manager = rm_factory(max_seqs=8, block_size=64, dec_token=128)
    # Fresh manager: all 8 slots idle.
    assert manager.max_num_seqs == 8
    assert manager.stop_flags == [True] * 8
    # Block arithmetic for a 100-token prompt with the configured sizes.
    assert manager.get_required_block_number(100) == 4
    assert manager.get_encoder_block_number(100) == 2
    assert manager.get_decoder_block_number() == 2
    assert manager.total_block_number() == 100
    # Swapping in a new cache config updates the stored block size.
    manager.reset_cache_config(_cache_cfg(block_size=128))
    assert manager.cfg.block_size == 128
|
|
|
|
|
|
def test_availability_and_sufficiency(rm_factory):
    """available_batch, available_block_num and is_resource_sufficient paths."""
    manager = rm_factory(max_seqs=4, dec_token=0, num_free=100)
    assert manager.available_batch() == 4
    assert manager.available_block_num() == 100
    assert manager.is_resource_sufficient(64)
    # Every slot busy -> even a single token cannot be scheduled.
    manager.stop_flags = [False] * 4
    assert not manager.is_resource_sufficient(1)
    # No free blocks -> insufficient regardless of free slots.
    starved = rm_factory(max_seqs=4, num_free=0)
    assert not starved.is_resource_sufficient(64)
|
|
|
|
|
|
def test_allocate_no_prefix(rm_factory):
    """Main allocation path without prefix caching (happy path)."""
    manager = rm_factory(max_seqs=4, enable_prefix=False, dec_token=0, num_free=100)
    batch = [_Task(request_id=f"r{i}") for i in range(3)]
    allocated = manager.allocate_resources_for_new_tasks(batch)
    # Three of four slots taken; batch size tracked.
    assert len(allocated) == 3
    assert manager.stop_flags == [False, False, False, True]
    assert manager.real_bsz == 3
    # Each scheduled task got a seed and a non-empty block table.
    for task in allocated:
        assert task.get("seed") is not None
        assert len(task.block_tables) > 0
|
|
|
|
|
|
def test_allocate_with_prefix(rm_factory):
    """Allocation with prefix caching populates cache bookkeeping fields."""
    manager = rm_factory(max_seqs=4, enable_prefix=True, dec_token=0, block_size=64, num_free=100)
    task = _Task(prompt_len=256)
    allocated = manager.allocate_resources_for_new_tasks([task])
    assert len(allocated) == 1
    assert len(task.block_tables) > 0
    # _record_request_cache_info must have run and filled these in.
    assert task.num_cached_tokens >= 0
    assert task.cache_info is not None
|
|
|
|
|
|
def test_allocate_disaggregate(rm_factory):
    """Disaggregated prefill (prefix on) and decode (prefix off) roles."""
    prefill_rm = rm_factory(max_seqs=4, enable_prefix=True, dec_token=0, block_size=64, num_free=100)
    prefill_task = _Task(prompt_len=256, disaggregate_info={"role": "prefill"})
    prefill_rm.allocate_resources_for_new_tasks([prefill_task])
    # Prefill path writes block tables into disaggregate_info and registers the request.
    assert "block_tables" in prefill_task.disaggregate_info
    assert prefill_task.request_id in prefill_rm.req_dict
    # Decode role without prefix caching also lands in req_dict.
    decode_rm = rm_factory(max_seqs=4, enable_prefix=False, dec_token=0, num_free=100)
    decode_task = _Task(prompt_len=128, disaggregate_info={"role": "decode"})
    decode_rm.allocate_resources_for_new_tasks([decode_task])
    assert decode_task.request_id in decode_rm.req_dict
|
|
|
|
|
|
def test_recycle_free_and_check(rm_factory):
    """_recycle_block_tables, free_block_tables and check_and_free_block_tables."""
    plain = rm_factory(enable_prefix=False, num_free=100)
    task = _Task()
    task.block_tables = [0, 1, 2]
    plain._recycle_block_tables(task)
    # Without prefix caching the blocks land in the stub's recycled list.
    assert 0 in plain.cache_manager._recycled
    # With prefix caching, recycling is delegated to release_block_ids_async.
    prefixed = rm_factory(enable_prefix=True, num_free=100)
    prefix_task = _Task()
    prefix_task.block_tables = [0, 1]
    prefixed._recycle_block_tables(prefix_task)
    assert prefix_task in prefixed.cache_manager._released
    # The stub echoes the freed count back.
    assert plain.free_block_tables(10) == 10
    plain.check_and_free_block_tables()
    # A prefix-enabled manager with few free blocks exercises the freeing branch.
    low_water = rm_factory(enable_prefix=True, num_free=5)
    low_water.check_and_free_block_tables()
|
|
|
|
|
|
def test_info_and_cache_usage(rm_factory):
    """info() summary string and get_gpu_cache_usage_perc computation."""
    manager = rm_factory(num_free=100)
    assert "ResourceManager info" in manager.info()
    # 80 free of 100 total -> 20% used.
    manager.cache_manager.num_gpu_blocks = 100
    manager.cache_manager.gpu_free_block_list = list(range(80))
    assert abs(manager.get_gpu_cache_usage_perc() - 0.2) < 1e-9
    # Zero total blocks must not divide by zero; usage reported as 0.
    empty = rm_factory(num_free=0)
    empty.cache_manager.num_gpu_blocks = 0
    assert empty.get_gpu_cache_usage_perc() == 0.0
|
|
|
|
|
|
def test_delete_cached_data(rm_factory):
    """_delete_cached_data: full and partial cache hits trim the prompt."""
    manager = rm_factory(block_size=64)
    # Fully cached 128-token prompt: 64 tokens remain, 64 counted as decoded.
    fully_cached = _Task(prompt_len=128)
    manager._delete_cached_data(fully_cached, 128)
    assert fully_cached.prompt_token_ids_len == 64
    assert fully_cached.seq_lens_decoder == 64
    # Partial hit on a 256-token prompt: 64 cached, 192 left to prefill.
    partially_cached = _Task(prompt_len=256)
    manager._delete_cached_data(partially_cached, 64)
    assert partially_cached.prompt_token_ids_len == 192
    assert partially_cached.seq_lens_decoder == 64
|
|
|
|
|
|
# Allow running this test module directly (outside a pytest invocation).
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|