mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Feature] consider multimodal model when dummy run (#6045)
* add mm do profile * update code * update code * update code * update code * update test case * update code * update code * fix xpu bug * update code * add mm do profile * update test case * update code
This commit is contained in:
@@ -34,6 +34,7 @@ def make_prefix_cache_manager(max_num_seqs, enable_mm=False, num_gpu_blocks_over
|
||||
speculative_cfg = SimpleNamespace(method=None)
|
||||
model_cfg.print = print
|
||||
model_cfg.architectures = ["test_model"]
|
||||
model_cfg.mm_max_tokens_per_item = None
|
||||
cache_cfg.bytes_per_layer_per_block = 1
|
||||
parallel_cfg = ParallelConfig(args)
|
||||
scheduler_cfg = SchedulerConfig(args)
|
||||
|
||||
@@ -80,6 +80,7 @@ def _build_manager(
|
||||
model_cfg.print = print
|
||||
model_cfg.max_model_len = max_model_len
|
||||
model_cfg.architectures = architectures or ["test_model"]
|
||||
model_cfg.mm_max_tokens_per_item = None
|
||||
cache_cfg.bytes_per_layer_per_block = 1
|
||||
cache_cfg.kv_cache_ratio = 1.0
|
||||
parallel_cfg = ParallelConfig(args)
|
||||
@@ -140,6 +141,7 @@ class TestResourceManagerV1(unittest.TestCase):
|
||||
model_cfg.print = print
|
||||
model_cfg.max_model_len = 3200
|
||||
model_cfg.architectures = ["test_model"]
|
||||
model_cfg.mm_max_tokens_per_item = None
|
||||
cache_cfg.bytes_per_layer_per_block = 1
|
||||
cache_cfg.kv_cache_ratio = 1.0
|
||||
parallel_cfg = ParallelConfig(args)
|
||||
@@ -301,6 +303,7 @@ class TestRevertChunkedMMInput(unittest.TestCase):
|
||||
model_cfg.print = print
|
||||
model_cfg.max_model_len = 3200
|
||||
model_cfg.architectures = ["test_model"]
|
||||
model_cfg.mm_max_tokens_per_item = None
|
||||
cache_cfg.bytes_per_layer_per_block = 1
|
||||
cache_cfg.kv_cache_ratio = 1.0
|
||||
cache_cfg.block_size = 64
|
||||
|
||||
@@ -32,6 +32,7 @@ def test_normal_schedule():
|
||||
speculative_cfg = SimpleNamespace(method=None)
|
||||
model_cfg.print = print
|
||||
model_cfg.max_model_len = 5120
|
||||
model_cfg.mm_max_tokens_per_item = None
|
||||
cache_cfg.bytes_per_layer_per_block = 1
|
||||
parallel_cfg = ParallelConfig(args)
|
||||
scheduler_cfg = SchedulerConfig(args)
|
||||
@@ -97,6 +98,7 @@ def test_preempted_request():
|
||||
speculative_cfg = SimpleNamespace(method=None)
|
||||
model_cfg.print = print
|
||||
model_cfg.max_model_len = 5120
|
||||
model_cfg.mm_max_tokens_per_item = None
|
||||
cache_cfg.bytes_per_layer_per_block = 1
|
||||
parallel_cfg = ParallelConfig(args)
|
||||
scheduler_cfg = SchedulerConfig(args)
|
||||
@@ -163,6 +165,7 @@ def test_caching_output():
|
||||
speculative_cfg = SimpleNamespace(method=None)
|
||||
model_cfg.print = print
|
||||
model_cfg.max_model_len = 5120
|
||||
model_cfg.mm_max_tokens_per_item = None
|
||||
cache_cfg.bytes_per_layer_per_block = 1
|
||||
parallel_cfg = ParallelConfig(args)
|
||||
scheduler_cfg = SchedulerConfig(args)
|
||||
|
||||
Reference in New Issue
Block a user