From f0189292df5692d02b30c97cf650e86555b3b89b Mon Sep 17 00:00:00 2001 From: bukejiyu <52310069+bukejiyu@users.noreply.github.com> Date: Wed, 12 Nov 2025 20:26:49 +0800 Subject: [PATCH] [CI] fix test_model_cache (#4982) * ci * update --- .../model_executor/load_weight_utils.py | 2 +- tests/conftest.py | 15 ++++--- tests/model_loader/test_model_cache.py | 8 +--- tests/model_loader/test_offline_model.py | 6 --- tests/model_loader/test_torch_model.py | 5 --- tests/model_loader/utils.py | 40 ------------------- 6 files changed, 12 insertions(+), 64 deletions(-) diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py index f2cf9ba1be..1071313385 100644 --- a/fastdeploy/model_executor/load_weight_utils.py +++ b/fastdeploy/model_executor/load_weight_utils.py @@ -398,7 +398,7 @@ def get_all_weights_file(model_path: str): """ model_path = Path(model_path) use_safetensors = True - files_list = [file for file in model_path.glob("*.pdparams") if file.name != "scheduler.pdparams"] + files_list = [str(file) for file in model_path.glob("*.pdparams") if file.name != "scheduler.pdparams"] if len(files_list) > 0: key_name_list = [] use_safetensors = False diff --git a/tests/conftest.py b/tests/conftest.py index 99536c2d97..22c71ca862 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,7 +15,12 @@ import time from typing import Any, Union import pytest -from model_loader.utils import clean_ports +from e2e.utils.serving_utils import ( + FD_API_PORT, + FD_CACHE_QUEUE_PORT, + FD_ENGINE_QUEUE_PORT, + clean_ports, +) class FDRunner: @@ -31,10 +36,7 @@ class FDRunner: ) -> None: from fastdeploy.entrypoints.llm import LLM - ports_to_clean = [] - port_keys = ["engine_worker_queue_port", "cache_queue_port", "port", "metrics_port"] - ports_to_clean.extend(kwargs[k] for k in port_keys if k in kwargs) - clean_ports(ports_to_clean) + clean_ports() time.sleep(10) graph_optimization_config = {"use_cudagraph": False} self.llm = LLM( @@ -46,6 +48,9 @@ class FDRunner: quantization=quantization, max_num_batched_tokens=max_model_len, graph_optimization_config=graph_optimization_config, + port=FD_API_PORT, + cache_queue_port=FD_CACHE_QUEUE_PORT, + engine_worker_queue_port=FD_ENGINE_QUEUE_PORT, **kwargs, ) diff --git a/tests/model_loader/test_model_cache.py b/tests/model_loader/test_model_cache.py index ff924a6d06..9a37e52749 100644 --- a/tests/model_loader/test_model_cache.py +++ b/tests/model_loader/test_model_cache.py @@ -31,9 +31,6 @@ from tests.model_loader.utils import ( run_with_timeout, ) -FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313)) -FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) - prompts = ["解释下“温故而知新", "Hello, how are you?"] @@ -48,6 +45,7 @@ model_param_map = { ], "max_num_seqs": 1, "graph_optimization_config": {"use_cudagraph": False}, + "env": {"FD_USE_MACHETE": "0"}, } } @@ -102,9 +100,7 @@ def test_model_cache( max_tokens, quantization, "default_v1", - FD_ENGINE_QUEUE_PORT, prompts, - FD_CACHE_QUEUE_PORT, ), ) @@ -123,9 +119,7 @@ def test_model_cache( max_tokens, quantization, "default_v1", - FD_ENGINE_QUEUE_PORT, prompts, - FD_CACHE_QUEUE_PORT, ), ) check_tokens_id_and_text_close( diff --git a/tests/model_loader/test_offline_model.py b/tests/model_loader/test_offline_model.py index 25373da435..ea67fadf15 100644 --- a/tests/model_loader/test_offline_model.py +++ b/tests/model_loader/test_offline_model.py @@ -30,10 +30,6 @@ from tests.model_loader.utils import ( run_with_timeout, ) -FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313)) -FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) - - model_param_map = { "Qwen3-30B-A3B-FP8": { "tensor_parallel_size": 2, @@ -102,8 +98,6 @@ def test_offline_model( max_tokens, quantization, "default_v1", - FD_ENGINE_QUEUE_PORT, prompts, - FD_CACHE_QUEUE_PORT, ), ) diff --git a/tests/model_loader/test_torch_model.py b/tests/model_loader/test_torch_model.py index e10c1376d8..bc8252a442 100644 --- a/tests/model_loader/test_torch_model.py +++ b/tests/model_loader/test_torch_model.py @@ -31,9 +31,6 @@ from tests.model_loader.utils import ( run_with_timeout, ) -FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313)) -FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) - prompts = ["北京天安门在哪里?"] @@ -134,9 +131,7 @@ def test_model_against_baseline( max_tokens, quantization, "default_v1", - FD_ENGINE_QUEUE_PORT, prompts, - FD_CACHE_QUEUE_PORT, ), ) diff --git a/tests/model_loader/utils.py b/tests/model_loader/utils.py index 18672e54ff..9a41422fe9 100644 --- a/tests/model_loader/utils.py +++ b/tests/model_loader/utils.py @@ -85,9 +85,7 @@ def form_model_get_output_topp0( max_tokens, quantization, load_choices, - engine_worker_queue_port, prompts, - cache_queue_port, result_queue, ): try: @@ -98,8 +96,6 @@ def form_model_get_output_topp0( max_model_len=max_model_len, load_choices=load_choices, quantization=quantization, - engine_worker_queue_port=engine_worker_queue_port, - cache_queue_port=cache_queue_port, ) as fd_model: fd_outputs = fd_model.generate_topp0(prompts, max_tokens=max_tokens) result_queue.put(fd_outputs) @@ -109,42 +105,6 @@ def form_model_get_output_topp0( pytest.fail(f"Failed to initialize LLM model from {model_path}") -def form_model_get_output_topp1( - fd_runner, - model_path, - tensor_parallel_size, - max_num_seqs, - max_model_len, - max_tokens, - quantization, - load_choices, - engine_worker_queue_port, - prompts, - cache_queue_port, - result_queue, -): - try: - with fd_runner( - model_path, - tensor_parallel_size=tensor_parallel_size, - max_num_seqs=max_num_seqs, - max_model_len=max_model_len, - load_choices=load_choices, - quantization=quantization, - engine_worker_queue_port=engine_worker_queue_port, - cache_queue_port=cache_queue_port, - ) as fd_model: - from fastdeploy.engine.sampling_params import SamplingParams - - sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=max_tokens) - fd_outputs = fd_model.generate(prompts, sampling_params) - result_queue.put(fd_outputs) - except Exception: - print(f"Failed using {load_choices} loader to load model from {model_path}.") - traceback.print_exc() - pytest.fail(f"Failed to initialize LLM model from {model_path}") - - def kill_process_on_port(port: int): """ Kill processes that are listening on the given port.