[CI] fix test_model_cache (#4982)

* ci

* update
This commit is contained in:
bukejiyu
2025-11-12 20:26:49 +08:00
committed by GitHub
parent a2d06118e1
commit f0189292df
6 changed files with 12 additions and 64 deletions
@@ -398,7 +398,7 @@ def get_all_weights_file(model_path: str):
"""
model_path = Path(model_path)
use_safetensors = True
files_list = [file for file in model_path.glob("*.pdparams") if file.name != "scheduler.pdparams"]
files_list = [str(file) for file in model_path.glob("*.pdparams") if file.name != "scheduler.pdparams"]
if len(files_list) > 0:
key_name_list = []
use_safetensors = False
+10 -5
View File
@@ -15,7 +15,12 @@ import time
from typing import Any, Union
import pytest
from model_loader.utils import clean_ports
from e2e.utils.serving_utils import (
FD_API_PORT,
FD_CACHE_QUEUE_PORT,
FD_ENGINE_QUEUE_PORT,
clean_ports,
)
class FDRunner:
@@ -31,10 +36,7 @@ class FDRunner:
) -> None:
from fastdeploy.entrypoints.llm import LLM
ports_to_clean = []
port_keys = ["engine_worker_queue_port", "cache_queue_port", "port", "metrics_port"]
ports_to_clean.extend(kwargs[k] for k in port_keys if k in kwargs)
clean_ports(ports_to_clean)
clean_ports()
time.sleep(10)
graph_optimization_config = {"use_cudagraph": False}
self.llm = LLM(
@@ -46,6 +48,9 @@ class FDRunner:
quantization=quantization,
max_num_batched_tokens=max_model_len,
graph_optimization_config=graph_optimization_config,
port=FD_API_PORT,
cache_queue_port=FD_CACHE_QUEUE_PORT,
engine_worker_queue_port=FD_ENGINE_QUEUE_PORT,
**kwargs,
)
+1 -7
View File
@@ -31,9 +31,6 @@ from tests.model_loader.utils import (
run_with_timeout,
)
FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
prompts = ["解释下“温故而知新", "Hello, how are you?"]
@@ -48,6 +45,7 @@ model_param_map = {
],
"max_num_seqs": 1,
"graph_optimization_config": {"use_cudagraph": False},
"env": {"FD_USE_MACHETE": "0"},
}
}
@@ -102,9 +100,7 @@ def test_model_cache(
max_tokens,
quantization,
"default_v1",
FD_ENGINE_QUEUE_PORT,
prompts,
FD_CACHE_QUEUE_PORT,
),
)
@@ -123,9 +119,7 @@ def test_model_cache(
max_tokens,
quantization,
"default_v1",
FD_ENGINE_QUEUE_PORT,
prompts,
FD_CACHE_QUEUE_PORT,
),
)
check_tokens_id_and_text_close(
-6
View File
@@ -30,10 +30,6 @@ from tests.model_loader.utils import (
run_with_timeout,
)
FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
model_param_map = {
"Qwen3-30B-A3B-FP8": {
"tensor_parallel_size": 2,
@@ -102,8 +98,6 @@ def test_offline_model(
max_tokens,
quantization,
"default_v1",
FD_ENGINE_QUEUE_PORT,
prompts,
FD_CACHE_QUEUE_PORT,
),
)
-5
View File
@@ -31,9 +31,6 @@ from tests.model_loader.utils import (
run_with_timeout,
)
FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
prompts = ["北京天安门在哪里?"]
@@ -134,9 +131,7 @@ def test_model_against_baseline(
max_tokens,
quantization,
"default_v1",
FD_ENGINE_QUEUE_PORT,
prompts,
FD_CACHE_QUEUE_PORT,
),
)
-40
View File
@@ -85,9 +85,7 @@ def form_model_get_output_topp0(
max_tokens,
quantization,
load_choices,
engine_worker_queue_port,
prompts,
cache_queue_port,
result_queue,
):
try:
@@ -98,8 +96,6 @@ def form_model_get_output_topp0(
max_model_len=max_model_len,
load_choices=load_choices,
quantization=quantization,
engine_worker_queue_port=engine_worker_queue_port,
cache_queue_port=cache_queue_port,
) as fd_model:
fd_outputs = fd_model.generate_topp0(prompts, max_tokens=max_tokens)
result_queue.put(fd_outputs)
@@ -109,42 +105,6 @@ def form_model_get_output_topp0(
pytest.fail(f"Failed to initialize LLM model from {model_path}")
def form_model_get_output_topp1(
    fd_runner,
    model_path,
    tensor_parallel_size,
    max_num_seqs,
    max_model_len,
    max_tokens,
    quantization,
    load_choices,
    engine_worker_queue_port,
    prompts,
    cache_queue_port,
    result_queue,
):
    """Generate outputs for ``prompts`` with temperature=0 / top_p=1 and queue them.

    ``fd_runner`` is called with ``model_path`` plus the parallelism, length,
    loader, quantization and port settings; the object it returns is used as a
    context manager. The value returned by its ``generate`` call is put on
    ``result_queue``.

    Any ``Exception`` raised while the runner is being built or while it is
    generating is printed together with its traceback and then reported
    through ``pytest.fail``.
    """
    # Everything except max_tokens configures the runner itself; max_tokens
    # only feeds the sampling parameters below.
    runner_options = {
        "tensor_parallel_size": tensor_parallel_size,
        "max_num_seqs": max_num_seqs,
        "max_model_len": max_model_len,
        "load_choices": load_choices,
        "quantization": quantization,
        "engine_worker_queue_port": engine_worker_queue_port,
        "cache_queue_port": cache_queue_port,
    }

    try:
        with fd_runner(model_path, **runner_options) as runner:
            # Kept inside the context, as in the original, so the import only
            # happens once the runner has been entered.
            from fastdeploy.engine.sampling_params import SamplingParams

            outputs = runner.generate(
                prompts,
                SamplingParams(temperature=0, top_p=1, max_tokens=max_tokens),
            )
            result_queue.put(outputs)
    except Exception:
        print(f"Failed using {load_choices} loader to load model from {model_path}.")
        traceback.print_exc()
        pytest.fail(f"Failed to initialize LLM model from {model_path}")
def kill_process_on_port(port: int):
"""
Kill processes that are listening on the given port.