[CI] fix test_model_cache (#4982)

* ci
* update
2026-04-23 00:17:25 +08:00 · 2025-11-12 20:26:49 +08:00
parent a2d06118e1
commit f0189292df
6 changed files with 12 additions and 64 deletions
@@ -398,7 +398,7 @@ def get_all_weights_file(model_path: str):
    """
    model_path = Path(model_path)
    use_safetensors = True
-    files_list = [file for file in model_path.glob("*.pdparams") if file.name != "scheduler.pdparams"]
+    files_list = [str(file) for file in model_path.glob("*.pdparams") if file.name != "scheduler.pdparams"]
    if len(files_list) > 0:
        key_name_list = []
        use_safetensors = False
@@ -15,7 +15,12 @@ import time
 from typing import Any, Union

 import pytest
-from model_loader.utils import clean_ports
+from e2e.utils.serving_utils import (
+    FD_API_PORT,
+    FD_CACHE_QUEUE_PORT,
+    FD_ENGINE_QUEUE_PORT,
+    clean_ports,
+)


 class FDRunner:
@@ -31,10 +36,7 @@ class FDRunner:
    ) -> None:
        from fastdeploy.entrypoints.llm import LLM

-        ports_to_clean = []
-        port_keys = ["engine_worker_queue_port", "cache_queue_port", "port", "metrics_port"]
-        ports_to_clean.extend(kwargs[k] for k in port_keys if k in kwargs)
-        clean_ports(ports_to_clean)
+        clean_ports()
        time.sleep(10)
        graph_optimization_config = {"use_cudagraph": False}
        self.llm = LLM(
@@ -46,6 +48,9 @@ class FDRunner:
            quantization=quantization,
            max_num_batched_tokens=max_model_len,
            graph_optimization_config=graph_optimization_config,
+            port=FD_API_PORT,
+            cache_queue_port=FD_CACHE_QUEUE_PORT,
+            engine_worker_queue_port=FD_ENGINE_QUEUE_PORT,
            **kwargs,
        )

@@ -31,9 +31,6 @@ from tests.model_loader.utils import (
    run_with_timeout,
 )

-FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
-FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
-
 prompts = ["解释下“温故而知新", "Hello, how are you?"]


@@ -48,6 +45,7 @@ model_param_map = {
        ],
        "max_num_seqs": 1,
        "graph_optimization_config": {"use_cudagraph": False},
+        "env": {"FD_USE_MACHETE": "0"},
    }
 }

@@ -102,9 +100,7 @@ def test_model_cache(
            max_tokens,
            quantization,
            "default_v1",
-            FD_ENGINE_QUEUE_PORT,
            prompts,
-            FD_CACHE_QUEUE_PORT,
        ),
    )

@@ -123,9 +119,7 @@ def test_model_cache(
            max_tokens,
            quantization,
            "default_v1",
-            FD_ENGINE_QUEUE_PORT,
            prompts,
-            FD_CACHE_QUEUE_PORT,
        ),
    )
    check_tokens_id_and_text_close(
@@ -30,10 +30,6 @@ from tests.model_loader.utils import (
    run_with_timeout,
 )

-FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
-FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
-
-
 model_param_map = {
    "Qwen3-30B-A3B-FP8": {
        "tensor_parallel_size": 2,
@@ -102,8 +98,6 @@ def test_offline_model(
            max_tokens,
            quantization,
            "default_v1",
-            FD_ENGINE_QUEUE_PORT,
            prompts,
-            FD_CACHE_QUEUE_PORT,
        ),
    )
@@ -31,9 +31,6 @@ from tests.model_loader.utils import (
    run_with_timeout,
 )

-FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
-FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
-
 prompts = ["北京天安门在哪里?"]


@@ -134,9 +131,7 @@ def test_model_against_baseline(
            max_tokens,
            quantization,
            "default_v1",
-            FD_ENGINE_QUEUE_PORT,
            prompts,
-            FD_CACHE_QUEUE_PORT,
        ),
    )

@@ -85,9 +85,7 @@ def form_model_get_output_topp0(
    max_tokens,
    quantization,
    load_choices,
-    engine_worker_queue_port,
    prompts,
-    cache_queue_port,
    result_queue,
 ):
    try:
@@ -98,8 +96,6 @@ def form_model_get_output_topp0(
            max_model_len=max_model_len,
            load_choices=load_choices,
            quantization=quantization,
-            engine_worker_queue_port=engine_worker_queue_port,
-            cache_queue_port=cache_queue_port,
        ) as fd_model:
            fd_outputs = fd_model.generate_topp0(prompts, max_tokens=max_tokens)
            result_queue.put(fd_outputs)
@@ -109,42 +105,6 @@ def form_model_get_output_topp0(
        pytest.fail(f"Failed to initialize LLM model from {model_path}")


-def form_model_get_output_topp1(
-    fd_runner,
-    model_path,
-    tensor_parallel_size,
-    max_num_seqs,
-    max_model_len,
-    max_tokens,
-    quantization,
-    load_choices,
-    engine_worker_queue_port,
-    prompts,
-    cache_queue_port,
-    result_queue,
-):
-    try:
-        with fd_runner(
-            model_path,
-            tensor_parallel_size=tensor_parallel_size,
-            max_num_seqs=max_num_seqs,
-            max_model_len=max_model_len,
-            load_choices=load_choices,
-            quantization=quantization,
-            engine_worker_queue_port=engine_worker_queue_port,
-            cache_queue_port=cache_queue_port,
-        ) as fd_model:
-            from fastdeploy.engine.sampling_params import SamplingParams
-
-            sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=max_tokens)
-            fd_outputs = fd_model.generate(prompts, sampling_params)
-            result_queue.put(fd_outputs)
-    except Exception:
-        print(f"Failed using {load_choices} loader to load model from {model_path}.")
-        traceback.print_exc()
-        pytest.fail(f"Failed to initialize LLM model from {model_path}")
-
-
 def kill_process_on_port(port: int):
    """
    Kill processes that are listening on the given port.