From f0189292df5692d02b30c97cf650e86555b3b89b Mon Sep 17 00:00:00 2001
From: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
Date: Wed, 12 Nov 2025 20:26:49 +0800
Subject: [PATCH] [CI] fix test_model_cache (#4982)

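* ci

* update

Summary of the squashed changes:

- get_all_weights_file: build the list of matched *.pdparams files as
  str paths instead of Path objects.
- tests/conftest.py: import clean_ports together with FD_API_PORT,
  FD_CACHE_QUEUE_PORT and FD_ENGINE_QUEUE_PORT from
  e2e.utils.serving_utils. FDRunner now calls clean_ports() without
  arguments and passes these three ports to LLM explicitly, ahead of
  **kwargs.
- tests/model_loader: drop the per-file FD_ENGINE_QUEUE_PORT and
  FD_CACHE_QUEUE_PORT constants, remove the engine_worker_queue_port and
  cache_queue_port parameters from form_model_get_output_topp0 and its
  call sites, and delete form_model_get_output_topp1.
- tests/model_loader/test_model_cache.py: add
  "env": {"FD_USE_MACHETE": "0"} to the model_param_map entry.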
---
 .../model_executor/load_weight_utils.py       |  2 +-
 tests/conftest.py                             | 15 ++++---
 tests/model_loader/test_model_cache.py        |  8 +---
 tests/model_loader/test_offline_model.py      |  6 ---
 tests/model_loader/test_torch_model.py        |  5 ---
 tests/model_loader/utils.py                   | 40 -------------------
 6 files changed, 12 insertions(+), 64 deletions(-)

diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py
index f2cf9ba1be..1071313385 100644
--- a/fastdeploy/model_executor/load_weight_utils.py
+++ b/fastdeploy/model_executor/load_weight_utils.py
@@ -398,7 +398,7 @@ def get_all_weights_file(model_path: str):
     """
     model_path = Path(model_path)
     use_safetensors = True
-    files_list = [file for file in model_path.glob("*.pdparams") if file.name != "scheduler.pdparams"]
+    files_list = [str(file) for file in model_path.glob("*.pdparams") if file.name != "scheduler.pdparams"]
     if len(files_list) > 0:
         key_name_list = []
         use_safetensors = False
diff --git a/tests/conftest.py b/tests/conftest.py
index 99536c2d97..22c71ca862 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -15,7 +15,12 @@ import time
 from typing import Any, Union
 
 import pytest
-from model_loader.utils import clean_ports
+from e2e.utils.serving_utils import (
+    FD_API_PORT,
+    FD_CACHE_QUEUE_PORT,
+    FD_ENGINE_QUEUE_PORT,
+    clean_ports,
+)
 
 
 class FDRunner:
@@ -31,10 +36,7 @@ class FDRunner:
     ) -> None:
         from fastdeploy.entrypoints.llm import LLM
 
-        ports_to_clean = []
-        port_keys = ["engine_worker_queue_port", "cache_queue_port", "port", "metrics_port"]
-        ports_to_clean.extend(kwargs[k] for k in port_keys if k in kwargs)
-        clean_ports(ports_to_clean)
+        clean_ports()
         time.sleep(10)
         graph_optimization_config = {"use_cudagraph": False}
         self.llm = LLM(
@@ -46,6 +48,9 @@ class FDRunner:
             quantization=quantization,
             max_num_batched_tokens=max_model_len,
             graph_optimization_config=graph_optimization_config,
+            port=FD_API_PORT,
+            cache_queue_port=FD_CACHE_QUEUE_PORT,
+            engine_worker_queue_port=FD_ENGINE_QUEUE_PORT,
             **kwargs,
         )
 
diff --git a/tests/model_loader/test_model_cache.py b/tests/model_loader/test_model_cache.py
index ff924a6d06..9a37e52749 100644
--- a/tests/model_loader/test_model_cache.py
+++ b/tests/model_loader/test_model_cache.py
@@ -31,9 +31,6 @@ from tests.model_loader.utils import (
     run_with_timeout,
 )
 
-FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
-FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
-
 prompts = ["解释下“温故而知新", "Hello, how are you?"]
 
 
@@ -48,6 +45,7 @@ model_param_map = {
         ],
         "max_num_seqs": 1,
         "graph_optimization_config": {"use_cudagraph": False},
+        "env": {"FD_USE_MACHETE": "0"},
     }
 }
 
@@ -102,9 +100,7 @@ def test_model_cache(
             max_tokens,
             quantization,
             "default_v1",
-            FD_ENGINE_QUEUE_PORT,
             prompts,
-            FD_CACHE_QUEUE_PORT,
         ),
     )
 
@@ -123,9 +119,7 @@ def test_model_cache(
             max_tokens,
             quantization,
             "default_v1",
-            FD_ENGINE_QUEUE_PORT,
             prompts,
-            FD_CACHE_QUEUE_PORT,
         ),
     )
     check_tokens_id_and_text_close(
diff --git a/tests/model_loader/test_offline_model.py b/tests/model_loader/test_offline_model.py
index 25373da435..ea67fadf15 100644
--- a/tests/model_loader/test_offline_model.py
+++ b/tests/model_loader/test_offline_model.py
@@ -30,10 +30,6 @@ from tests.model_loader.utils import (
     run_with_timeout,
 )
 
-FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
-FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
-
-
 model_param_map = {
     "Qwen3-30B-A3B-FP8": {
         "tensor_parallel_size": 2,
@@ -102,8 +98,6 @@ def test_offline_model(
             max_tokens,
             quantization,
             "default_v1",
-            FD_ENGINE_QUEUE_PORT,
             prompts,
-            FD_CACHE_QUEUE_PORT,
         ),
     )
diff --git a/tests/model_loader/test_torch_model.py b/tests/model_loader/test_torch_model.py
index e10c1376d8..bc8252a442 100644
--- a/tests/model_loader/test_torch_model.py
+++ b/tests/model_loader/test_torch_model.py
@@ -31,9 +31,6 @@ from tests.model_loader.utils import (
     run_with_timeout,
 )
 
-FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
-FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
-
 prompts = ["北京天安门在哪里?"]
 
 
@@ -134,9 +131,7 @@ def test_model_against_baseline(
             max_tokens,
             quantization,
             "default_v1",
-            FD_ENGINE_QUEUE_PORT,
             prompts,
-            FD_CACHE_QUEUE_PORT,
         ),
     )
 
diff --git a/tests/model_loader/utils.py b/tests/model_loader/utils.py
index 18672e54ff..9a41422fe9 100644
--- a/tests/model_loader/utils.py
+++ b/tests/model_loader/utils.py
@@ -85,9 +85,7 @@ def form_model_get_output_topp0(
     max_tokens,
     quantization,
     load_choices,
-    engine_worker_queue_port,
     prompts,
-    cache_queue_port,
     result_queue,
 ):
     try:
@@ -98,8 +96,6 @@ def form_model_get_output_topp0(
             max_model_len=max_model_len,
             load_choices=load_choices,
             quantization=quantization,
-            engine_worker_queue_port=engine_worker_queue_port,
-            cache_queue_port=cache_queue_port,
         ) as fd_model:
             fd_outputs = fd_model.generate_topp0(prompts, max_tokens=max_tokens)
             result_queue.put(fd_outputs)
@@ -109,42 +105,6 @@ def form_model_get_output_topp0(
         pytest.fail(f"Failed to initialize LLM model from {model_path}")
 
 
-def form_model_get_output_topp1(
-    fd_runner,
-    model_path,
-    tensor_parallel_size,
-    max_num_seqs,
-    max_model_len,
-    max_tokens,
-    quantization,
-    load_choices,
-    engine_worker_queue_port,
-    prompts,
-    cache_queue_port,
-    result_queue,
-):
-    try:
-        with fd_runner(
-            model_path,
-            tensor_parallel_size=tensor_parallel_size,
-            max_num_seqs=max_num_seqs,
-            max_model_len=max_model_len,
-            load_choices=load_choices,
-            quantization=quantization,
-            engine_worker_queue_port=engine_worker_queue_port,
-            cache_queue_port=cache_queue_port,
-        ) as fd_model:
-            from fastdeploy.engine.sampling_params import SamplingParams
-
-            sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=max_tokens)
-            fd_outputs = fd_model.generate(prompts, sampling_params)
-            result_queue.put(fd_outputs)
-    except Exception:
-        print(f"Failed using {load_choices} loader to load model from {model_path}.")
-        traceback.print_exc()
-        pytest.fail(f"Failed to initialize LLM model from {model_path}")
-
-
 def kill_process_on_port(port: int):
     """
     Kill processes that are listening on the given port.