mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
@@ -398,7 +398,7 @@ def get_all_weights_file(model_path: str):
|
||||
"""
|
||||
model_path = Path(model_path)
|
||||
use_safetensors = True
|
||||
files_list = [file for file in model_path.glob("*.pdparams") if file.name != "scheduler.pdparams"]
|
||||
files_list = [str(file) for file in model_path.glob("*.pdparams") if file.name != "scheduler.pdparams"]
|
||||
if len(files_list) > 0:
|
||||
key_name_list = []
|
||||
use_safetensors = False
|
||||
|
||||
+10
-5
@@ -15,7 +15,12 @@ import time
|
||||
from typing import Any, Union
|
||||
|
||||
import pytest
|
||||
from model_loader.utils import clean_ports
|
||||
from e2e.utils.serving_utils import (
|
||||
FD_API_PORT,
|
||||
FD_CACHE_QUEUE_PORT,
|
||||
FD_ENGINE_QUEUE_PORT,
|
||||
clean_ports,
|
||||
)
|
||||
|
||||
|
||||
class FDRunner:
|
||||
@@ -31,10 +36,7 @@ class FDRunner:
|
||||
) -> None:
|
||||
from fastdeploy.entrypoints.llm import LLM
|
||||
|
||||
ports_to_clean = []
|
||||
port_keys = ["engine_worker_queue_port", "cache_queue_port", "port", "metrics_port"]
|
||||
ports_to_clean.extend(kwargs[k] for k in port_keys if k in kwargs)
|
||||
clean_ports(ports_to_clean)
|
||||
clean_ports()
|
||||
time.sleep(10)
|
||||
graph_optimization_config = {"use_cudagraph": False}
|
||||
self.llm = LLM(
|
||||
@@ -46,6 +48,9 @@ class FDRunner:
|
||||
quantization=quantization,
|
||||
max_num_batched_tokens=max_model_len,
|
||||
graph_optimization_config=graph_optimization_config,
|
||||
port=FD_API_PORT,
|
||||
cache_queue_port=FD_CACHE_QUEUE_PORT,
|
||||
engine_worker_queue_port=FD_ENGINE_QUEUE_PORT,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -31,9 +31,6 @@ from tests.model_loader.utils import (
|
||||
run_with_timeout,
|
||||
)
|
||||
|
||||
FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
|
||||
FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
|
||||
|
||||
prompts = ["解释下“温故而知新", "Hello, how are you?"]
|
||||
|
||||
|
||||
@@ -48,6 +45,7 @@ model_param_map = {
|
||||
],
|
||||
"max_num_seqs": 1,
|
||||
"graph_optimization_config": {"use_cudagraph": False},
|
||||
"env": {"FD_USE_MACHETE": "0"},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,9 +100,7 @@ def test_model_cache(
|
||||
max_tokens,
|
||||
quantization,
|
||||
"default_v1",
|
||||
FD_ENGINE_QUEUE_PORT,
|
||||
prompts,
|
||||
FD_CACHE_QUEUE_PORT,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -123,9 +119,7 @@ def test_model_cache(
|
||||
max_tokens,
|
||||
quantization,
|
||||
"default_v1",
|
||||
FD_ENGINE_QUEUE_PORT,
|
||||
prompts,
|
||||
FD_CACHE_QUEUE_PORT,
|
||||
),
|
||||
)
|
||||
check_tokens_id_and_text_close(
|
||||
|
||||
@@ -30,10 +30,6 @@ from tests.model_loader.utils import (
|
||||
run_with_timeout,
|
||||
)
|
||||
|
||||
FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
|
||||
FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
|
||||
|
||||
|
||||
model_param_map = {
|
||||
"Qwen3-30B-A3B-FP8": {
|
||||
"tensor_parallel_size": 2,
|
||||
@@ -102,8 +98,6 @@ def test_offline_model(
|
||||
max_tokens,
|
||||
quantization,
|
||||
"default_v1",
|
||||
FD_ENGINE_QUEUE_PORT,
|
||||
prompts,
|
||||
FD_CACHE_QUEUE_PORT,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -31,9 +31,6 @@ from tests.model_loader.utils import (
|
||||
run_with_timeout,
|
||||
)
|
||||
|
||||
FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313))
|
||||
FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
|
||||
|
||||
prompts = ["北京天安门在哪里?"]
|
||||
|
||||
|
||||
@@ -134,9 +131,7 @@ def test_model_against_baseline(
|
||||
max_tokens,
|
||||
quantization,
|
||||
"default_v1",
|
||||
FD_ENGINE_QUEUE_PORT,
|
||||
prompts,
|
||||
FD_CACHE_QUEUE_PORT,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -85,9 +85,7 @@ def form_model_get_output_topp0(
|
||||
max_tokens,
|
||||
quantization,
|
||||
load_choices,
|
||||
engine_worker_queue_port,
|
||||
prompts,
|
||||
cache_queue_port,
|
||||
result_queue,
|
||||
):
|
||||
try:
|
||||
@@ -98,8 +96,6 @@ def form_model_get_output_topp0(
|
||||
max_model_len=max_model_len,
|
||||
load_choices=load_choices,
|
||||
quantization=quantization,
|
||||
engine_worker_queue_port=engine_worker_queue_port,
|
||||
cache_queue_port=cache_queue_port,
|
||||
) as fd_model:
|
||||
fd_outputs = fd_model.generate_topp0(prompts, max_tokens=max_tokens)
|
||||
result_queue.put(fd_outputs)
|
||||
@@ -109,42 +105,6 @@ def form_model_get_output_topp0(
|
||||
pytest.fail(f"Failed to initialize LLM model from {model_path}")
|
||||
|
||||
|
||||
def form_model_get_output_topp1(
|
||||
fd_runner,
|
||||
model_path,
|
||||
tensor_parallel_size,
|
||||
max_num_seqs,
|
||||
max_model_len,
|
||||
max_tokens,
|
||||
quantization,
|
||||
load_choices,
|
||||
engine_worker_queue_port,
|
||||
prompts,
|
||||
cache_queue_port,
|
||||
result_queue,
|
||||
):
|
||||
try:
|
||||
with fd_runner(
|
||||
model_path,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
max_num_seqs=max_num_seqs,
|
||||
max_model_len=max_model_len,
|
||||
load_choices=load_choices,
|
||||
quantization=quantization,
|
||||
engine_worker_queue_port=engine_worker_queue_port,
|
||||
cache_queue_port=cache_queue_port,
|
||||
) as fd_model:
|
||||
from fastdeploy.engine.sampling_params import SamplingParams
|
||||
|
||||
sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=max_tokens)
|
||||
fd_outputs = fd_model.generate(prompts, sampling_params)
|
||||
result_queue.put(fd_outputs)
|
||||
except Exception:
|
||||
print(f"Failed using {load_choices} loader to load model from {model_path}.")
|
||||
traceback.print_exc()
|
||||
pytest.fail(f"Failed to initialize LLM model from {model_path}")
|
||||
|
||||
|
||||
def kill_process_on_port(port: int):
|
||||
"""
|
||||
Kill processes that are listening on the given port.
|
||||
|
||||
Reference in New Issue
Block a user