From 6fd3e72da1d531922dc9b89a14514cd468b91a04 Mon Sep 17 00:00:00 2001 From: qwes5s5 <45442318+qwes5s5@users.noreply.github.com> Date: Fri, 10 Oct 2025 16:17:44 +0800 Subject: [PATCH] [FastDeploy Cli] Bench Command eval and throughput (#4239) * bench command * bench command * bench command * bench command * bench command --------- Co-authored-by: K11OntheBoat --- fastdeploy/benchmarks/__init__.py | 0 .../cli/benchmark => benchmarks}/datasets.py | 86 ++ fastdeploy/benchmarks/latency.py | 137 ++ fastdeploy/benchmarks/lib/__init__.py | 0 .../lib}/endpoint_request_func.py | 0 fastdeploy/benchmarks/lib/utils.py | 90 ++ fastdeploy/benchmarks/serve.py | 1213 +++++++++++++++++ fastdeploy/benchmarks/throughput.py | 464 +++++++ fastdeploy/entrypoints/cli/__init__.py | 6 + fastdeploy/entrypoints/cli/benchmark/eval.py | 416 ++++++ .../entrypoints/cli/benchmark/latency.py | 118 +- fastdeploy/entrypoints/cli/benchmark/serve.py | 1194 +--------------- .../entrypoints/cli/benchmark/throughput.py | 36 + fastdeploy/entrypoints/cli/collect_env.py | 2 +- setup.py | 5 +- .../test_endpoint_request_func_benchmarks.py | 210 +++ tests/benchmarks/lib/test_utils_benchmarks.py | 104 ++ tests/benchmarks/test_datasets_benchmarks.py | 151 ++ tests/benchmarks/test_latency_benchmarks.py | 102 ++ tests/benchmarks/test_serve_benchmarks.py | 397 ++++++ .../benchmarks/test_throughput_benchmarks.py | 485 +++++++ tests/entrypoints/cli/benchmark/test_eval.py | 275 ++++ .../cli/benchmark/test_throughput.py | 57 + .../cli/test_collect_env_conmmand.py | 2 +- 24 files changed, 4237 insertions(+), 1313 deletions(-) create mode 100644 fastdeploy/benchmarks/__init__.py rename fastdeploy/{entrypoints/cli/benchmark => benchmarks}/datasets.py (85%) create mode 100644 fastdeploy/benchmarks/latency.py create mode 100644 fastdeploy/benchmarks/lib/__init__.py rename fastdeploy/{entrypoints/cli/benchmark => benchmarks/lib}/endpoint_request_func.py (100%) create mode 100644 fastdeploy/benchmarks/lib/utils.py create mode 
100644 fastdeploy/benchmarks/serve.py create mode 100644 fastdeploy/benchmarks/throughput.py create mode 100644 fastdeploy/entrypoints/cli/benchmark/eval.py create mode 100644 fastdeploy/entrypoints/cli/benchmark/throughput.py create mode 100644 tests/benchmarks/lib/test_endpoint_request_func_benchmarks.py create mode 100644 tests/benchmarks/lib/test_utils_benchmarks.py create mode 100644 tests/benchmarks/test_datasets_benchmarks.py create mode 100644 tests/benchmarks/test_latency_benchmarks.py create mode 100644 tests/benchmarks/test_serve_benchmarks.py create mode 100644 tests/benchmarks/test_throughput_benchmarks.py create mode 100644 tests/entrypoints/cli/benchmark/test_eval.py create mode 100644 tests/entrypoints/cli/benchmark/test_throughput.py diff --git a/fastdeploy/benchmarks/__init__.py b/fastdeploy/benchmarks/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fastdeploy/entrypoints/cli/benchmark/datasets.py b/fastdeploy/benchmarks/datasets.py similarity index 85% rename from fastdeploy/entrypoints/cli/benchmark/datasets.py rename to fastdeploy/benchmarks/datasets.py index 34529056b0..7892f4748b 100644 --- a/fastdeploy/entrypoints/cli/benchmark/datasets.py +++ b/fastdeploy/benchmarks/datasets.py @@ -28,8 +28,10 @@ from dataclasses import dataclass from io import BytesIO from typing import Any, Optional, Union +import numpy as np from fontTools.feaLib import ast from PIL import Image +from transformers import PreTrainedTokenizerBase from fastdeploy.utils import FlexibleArgumentParser @@ -320,6 +322,90 @@ class EBChatDataset(BenchmarkDataset): return samples +class RandomDataset(BenchmarkDataset): + # Default values copied from benchmark_serving.py for the random dataset. 
+ DEFAULT_PREFIX_LEN = 0 + DEFAULT_RANGE_RATIO = 0.0 + DEFAULT_INPUT_LEN = 1024 + DEFAULT_OUTPUT_LEN = 128 + + def __init__( + self, + **kwargs, + ) -> None: + super().__init__(**kwargs) + random.seed(self.random_seed) + np.random.seed(self.random_seed) + + def sample( + self, + tokenizer: PreTrainedTokenizerBase, + num_requests: int, + prefix_len: int = DEFAULT_PREFIX_LEN, + range_ratio: float = DEFAULT_RANGE_RATIO, + input_len: int = DEFAULT_INPUT_LEN, + output_len: int = DEFAULT_OUTPUT_LEN, + **kwargs, + ) -> list[SampleRequest]: + # Enforce range_ratio < 1 + assert range_ratio < 1.0, "random_range_ratio must be < 1.0 to ensure a valid sampling range" + cnt = 1 + vocab_size = tokenizer.vocab_size + num_special_tokens = tokenizer.num_special_tokens_to_add() + real_input_len = input_len - num_special_tokens + + prefix_token_ids = np.random.randint(0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [] + + # New sampling logic: [X * (1 - b), X * (1 + b)] + input_low = int(real_input_len * (1 - range_ratio)) + input_high = int(real_input_len * (1 + range_ratio)) + output_low = int(output_len * (1 - range_ratio)) + output_high = int(output_len * (1 + range_ratio)) + + # Add logging for debugging + logger.info( + "Sampling input_len from [%s, %s] and output_len from [%s, %s]", + input_low, + input_high, + output_low, + output_high, + ) + + input_lens = np.random.randint(input_low, input_high + 1, size=num_requests) + output_lens = np.random.randint(output_low, output_high + 1, size=num_requests) + offsets = np.random.randint(0, vocab_size, size=num_requests) + + requests = [] + for i in range(num_requests): + inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) % vocab_size).tolist() + token_sequence = prefix_token_ids + inner_seq + prompt = tokenizer.decode(token_sequence) + # After decoding the prompt we have to encode and decode it again. 
+ # This is done because in some cases N consecutive tokens + # give a string tokenized into != N number of tokens. + # For example for GPT2Tokenizer: + # [6880, 6881] -> ['Ġcalls', 'here'] -> + # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere'] + # To avoid uncontrolled change of the prompt length, + # the encoded sequence is truncated before being decode again. + total_input_len = prefix_len + int(input_lens[i]) + re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[:total_input_len] + prompt = tokenizer.decode(re_encoded_sequence) + total_input_len = len(re_encoded_sequence) + requests.append( + SampleRequest( + no=cnt, + prompt=prompt, + prompt_len=total_input_len, + history_QA=[], + json_data=None, + expected_output_len=int(output_lens[i]), + ) + ) + cnt += 1 + return requests + + class _ValidateDatasetArgs(argparse.Action): """Argparse action to validate dataset name and path compatibility.""" diff --git a/fastdeploy/benchmarks/latency.py b/fastdeploy/benchmarks/latency.py new file mode 100644 index 0000000000..e750b225ee --- /dev/null +++ b/fastdeploy/benchmarks/latency.py @@ -0,0 +1,137 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +# This file is modified from https://github.com/vllm-project/vllm/blob/main/vllm/benchmarks/latency.py + +import argparse +import dataclasses +import json +import time + +import numpy as np +from tqdm import tqdm + +import fastdeploy.envs as envs +from fastdeploy.engine.args_utils import EngineArgs + + +def add_cli_args(parser: argparse.ArgumentParser): + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument( + "--n", + type=int, + default=1, + help="Number of generated sequences per prompt.", + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-iters-warmup", + type=int, + default=10, + help="Number of iterations to run for warmup.", + ) + parser.add_argument("--num-iters", type=int, default=30, help="Number of iterations to run.") + parser.add_argument( + "--profile", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the latency results in JSON format.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=("Do not detokenize responses (i.e. do not include " "detokenization time in the latency measurement)"), + ) + + parser = EngineArgs.add_cli_args(parser) + # V1 enables prefix caching by default which skews the latency + # numbers. We need to disable prefix caching by default. + parser.set_defaults(enable_prefix_caching=False) + + +def main(args: argparse.Namespace): + if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: + raise OSError( + "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " + "Please set it to a valid path to use torch profiler." + ) + engine_args = EngineArgs.from_cli_args(args) + + # Lazy import to avoid importing LLM when the bench command is not selected. 
+ from fastdeploy import LLM, SamplingParams + + # NOTE(woosuk): If the request cannot be processed in a single batch, + # the engine will automatically process the request in multiple batches. + llm = LLM(**dataclasses.asdict(engine_args)) + assert llm.llm_engine.cfg.max_model_len >= (args.input_len + args.output_len), ( + "Please ensure that max_model_len is greater than" " the sum of input_len and output_len." + ) + + sampling_params = SamplingParams( + n=args.n, + temperature=1.0, + top_p=1.0, + max_tokens=args.output_len, + ) + dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) + dummy_prompts = [{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()] + + def llm_generate(): + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False, stream=True) + + def run_to_completion(): + start_time = time.perf_counter() + llm_generate() + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + print("Warming up...") + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + run_to_completion() + + if args.profile: + print("Profiling...") + run_to_completion() + return + + # Benchmark. 
+ latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion()) + latencies = np.array(latencies) + percentages = [10, 25, 50, 75, 90, 99] + percentiles = np.percentile(latencies, percentages) + print(f"Avg latency: {np.mean(latencies)} seconds") + for percentage, percentile in zip(percentages, percentiles): + print(f"{percentage}% percentile latency: {percentile} seconds") + + # Output JSON results if specified + if args.output_json: + results = { + "avg_latency": np.mean(latencies), + "latencies": latencies.tolist(), + "percentiles": dict(zip(percentages, percentiles.tolist())), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) diff --git a/fastdeploy/benchmarks/lib/__init__.py b/fastdeploy/benchmarks/lib/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fastdeploy/entrypoints/cli/benchmark/endpoint_request_func.py b/fastdeploy/benchmarks/lib/endpoint_request_func.py similarity index 100% rename from fastdeploy/entrypoints/cli/benchmark/endpoint_request_func.py rename to fastdeploy/benchmarks/lib/endpoint_request_func.py diff --git a/fastdeploy/benchmarks/lib/utils.py b/fastdeploy/benchmarks/lib/utils.py new file mode 100644 index 0000000000..4eba58a3b2 --- /dev/null +++ b/fastdeploy/benchmarks/lib/utils.py @@ -0,0 +1,90 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_utils.py + + +import argparse +import json +import math +import os +from typing import Any + + +def convert_to_pytorch_benchmark_format( + args: argparse.Namespace, + metrics: dict[str, list], + extra_info: dict[str, Any], +) -> list: + """ + Save the benchmark results in the format used by PyTorch OSS benchmark with + on metric per record + https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database + """ + records = [] + if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False): + return records + + for name, benchmark_values in metrics.items(): + record = { + "benchmark": { + "name": "vLLM benchmark", + "extra_info": { + "args": vars(args), + }, + }, + "model": { + "name": args.model, + }, + "metric": { + "name": name, + "benchmark_values": benchmark_values, + "extra_info": extra_info, + }, + } + + tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size") + # Save tensor_parallel_size parameter if it's part of the metadata + if not tp and "tensor_parallel_size" in extra_info: + record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = extra_info["tensor_parallel_size"] + + records.append(record) + + return records + + +class InfEncoder(json.JSONEncoder): + """InfEncoder""" + + def clear_inf(self, o: Any): + """clear_inf""" + if isinstance(o, dict): + return {k: self.clear_inf(v) for k, v in o.items()} + elif isinstance(o, list): + return [self.clear_inf(v) for v in o] + elif isinstance(o, float) and math.isinf(o): + return "inf" + return o + + def iterencode(self, o: Any, *args, **kwargs) -> Any: + """iterencode""" + return super().iterencode(self.clear_inf(o), *args, **kwargs) + + +def write_to_json(filename: str, records: list) -> None: + """write_to_json""" + with open(filename, "w") as f: + json.dump(records, f, cls=InfEncoder) diff --git a/fastdeploy/benchmarks/serve.py 
b/fastdeploy/benchmarks/serve.py new file mode 100644 index 0000000000..65cb739955 --- /dev/null +++ b/fastdeploy/benchmarks/serve.py @@ -0,0 +1,1213 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py + +import argparse +import asyncio +import gc +import json +import math +import os +import random +import time +import warnings +from collections.abc import AsyncGenerator, Iterable +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Optional + +import numpy as np +import yaml +from tqdm.asyncio import tqdm + +from fastdeploy.benchmarks.datasets import ( + SampleRequest, + add_dataset_parser, + get_samples, +) +from fastdeploy.benchmarks.lib.endpoint_request_func import ( + ASYNC_REQUEST_FUNCS, + OPENAI_COMPATIBLE_BACKENDS, + RequestFuncInput, + RequestFuncOutput, +) + +MILLISECONDS_TO_SECONDS_CONVERSION = 1000 + + +@dataclass +class BenchmarkMetrics: + """Class containing all metrics that are used in this script""" + + completed: int + total_input: int + total_output: int + request_throughput: float + request_goodput: float + output_throughput: float + total_token_throughput: float + mean_s_decode: float + median_s_decode: float + std_s_decode: float + percentiles_s_decode: list[tuple[float, float]] + mean_ttft_ms: float + median_ttft_ms: float + 
std_ttft_ms: float + percentiles_ttft_ms: list[tuple[float, float]] + mean_s_ttft_ms: float + median_s_ttft_ms: float + std_s_ttft_ms: float + percentiles_s_ttft_ms: list[tuple[float, float]] + mean_tpot_ms: float + median_tpot_ms: float + std_tpot_ms: float + percentiles_tpot_ms: list[tuple[float, float]] + mean_itl_ms: float + median_itl_ms: float + std_itl_ms: float + percentiles_itl_ms: list[tuple[float, float]] + mean_s_itl_ms: float + median_s_itl_ms: float + std_s_itl_ms: float + percentiles_s_itl_ms: list[tuple[float, float]] + # E2EL stands for end-to-end latency per request. + # It is the time taken on the client side from sending + # a request to receiving a complete response. + mean_e2el_ms: float + median_e2el_ms: float + std_e2el_ms: float + percentiles_e2el_ms: list[tuple[float, float]] + mean_s_e2el_ms: float + median_s_e2el_ms: float + std_s_e2el_ms: float + percentiles_s_e2el_ms: list[tuple[float, float]] + mean_input_len: float + median_input_len: float + std_input_len: float + percentiles_input_len: list[tuple[float, float]] + mean_s_input_len: float + median_s_input_len: float + std_s_input_len: float + percentiles_s_input_len: list[tuple[float, float]] + mean_output_len: float + median_output_len: float + std_output_len: float + percentiles_output_len: list[tuple[float, float]] + + +def add_cli_args(parser: argparse.ArgumentParser): + add_dataset_parser(parser) + parser.add_argument( + "--label", + type=str, + default=None, + help="The label (prefix) of the benchmark results. 
If not specified, " + "the endpoint type will be used as the label.", + ) + parser.add_argument( + "--backend", + type=str, + default="openai-chat", + choices=list(ASYNC_REQUEST_FUNCS.keys()), + ) + parser.add_argument( + "--base-url", + type=str, + default=None, + help="Server or API base url if not using http host and port.", + ) + # Use 127.0.0.1 here instead of localhost to force the use of ipv4 + parser.add_argument("--host", type=str, default="127.0.0.1") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--endpoint", + type=str, + default="/v1/chat/completions", + help="API endpoint.", + ) + parser.add_argument( + "--header", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g, --header x-additional-info=0.3.3) " + "for headers to be passed with each request. These headers override " + "per backend constants and values set via environment variable, and " + "will be overriden by other arguments (such as request ids).", + ) + parser.add_argument( + "--max-concurrency", + type=int, + default=None, + help="Maximum number of concurrent requests. This can be used " + "to help simulate an environment where a higher level component " + "is enforcing a maximum number of concurrent requests. While the " + "--request-rate argument controls the rate at which requests are " + "initiated, this argument will control how many are actually allowed " + "to execute at a time. 
This means that when used in combination, the " + "actual request rate may be lower than specified with --request-rate, " + "if the server is not processing requests fast enough to keep up.", + ) + + parser.add_argument( + "--model", + type=str, + required=True, + help="Name of the model.", + ) + parser.add_argument( + "--tokenizer", + type=str, + help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--logprobs", + type=int, + default=None, + help=( + "Number of logprobs-per-token to compute & return as part of " + "the request. If unspecified, then either (1) if beam search " + "is disabled, no logprobs are computed & a single dummy " + "logprob is returned for each token; or (2) if beam search " + "is enabled 1 logprob per token is computed" + ), + ) + parser.add_argument( + "--request-rate", + type=float, + default=float("inf"), + help="Number of requests per second. If this is inf, " + "then all the requests are sent at time 0. " + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. " + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. 
A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", + ) + parser.add_argument( + "--trust-remote-code", + action="store_true", + help="Trust remote code from huggingface", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Specify to disable tqdm progress bar.", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Use Torch Profiler. The endpoint must be launched with " "VLLM_TORCH_PROFILER_DIR to enable profiler.", + ) + parser.add_argument( + "--save-result", + action="store_true", + help="Specify to save benchmark results to a json file", + ) + parser.add_argument( + "--save-detailed", + action="store_true", + help="When saving the results, whether to include per request " + "information such as response, error, ttfs, tpots, etc.", + ) + parser.add_argument( + "--append-result", + action="store_true", + help="Append the benchmark result to the existing json file.", + ) + parser.add_argument( + "--metadata", + metavar="KEY=VALUE", + nargs="*", + help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " + "for metadata of this run to be saved in the result JSON file " + "for record keeping purposes.", + ) + parser.add_argument( + "--result-dir", + type=str, + default=None, + help="Specify directory to save benchmark json results." + "If not specified, results are saved in the current directory.", + ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{label}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" # noqa + " format.", + ) + parser.add_argument( + "--ignore-eos", + action="store_true", + help="Set ignore_eos flag when sending the benchmark request." 
+ "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", + ) + parser.add_argument( + "--percentile-metrics", + type=str, + default="ttft,tpot,itl", + help="Comma-separated list of selected metrics to report percentils. " + "This argument specifies the metrics to report percentiles. " + 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ', + ) + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-separated list of percentiles for selected metrics. " + 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' + 'Default value is "99".' + 'Use "--percentile-metrics" to select metrics.', + ) + parser.add_argument( + "--goodput", + nargs="+", + required=False, + help='Specify service level objectives for goodput as "KEY:VALUE" ' + "pairs, where the key is a metric name, and the value is in " + 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' + "separated by spaces. Allowed request level metric names are " + '"ttft", "tpot", "e2el". For more context on the definition of ' + "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) + parser.add_argument( + "--request-id-prefix", + type=str, + required=False, + default="benchmark-serving", + help="Specify the prefix of request id.", + ) + + sampling_group = parser.add_argument_group("sampling parameters") + sampling_group.add_argument( + "--top-p", + type=float, + default=None, + help="Top-p sampling parameter. Only has effect on " "openai-compatible backends.", + ) + sampling_group.add_argument( + "--top-k", + type=int, + default=None, + help="Top-k sampling parameter. Only has effect on " "openai-compatible backends.", + ) + sampling_group.add_argument( + "--min-p", + type=float, + default=None, + help="Min-p sampling parameter. 
Only has effect on " "openai-compatible backends.", + ) + sampling_group.add_argument( + "--temperature", + type=float, + default=None, + help="Temperature sampling parameter. Only has effect on " + "openai-compatible backends. If not specified, default to greedy " + "decoding (i.e. temperature==0.0).", + ) + parser.add_argument( + "--debug", + action="store_true", + help="print debug information (output)", + ) + parser.add_argument( + "--tokenizer-mode", + type=str, + default="auto", + choices=["auto", "slow", "mistral", "custom"], + help='The tokenizer mode.\n\n* "auto" will use the ' + 'fast tokenizer if available.\n* "slow" will ' + "always use the slow tokenizer. \n* " + '"mistral" will always use the `mistral_common` tokenizer. \n*' + '"custom" will use --tokenizer to select the preregistered tokenizer.', + ) + parser.add_argument( + "--shuffle", + action="store_true", + help="shuffle dataset", + ) + parser.add_argument( + "--hyperparameter-path", + type=str, + default=None, + help="Path to the hyperparameter. ", + ) + + parser.add_argument( + "--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ", + ) + + parser.add_argument( + "--lora-modules", + nargs="+", + default=None, + help="A subset of LoRA module names passed in when " + "launching the server. For each request, the " + "script chooses a LoRA module at random.", + ) + + parser.add_argument( + "--ramp-up-strategy", + type=str, + default=None, + choices=["linear", "exponential"], + help="The ramp-up strategy. This would be used to " + "ramp up the request rate from initial RPS to final " + "RPS rate (specified by --ramp-up-start-rps and " + "--ramp-up-end-rps.) over the duration of the benchmark.", + ) + parser.add_argument( + "--ramp-up-start-rps", + type=int, + default=None, + help="The starting request rate for ramp-up (RPS). 
" "Needs to be specified when --ramp-up-strategy is used.", + ) + parser.add_argument( + "--ramp-up-end-rps", + type=int, + default=None, + help="The ending request rate for ramp-up (RPS). " "Needs to be specified when --ramp-up-strategy is used.", + ) + parser.add_argument( + "--ready-check-timeout-sec", + type=int, + default=600, + help="Maximum time to wait for the endpoint to become ready " + "in seconds (default: 600 seconds / 10 minutes).", + ) + + +async def get_request( + input_requests: list[SampleRequest], + request_rate: float, + burstiness: float = 1.0, +) -> AsyncGenerator[SampleRequest, None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness. + + Args: + input_requests: + A list of input requests, each represented as a SampleRequest. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + """ + input_requests: Iterable[SampleRequest] = iter(input_requests) + + # Calculate scale parameter theta to maintain the desired request_rate. + assert burstiness > 0, f"A positive burstiness factor is expected, but given {burstiness}." + theta = 1.0 / (request_rate * burstiness) + + for request in input_requests: + yield request + + if request_rate == float("inf"): + # If the request rate is infinity, then we don't need to wait. + continue + + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + interval = np.random.gamma(shape=burstiness, scale=theta) + # The next request will be sent after the interval. 
+ await asyncio.sleep(interval) + + +def calculate_metrics( + input_requests: list[SampleRequest], + outputs: list[RequestFuncOutput], + dur_s: float, + selected_percentiles: list[float], + goodput_config_dict: dict[str, float], +) -> tuple[BenchmarkMetrics, list[int]]: + """Calculates various performance metrics based on the inputs and outputs.""" + input_lens: list[int] = [] + infer_input_lens: list[int] = [] # 推理侧输入token数 + actual_output_lens: list[int] = [] + total_input = 0 + completed = 0 + good_completed = 0 + itls: list[float] = [] + s_itls: list[float] = [] + tpots: list[float] = [] + all_tpots: list[float] = [] + ttfts: list[float] = [] + s_ttfts: list[float] = [] + e2els: list[float] = [] + s_e2els: list[float] = [] + s_decodes: list[float] = [] + for i in range(len(outputs)): + if outputs[i].success: + output_len = outputs[i].output_tokens + + if not output_len: + print("no output_len") + # We use the tokenizer to count the number of output tokens + # for some serving backends instead of looking at + # len(outputs[i].itl) since multiple output tokens may be + # bundled together + # Note : this may inflate the output token count slightly + continue + + actual_output_lens.append(output_len) + input_lens.append(outputs[i].prompt_len) + infer_input_lens.append(outputs[i].prompt_tokens) + total_input += outputs[i].prompt_tokens + tpot = 0 + if output_len > 1: + latency_minus_ttft = outputs[i].latency - outputs[i].ttft + tpot = latency_minus_ttft / (output_len - 1) + tpots.append(tpot) + # Note: if output_len <= 1, we regard tpot as 0 for goodput + all_tpots.append(tpot) + itls += outputs[i].itl + # 推理侧ITL + s_a = outputs[i].arrival_time[1:] + for j in range(len(s_a) - 2): + s_itls.append(s_a[j + 1] - s_a[j]) + ttfts.append(outputs[i].ttft) + # 推理侧TTFT + s_ttfts.append(outputs[i].arrival_time[1]) + e2els.append(outputs[i].latency) + # 推理侧整句时延 + s_e2els.append(outputs[i].arrival_time[-1]) + # 解码速度去掉首token + if len(outputs[i].arrival_time) > 2: + 
s_decodes.append( + (outputs[i].output_tokens - 1) / (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1]) + ) + else: + print("len(outputs[i].arrival_time) <= 2") + completed += 1 + else: + actual_output_lens.append(0) + input_lens.append(0) + infer_input_lens.append(0) + + if goodput_config_dict: + valid_metrics = [] + slo_values = [] + + if "ttft" in goodput_config_dict: + valid_metrics.append(ttfts) + slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION) + if "tpot" in goodput_config_dict: + valid_metrics.append(all_tpots) + slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION) + if "e2el" in goodput_config_dict: + valid_metrics.append(e2els) + slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION) + + for req_metric in zip(*valid_metrics): + is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) + if is_good_req: + good_completed += 1 + + if completed == 0: + warnings.warn( + "All requests failed. 
This is likely due to a misconfiguration " "on the benchmark arguments.", + stacklevel=2, + ) + metrics = BenchmarkMetrics( + completed=completed, + total_input=total_input, + total_output=sum(actual_output_lens), + request_throughput=completed / dur_s, + request_goodput=good_completed / dur_s, + output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, + mean_s_decode=np.mean(s_decodes or 0) * 1, # ttfts is empty if streaming is not supported by backend + std_s_decode=np.std(s_decodes or 0) * 1, + median_s_decode=np.median(s_decodes or 0) * 1, + percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1) for p in selected_percentiles], + mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend + std_ttft_ms=np.std(ttfts or 0) * 1000, + median_ttft_ms=np.median(ttfts or 0) * 1000, + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles], + mean_s_ttft_ms=np.mean(s_ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend + std_s_ttft_ms=np.std(s_ttfts or 0) * 1000, + median_s_ttft_ms=np.median(s_ttfts or 0) * 1000, + percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000) for p in selected_percentiles], + mean_tpot_ms=np.mean(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, + median_tpot_ms=np.median(tpots or 0) * 1000, + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles], + mean_itl_ms=np.mean(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, + median_itl_ms=np.median(itls or 0) * 1000, + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles], + mean_s_itl_ms=np.mean(s_itls or 0) * 1000, + std_s_itl_ms=np.std(s_itls or 0) * 1000, + median_s_itl_ms=np.median(s_itls or 0) * 1000, + percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000) for p in selected_percentiles], + 
async def benchmark(
    backend: str,
    api_url: str,
    base_url: str,
    model_id: str,
    model_name: str,
    input_requests: list[SampleRequest],
    hyper_parameters: dict,
    logprobs: Optional[int],
    request_rate: float,
    burstiness: float,
    disable_tqdm: bool,
    profile: bool,
    selected_percentile_metrics: list[str],
    selected_percentiles: list[float],
    ignore_eos: bool,
    debug: bool,
    goodput_config_dict: dict[str, float],
    max_concurrency: Optional[int],
    lora_modules: Optional[Iterable[str]],
    extra_body: Optional[dict],
):
    """Benchmark an OpenAI-compatible serving endpoint with the given sample requests.

    Runs one warm-up request first (failing fast on misconfiguration), then
    issues all ``input_requests`` according to ``request_rate``/``burstiness``,
    optionally bounded by ``max_concurrency``, and finally aggregates latency
    and throughput statistics.

    Returns:
        dict: summary metrics plus per-request raw data (ttfts, itls, texts, errors).

    Raises:
        ValueError: if ``backend`` is unknown or the initial test request fails.
    """
    if backend in ASYNC_REQUEST_FUNCS:
        request_func = ASYNC_REQUEST_FUNCS[backend]
    else:
        raise ValueError(f"Unknown backend: {backend}")

    # Warm-up: send a single request so configuration errors surface before
    # the timed run starts.
    print("Starting initial single prompt test run...")
    test_prompt, test_output_len, test_no = (
        input_requests[0].prompt,
        input_requests[0].expected_output_len,
        input_requests[0].no,
    )
    test_history_QA = input_requests[0].history_QA

    test_input = RequestFuncInput(
        model=model_id,
        model_name=model_name,
        prompt=test_prompt,
        no=test_no,
        prompt_len=0,
        history_QA=test_history_QA,
        hyper_parameters=hyper_parameters,
        api_url=api_url,
        output_len=test_output_len,
        logprobs=logprobs,
        ignore_eos=ignore_eos,
        debug=debug,
        extra_body=extra_body,
    )

    print("test_input:", test_input)

    test_output = await request_func(request_func_input=test_input)

    print("test_output:", test_output)

    if not test_output.success:
        raise ValueError(
            f"Initial test run failed - Please make sure that 1. benchmark arguments are correctly specified and 2. the http_proxy and https_proxy are turned off. Error: {test_output.error}"
        )
    else:
        print("Initial test run completed. Starting main benchmark run...")

    if lora_modules:
        # For each input request, choose a LoRA module at random.
        lora_modules = iter([random.choice(lora_modules) for _ in range(len(input_requests))])

    if profile:
        print("Starting profiler...")
        profile_input = RequestFuncInput(
            model=model_id,
            model_name=model_name,
            prompt=test_prompt,
            no=test_no,
            api_url=base_url + "/start_profile",
            output_len=test_output_len,
            logprobs=logprobs,
            ignore_eos=ignore_eos,
            extra_body=extra_body,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler started")

    # burstiness == 1.0 degenerates the Gamma inter-arrival distribution to
    # exponential, i.e. a Poisson arrival process.
    if burstiness == 1.0:
        distribution = "Poisson process"
    else:
        distribution = "Gamma distribution"

    print(f"Traffic request rate: {request_rate}")
    print(f"Burstiness factor: {burstiness} ({distribution})")
    print(f"Maximum request concurrency: {max_concurrency}")

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

    # This can be used once the minimum Python version is 3.10 or higher,
    # and it will simplify the code in limited_request_func.
    # semaphore = (asyncio.Semaphore(max_concurrency)
    #              if max_concurrency else contextlib.nullcontext())
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited_request_func(request_func_input, pbar):
        # Wrap the request in the optional concurrency semaphore.
        if semaphore is None:
            return await request_func(request_func_input=request_func_input, pbar=pbar)
        async with semaphore:
            return await request_func(request_func_input=request_func_input, pbar=pbar)

    benchmark_start_time = time.perf_counter()
    tasks: list[asyncio.Task] = []
    async for request in get_request(input_requests, request_rate, burstiness):
        prompt, output_len, no = (
            request.prompt,
            request.expected_output_len,
            request.no,
        )
        history_QA = request.history_QA

        req_model_id, req_model_name = model_id, model_name
        if lora_modules:
            req_lora_module = next(lora_modules)
            req_model_id, req_model_name = req_lora_module, req_lora_module

        request_func_input = RequestFuncInput(
            model=req_model_id,
            model_name=req_model_name,
            prompt=prompt,
            no=no,
            prompt_len=0,
            history_QA=history_QA,
            hyper_parameters=hyper_parameters,
            api_url=api_url,
            output_len=output_len,
            logprobs=logprobs,
            debug=debug,
            ignore_eos=ignore_eos,
            extra_body=extra_body,
        )
        tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

    if profile:
        print("Stopping profiler...")
        profile_input = RequestFuncInput(
            model=model_id,
            prompt=test_prompt,
            no=test_no,
            api_url=base_url + "/stop_profile",
            output_len=test_output_len,
            logprobs=logprobs,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler stopped")

    if pbar is not None:
        pbar.close()

    benchmark_duration = time.perf_counter() - benchmark_start_time
    print("benchmark_duration:", benchmark_duration)

    metrics, actual_output_lens = calculate_metrics(
        input_requests=input_requests,
        outputs=outputs,
        dur_s=benchmark_duration,
        # tokenizer=tokenizer,
        selected_percentiles=selected_percentiles,
        goodput_config_dict=goodput_config_dict,
    )

    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
    print("{:<40} {:<10.3f}".format("Request throughput (req/s):", metrics.request_throughput))
    if goodput_config_dict:
        print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput))
    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput))
    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput))

    result = {
        "duration": benchmark_duration,
        "completed": metrics.completed,
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
        "request_throughput": metrics.request_throughput,
        # Fixed: the key used to be "request_goodput:" (stray trailing colon).
        "request_goodput": (metrics.request_goodput if goodput_config_dict else None),
        "output_throughput": metrics.output_throughput,
        "total_token_throughput": metrics.total_token_throughput,
        "input_lens": [output.prompt_len for output in outputs],
        "infer_input_lens": [output.prompt_tokens for output in outputs],
        "output_lens": actual_output_lens,
        "ttfts": [output.ttft for output in outputs],
        "itls": [output.itl for output in outputs],
        "input_texts": [req.prompt for req in input_requests],
        "generated_texts": [output.generated_text for output in outputs],
        "reasoning_contents": [output.reasoning_content for output in outputs],
        "errors": [output.error for output in outputs],
    }

    def process_one_metric(
        # E.g., "ttft"
        metric_attribute_name: str,
        # E.g., "TTFT"
        metric_name: str,
        # E.g., "Time to First Token"
        metric_header: str,
    ):
        # Print and record statistics for one millisecond-valued metric.
        if metric_attribute_name not in selected_percentile_metrics:
            return
        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
        print(
            "{:<40} {:<10.2f}".format(
                f"Mean {metric_name} (ms):",
                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
            )
        )
        print(
            "{:<40} {:<10.2f}".format(
                f"Median {metric_name} (ms):",
                getattr(metrics, f"median_{metric_attribute_name}_ms"),
            )
        )
        result[f"mean_{metric_attribute_name}_ms"] = getattr(metrics, f"mean_{metric_attribute_name}_ms")
        result[f"median_{metric_attribute_name}_ms"] = getattr(metrics, f"median_{metric_attribute_name}_ms")
        result[f"std_{metric_attribute_name}_ms"] = getattr(metrics, f"std_{metric_attribute_name}_ms")
        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
            p_word = str(int(p)) if int(p) == p else str(p)
            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
            result[f"p{p_word}_{metric_attribute_name}_ms"] = value

    def process_one_length(
        # E.g., "ttft"
        metric_attribute_name: str,
        # E.g., "TTFT"
        metric_name: str,
        # E.g., "Time to First Token"
        metric_header: str,
    ):
        # Print and record statistics for one length-valued (token count) metric.
        if metric_attribute_name not in selected_percentile_metrics:
            return
        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
        print(
            "{:<40} {:<10.2f}".format(
                f"Mean {metric_name}:",
                getattr(metrics, f"mean_{metric_attribute_name}"),
            )
        )
        print(
            "{:<40} {:<10.2f}".format(
                f"Median {metric_name}:",
                getattr(metrics, f"median_{metric_attribute_name}"),
            )
        )
        result[f"mean_{metric_attribute_name}"] = getattr(metrics, f"mean_{metric_attribute_name}")
        result[f"median_{metric_attribute_name}"] = getattr(metrics, f"median_{metric_attribute_name}")
        result[f"std_{metric_attribute_name}"] = getattr(metrics, f"std_{metric_attribute_name}")
        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}"):
            p_word = str(int(p)) if int(p) == p else str(p)
            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value))
            result[f"p{p_word}_{metric_attribute_name}"] = value

    process_one_length("s_decode", "Decode", "解码速度(tok/s)")
    process_one_metric("ttft", "TTFT", "Time to First Token")
    process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
    process_one_metric("itl", "ITL", "Inter-token Latency")
    process_one_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
    process_one_metric("e2el", "E2EL", "End-to-end Latency")
    process_one_metric("s_e2el", "S_E2EL", "Infer End-to-end Latency")
    process_one_length("input_len", "Cached Tokens", "Cached Tokens")
    process_one_length("s_input_len", "Input Length", "Infer Input Length")
    process_one_length("output_len", "Output Length", "Output Length")

    print("=" * 50)

    return result
def check_goodput_args(args):
    """Parse and validate the ``--goodput`` SLO pairs from parsed CLI args.

    Returns a ``{metric_name: slo_value}`` dict; empty when no goodput
    arguments were supplied.

    Raises:
        ValueError: on an unknown metric name or a negative SLO value.
    """
    VALID_NAMES = ["ttft", "tpot", "e2el"]
    if not args.goodput:
        return {}
    goodput_config_dict = parse_goodput(args.goodput)
    for slo_name, slo_val in goodput_config_dict.items():
        if slo_name not in VALID_NAMES:
            raise ValueError(
                f"Invalid metric name found, {slo_name}: {slo_val}. "
                "The service level objective name should be one of "
                f"{str(VALID_NAMES)}. "
            )
        if slo_val < 0:
            raise ValueError(
                f"Invalid value found, {slo_name}: {slo_val}. "
                "The service level objective value should be "
                "non-negative."
            )
    return goodput_config_dict


def convert_to_pytorch_benchmark_format(
    args: argparse.Namespace,
    metrics: dict[str, list],
    extra_info: dict[str, Any],
) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    on metric per record
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    # Opt-in via environment variable; otherwise emit nothing.
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return []

    records = []
    for metric_name, benchmark_values in metrics.items():
        entry = {
            "benchmark": {
                "name": "fastdeploy benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": metric_name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }

        # Save tensor_parallel_size parameter if it's part of the metadata
        # but missing from the CLI args.
        tp = entry["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
        if not tp and "tensor_parallel_size" in extra_info:
            entry["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = extra_info["tensor_parallel_size"]

        records.append(entry)

    return records


class InfEncoder(json.JSONEncoder):
    """JSON encoder that serializes infinite floats as the string ``"inf"``."""

    def clear_inf(self, o: Any):
        """Recursively replace infinite floats inside dicts/lists with "inf"."""
        if isinstance(o, dict):
            return {key: self.clear_inf(val) for key, val in o.items()}
        if isinstance(o, list):
            return [self.clear_inf(item) for item in o]
        if isinstance(o, float) and math.isinf(o):
            return "inf"
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        """Encode after sanitizing infinities."""
        return super().iterencode(self.clear_inf(o), *args, **kwargs)


def write_to_json(filename: str, records: list) -> None:
    """Write ``records`` to ``filename`` as JSON, mapping inf to "inf"."""
    with open(filename, "w") as f:
        json.dump(records, f, cls=InfEncoder)


def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str, Any], file_name: str) -> None:
    """Save the benchmarking results to PyTorch Benchmark Format JSON file"""
    metrics = [
        "median_ttft_ms",
        "mean_ttft_ms",
        "std_ttft_ms",
        "p99_ttft_ms",
        "mean_tpot_ms",
        "median_tpot_ms",
        "std_tpot_ms",
        "p99_tpot_ms",
        "median_itl_ms",
        "mean_itl_ms",
        "std_itl_ms",
        "p99_itl_ms",
    ]
    # These raw data might be useful, but they are rather big. They can be added
    # later if needed
    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={k: [results[k]] for k in metrics},
        extra_info={k: results[k] for k in results if k not in metrics and k not in ignored_metrics},
    )
    if pt_records:
        # Don't use json suffix here as we don't want CI to pick it up
        pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
        write_to_json(pt_file, pt_records)


def parse_goodput(slo_pairs):
    """Parse ``KEY:VALUE`` goodput pairs into a ``{name: float_ms}`` dict."""
    goodput_config_dict = {}
    try:
        for slo_pair in slo_pairs:
            slo_name, slo_val = slo_pair.split(":")
            goodput_config_dict[slo_name] = float(slo_val)
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
            'Specify service level objectives for goodput as "KEY:VALUE" '
            "pairs, where the key is a metric name, and the value is a "
            "number in milliseconds."
        ) from err
    return goodput_config_dict
async def main_async(args: argparse.Namespace):
    """Entry coroutine for the serving benchmark.

    Validates the CLI arguments, loads the dataset, runs :func:`benchmark`,
    and optionally persists the results to JSON (and PyTorch OSS benchmark
    format). Returns the merged result dictionary.
    """
    print(args)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # Validate ramp-up arguments: ramp-up and a fixed request rate are
    # mutually exclusive, and the RPS bounds must be sane.
    if args.ramp_up_strategy is not None:
        if args.request_rate != float("inf"):
            raise ValueError(
                "When using ramp-up, do not specify --request-rate. "
                "The request rate will be controlled by ramp-up parameters. "
                "Please remove the --request-rate argument."
            )
        if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
            raise ValueError(
                "When using --ramp-up-strategy, both --ramp-up-start-rps and " "--ramp-up-end-rps must be specified"
            )
        if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
            raise ValueError("Ramp-up start and end RPS must be non-negative")
        if args.ramp_up_start_rps > args.ramp_up_end_rps:
            raise ValueError("Ramp-up start RPS must be less than end RPS")
        if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
            raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")

    endpoint_type = args.backend
    backend = args.backend
    label = args.label
    model_id = args.model
    model_name = args.served_model_name
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model

    # Resolve the request URL either from an explicit base URL or host/port.
    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
        base_url = f"{args.base_url}"
    else:
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"
    print(f"API URL: {api_url}")
    print(f"base URL: {base_url}")

    # Parse optional KEY=VALUE HTTP headers.
    headers = None
    if args.header:
        headers = {}
        for item in args.header:
            if "=" in item:
                kvstring = item.split("=", 1)
                headers[kvstring[0].strip()] = kvstring[1].strip()
            else:
                raise ValueError("Invalid header format. Please use KEY=VALUE format.")

    if args.dataset_name is None:
        raise ValueError("Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.")

    # Load the dataset.
    input_requests = get_samples(args)
    goodput_config_dict = check_goodput_args(args)

    # Collect only the sampling parameters that were actually supplied.
    sampling_params = {
        k: v
        for k, v in {
            "top_p": args.top_p,
            "top_k": args.top_k,
            "min_p": args.min_p,
            "temperature": args.temperature,
        }.items()
        if v is not None
    }

    # Sampling parameters are only supported by openai-compatible backend.
    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
        raise ValueError("Sampling parameters are only supported by " "openai-compatible backends.")

    if "temperature" not in sampling_params:
        sampling_params["temperature"] = 0.0  # Default to greedy decoding.

    # Avoid GC processing "static" data - reduce pause times.
    gc.collect()
    gc.freeze()

    # Hyper-parameters are supplied through a YAML file, if any.
    if args.hyperparameter_path:
        with open(args.hyperparameter_path, "r") as f:
            hyper_parameters = yaml.safe_load(f)
    else:
        hyper_parameters = {}

    benchmark_result = await benchmark(
        backend=backend,
        api_url=api_url,
        base_url=base_url,
        model_id=model_id,
        model_name=model_name,
        input_requests=input_requests,
        hyper_parameters=hyper_parameters,
        logprobs=args.logprobs,
        request_rate=args.request_rate,
        burstiness=args.burstiness,
        disable_tqdm=args.disable_tqdm,
        profile=args.profile,
        selected_percentile_metrics=args.percentile_metrics.split(","),
        selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
        ignore_eos=args.ignore_eos,
        debug=args.debug,
        goodput_config_dict=goodput_config_dict,
        max_concurrency=args.max_concurrency,
        lora_modules=args.lora_modules,
        extra_body=sampling_params,
    )

    # Save config and results to json
    result_json: dict[str, Any] = {}

    # Setup
    current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
    result_json["date"] = current_dt
    result_json["endpoint_type"] = args.backend
    result_json["label"] = label
    result_json["model_id"] = model_id
    result_json["tokenizer_id"] = tokenizer_id
    result_json["num_prompts"] = args.num_prompts

    # Optional user-supplied KEY=VALUE metadata.
    if args.metadata:
        for item in args.metadata:
            if "=" in item:
                kvstring = item.split("=", 1)
                result_json[kvstring[0].strip()] = kvstring[1].strip()
            else:
                raise ValueError("Invalid metadata format. Please use KEY=VALUE format.")

    # Traffic
    result_json["request_rate"] = args.request_rate if args.request_rate < float("inf") else "inf"
    result_json["burstiness"] = args.burstiness
    result_json["max_concurrency"] = args.max_concurrency

    if args.ramp_up_strategy is not None:
        result_json["ramp_up_strategy"] = args.ramp_up_strategy
        result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
        result_json["ramp_up_end_rps"] = args.ramp_up_end_rps

    # Merge with benchmark result
    result_json = {**result_json, **benchmark_result}

    if not args.save_detailed:
        # Remove fields with too many data points
        for field in [
            "input_lens",
            "output_lens",
            "ttfts",
            "itls",
            "generated_texts",
            "errors",
        ]:
            if field in result_json:
                del result_json[field]
            if field in benchmark_result:
                del benchmark_result[field]

    # Save to file
    if args.save_result or args.append_result:
        base_model_id = model_id.split("/")[-1]
        max_concurrency_str = f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else ""
        label = label or endpoint_type
        if args.ramp_up_strategy is not None:
            file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
        else:
            file_name = (
                f"{label}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
            )
        if args.result_filename:
            file_name = args.result_filename
        if args.result_dir:
            os.makedirs(args.result_dir, exist_ok=True)
            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, mode="a+" if args.append_result else "w", encoding="utf-8") as outfile:
            # Append a newline between appended records.
            if args.append_result and outfile.tell() != 0:
                outfile.write("\n")
            json.dump(result_json, outfile)
        save_to_pytorch_benchmark_format(args, result_json, file_name)

    return result_json


def main(args: argparse.Namespace) -> dict[str, Any]:
    """Synchronous wrapper around :func:`main_async`."""
    return asyncio.run(main_async(args))
def run_fd(
    requests: list[SampleRequest],
    n: int,
    engine_args: EngineArgs,
    disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]:
    """Run the offline FastDeploy text benchmark.

    Builds one prompt + SamplingParams pair per request, generates them in a
    single batched ``llm.generate`` call, and returns the elapsed wall time
    together with the raw outputs.
    """
    from fastdeploy import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))
    assert all(
        llm.llm_engine.cfg.max_model_len >= (request.prompt_len + request.expected_output_len) for request in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of"
        " prompt_len and expected_output_len for all requests."
    )

    # Add the requests to the engine.
    prompts = []
    sampling_params: list[SamplingParams] = []
    for request in requests:
        mm_data = getattr(request, "multi_modal_data", None)
        if "prompt_token_ids" in request.prompt:
            # Pre-tokenized input.
            entry = {
                "prompt_token_ids": request.prompt["prompt_token_ids"],
                "multi_modal_data": mm_data,
            }
        else:
            # Plain-text input.
            entry = {"prompt": str(request.prompt), "multi_modal_data": mm_data}
        prompts.append(entry)

        sampling_params.append(
            SamplingParams(
                n=n,
                temperature=1.0,
                top_p=1.0,
                max_tokens=request.expected_output_len,
            )
        )

    outputs = None
    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
    end = time.perf_counter()
    return end - start, outputs


def run_fd_chat(
    requests: list[SampleRequest], n: int, engine_args: EngineArgs, disable_detokenize: bool = False
) -> tuple[float, list[RequestOutput]]:
    """
    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
    multimodal models as it properly handles multimodal inputs and chat
    formatting. For non-multimodal models, use run_vllm() instead.
    """
    from fastdeploy import LLM, SamplingParams

    llm = LLM(**dataclasses.asdict(engine_args))

    assert all(
        llm.llm_engine.cfg.max_model_len >= (request.prompt_len + request.expected_output_len) for request in requests
    ), (
        "Please ensure that max_model_len is greater than the sum of "
        "prompt_len and expected_output_len for all requests."
    )

    prompts = [request.prompt for request in requests]
    sampling_params: list[SamplingParams] = [
        SamplingParams(
            n=n,
            temperature=1.0,
            top_p=1.0,
            max_tokens=request.expected_output_len,
        )
        for request in requests
    ]

    start = time.perf_counter()
    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
    end = time.perf_counter()
    return end - start, outputs


def run_hf(
    requests: list[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    max_batch_size: int,
    trust_remote_code: bool,
    disable_detokenize: bool = False,
) -> float:
    """Run the HuggingFace ``generate`` baseline and return elapsed seconds.

    Requests are greedily packed into batches of at most ``max_batch_size``
    whose padded prompt+output length stays within 2048 tokens.
    """
    llm = AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
    if llm.config.model_type == "llama":
        # To enable padding in the HF backend.
        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
    batch: list[str] = []
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
        prompt = requests[i].prompt
        prompt_len = requests[i].prompt_len
        output_len = requests[i].expected_output_len
        # Add the prompt to the batch.
        batch.append(prompt)
        max_prompt_len = max(max_prompt_len, prompt_len)
        max_output_len = max(max_output_len, output_len)
        if len(batch) < max_batch_size and i != len(requests) - 1:
            # Check if we can add more requests to the batch.
            next_prompt_len = requests[i + 1].prompt_len
            next_output_len = requests[i + 1].expected_output_len
            if (max(max_prompt_len, next_prompt_len) + max(max_output_len, next_output_len)) <= 2048:
                # We can add more requests to the batch.
                continue

        # Generate the sequences for the current batch.
        input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
        llm_outputs = llm.generate(
            input_ids=input_ids.cuda(),
            do_sample=True,
            num_return_sequences=n,
            temperature=1.0,
            top_p=1.0,
            use_cache=True,
            max_new_tokens=max_output_len,
        )
        if not disable_detokenize:
            # Include the decoding time.
            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
        pbar.update(len(batch))

        # Clear the batch.
        batch = []
        max_prompt_len = 0
        max_output_len = 0
    end = time.perf_counter()
    return end - start


def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str, Any]) -> None:
    """Persist throughput results in PyTorch OSS benchmark JSON format."""
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={
            "requests_per_second": [results["requests_per_second"]],
            "tokens_per_second": [results["tokens_per_second"]],
        },
        extra_info={k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]},
    )
    if pt_records:
        # Don't use json suffix here as we don't want CI to pick it up
        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
        write_to_json(pt_file, pt_records)
def get_requests(args, tokenizer):
    """Build the sample requests for the selected throughput dataset.

    Chooses the dataset class from ``args.dataset_name`` (defaulting to the
    random dataset when no path is given), filters out unset sampling
    options, and returns ``dataset.sample(...)``.

    Raises:
        ValueError: for an unknown ``--dataset-name``.
    """
    # Common parameters for all dataset types.
    common_kwargs = {
        "dataset_path": args.dataset_path,
        "random_seed": args.seed,
    }
    sample_kwargs = {
        "lora_path": args.lora_path,
        "num_requests": args.num_prompts,
        "input_len": args.input_len,
        "output_len": args.output_len,
    }
    if args.dataset_path is None or args.dataset_name == "random":
        # Random dataset needs extra shape/tokenizer parameters.
        sample_kwargs["range_ratio"] = args.random_range_ratio
        sample_kwargs["prefix_len"] = args.prefix_len
        sample_kwargs["tokenizer"] = tokenizer
        dataset_cls = RandomDataset
    elif args.dataset_name == "EB":
        dataset_cls = EBDataset
    elif args.dataset_name == "EBChat":
        dataset_cls = EBChatDataset
    else:
        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
    # Remove None values so dataset defaults apply.
    sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
    return dataset_cls(**common_kwargs).sample(**sample_kwargs)


def validate_args(args):
    """
    Validate command-line arguments.

    Mutates ``args`` in place (deprecation fallbacks, tokenizer default,
    forcing the random dataset when no path is given) and raises
    ``ValueError`` on inconsistent combinations.
    """

    # === Deprecation and Defaulting ===
    if args.dataset is not None:
        # Fixed: the original warning strings embedded raw line-continuation
        # whitespace, producing garbled runs of spaces in the emitted message.
        warnings.warn(
            "The '--dataset' argument will be deprecated in the next release. "
            "Please use '--dataset-name' and '--dataset-path' instead.",
            stacklevel=2,
        )
        args.dataset_path = args.dataset

    if not getattr(args, "tokenizer", None):
        args.tokenizer = args.model

    # === Backend Validation ===
    valid_backends = {"fastdeploy", "hf", "fastdeploy-chat"}
    if args.backend not in valid_backends:
        raise ValueError(f"Unsupported backend: {args.backend}")

    # === Dataset Configuration ===
    if not args.dataset and not args.dataset_path:
        print("When dataset path is not set, it will default to random dataset")
        args.dataset_name = "random"
        if args.input_len is None:
            raise ValueError("input_len must be provided for a random dataset")

    # === Dataset Name Specific Checks ===
    # --hf-subset and --hf-split: only used when dataset_name is 'hf'
    if args.dataset_name != "hf" and (
        getattr(args, "hf_subset", None) is not None or getattr(args, "hf_split", None) is not None
    ):
        warnings.warn(
            "--hf-subset and --hf-split will be ignored since --dataset-name is not 'hf'.",
            stacklevel=2,
        )

    # --random-range-ratio: only used when dataset_name is 'random'
    if args.dataset_name != "random" and args.random_range_ratio is not None:
        warnings.warn(
            "--random-range-ratio will be ignored since --dataset-name is not 'random'.",
            stacklevel=2,
        )

    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
    # set.
    if args.dataset_name not in {"random", "sonnet", None} and args.prefix_len is not None:
        warnings.warn(
            "--prefix-len will be ignored since --dataset-name is not 'random', 'sonnet', or not set.",
            stacklevel=2,
        )

    # === LoRA Settings ===
    if getattr(args, "enable_lora", False) and args.lora_path is None:
        raise ValueError("LoRA path must be provided when enable_lora is True")

    # === Backend-specific Validations ===
    if args.backend == "hf" and args.hf_max_batch_size is None:
        raise ValueError("HF max batch size is required for HF backend")
    if args.backend != "hf" and args.hf_max_batch_size is not None:
        raise ValueError("HF max batch size is only for HF backend.")

    if args.backend in {"hf", "mii"} and getattr(args, "quantization", None) is not None:
        raise ValueError("Quantization is only for vLLM backend.")
The dataset is expected to " + "be a json in form of list[dict[..., conversations: " + "list[dict[..., value: ]]]]", + ) + parser.add_argument("--dataset-path", type=str, default=None, help="Path to the dataset") + parser.add_argument("--input-len", type=int, default=None, help="Input prompt length for each request") + parser.add_argument( + "--output-len", + type=int, + default=None, + help="Output length for each request. Overrides the " "output length from the dataset.", + ) + parser.add_argument("--n", type=int, default=1, help="Number of generated sequences per prompt.") + parser.add_argument("--num-prompts", type=int, default=50, help="Number of prompts to process.") + parser.add_argument("--hf-max-batch-size", type=int, default=None, help="Maximum batch size for HF backend.") + parser.add_argument( + "--output-json", type=str, default=None, help="Path to save the throughput results in JSON format." + ) + parser.add_argument( + "--disable-frontend-multiprocessing", + action="store_true", + default=False, + help="Disable decoupled async engine frontend.", + ) + parser.add_argument( + "--disable-detokenize", + action="store_true", + help=("Do not detokenize the response (i.e. do not include " "detokenization time in the measurement)"), + ) + # LoRA + parser.add_argument( + "--lora-path", + type=str, + default=None, + help="Path to the lora adapters to use. This can be an absolute path, " + "a relative path, or a Hugging Face model identifier.", + ) + parser.add_argument( + "--prefix-len", + type=int, + default=0, + help="Number of fixed prefix tokens before the random " "context in a request (default: 0).", + ) + # random dataset + parser.add_argument( + "--random-range-ratio", + type=float, + default=0.0, + help="Range ratio for sampling input/output length, " + "used only for RandomDataset. 
Must be in the range [0, 1) to define " + "a symmetric sampling range " + "[length * (1 - range_ratio), length * (1 + range_ratio)].", + ) + + # hf dtaset + parser.add_argument("--hf-subset", type=str, default=None, help="Subset of the HF dataset.") + parser.add_argument("--hf-split", type=str, default=None, help="Split of the HF dataset.") + + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub", + ) + parser = EngineArgs.add_cli_args(parser) + parser.set_defaults(enable_prefix_caching=False) + + +def main(args: argparse.Namespace): + if args.tokenizer is None: + args.tokenizer = args.model + validate_args(args) + if args.seed is None: + args.seed = 0 + random.seed(args.seed) + # Sample the requests. + if args.backend == "hf": + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=args.trust_remote_code) + else: + tokenizer = None + requests = get_requests(args, tokenizer) + # is_multi_modal = any(request.multi_modal_data is not None + # for request in requests) + request_outputs: Optional[list[RequestOutput]] = None + if args.backend == "fastdeploy": + elapsed_time, request_outputs = run_fd( + requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize + ) + elif args.backend == "hf": + if not TORCH_AVAILABLE: + raise Exception("PyTorch is not available.") + else: + assert args.tensor_parallel_size == 1 + elapsed_time = run_hf( + requests, + args.model, + tokenizer, + args.n, + args.hf_max_batch_size, + args.trust_remote_code, + args.disable_detokenize, + ) + elif args.backend == "fastdeploy-chat": + elapsed_time, request_outputs = run_fd_chat( + requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize + ) + else: + raise ValueError(f"Unknown backend: {args.backend}") + + if request_outputs: + # Note: with the vllm and vllm-chat backends, + # we have request_outputs, which we use to count tokens. 
+ total_prompt_tokens = 0 + total_output_tokens = 0 + for ro in request_outputs: + if not isinstance(ro, RequestOutput): + continue + total_prompt_tokens += len(ro.prompt_token_ids) if ro.prompt_token_ids else 0 + if ro.outputs and hasattr(ro.outputs, "token_ids"): + total_output_tokens += len(ro.outputs.token_ids) + total_num_tokens = total_prompt_tokens + total_output_tokens + else: + total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests) + total_output_tokens = sum(r.expected_output_len for r in requests) + total_prompt_tokens = total_num_tokens - total_output_tokens + + print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s" + ) + print(f"Total num prompt tokens: {total_prompt_tokens}") + print(f"Total num output tokens: {total_output_tokens}") + + # Output JSON results if specified + if args.output_json: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": total_num_tokens / elapsed_time, + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + save_to_pytorch_benchmark_format(args, results) diff --git a/fastdeploy/entrypoints/cli/__init__.py b/fastdeploy/entrypoints/cli/__init__.py index eaf86ad4c9..2564d10f03 100644 --- a/fastdeploy/entrypoints/cli/__init__.py +++ b/fastdeploy/entrypoints/cli/__init__.py @@ -1,7 +1,13 @@ +from fastdeploy.entrypoints.cli.benchmark.eval import BenchmarkEvalSubcommand from fastdeploy.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand from fastdeploy.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand +from fastdeploy.entrypoints.cli.benchmark.throughput import ( + BenchmarkThroughputSubcommand, +) __all__: list[str] = [ "BenchmarkLatencySubcommand", 
"BenchmarkServingSubcommand", + "BenchmarkThroughputSubcommand", + "BenchmarkEvalSubcommand", ] diff --git a/fastdeploy/entrypoints/cli/benchmark/eval.py b/fastdeploy/entrypoints/cli/benchmark/eval.py new file mode 100644 index 0000000000..e8142bcdaf --- /dev/null +++ b/fastdeploy/entrypoints/cli/benchmark/eval.py @@ -0,0 +1,416 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +import argparse +import json +import logging +import subprocess +import sys +from functools import partial +from typing import Union + +import pkg_resources + +from fastdeploy.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase + + +def _int_or_none_list_arg_type(min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","): + def parse_value(item): + item = item.strip().lower() + if item == "none": + return None + try: + return int(item) + except ValueError: + raise argparse.ArgumentTypeError(f"{item} is not an integer or None") + + items = [parse_value(v) for v in value.split(split_char)] + num_items = len(items) + + if num_items == 1: + # Makes downstream handling the same for single and multiple values + items = items * max_len + elif num_items < min_len or num_items > max_len: + raise argparse.ArgumentTypeError(f"Argument requires {min_len} to {max_len} integers or None, separated by '{split_char}'") + elif num_items != max_len: + logging.warning( + f"Argument requires {min_len} to {max_len} integers or None, 
separated by '{split_char}'. " + "Missing values will be filled with defaults." + ) + default_items = [parse_value(v) for v in defaults.split(split_char)] + items.extend(default_items[num_items:]) # extend items list with missing defaults + + return items + + +def try_parse_json(value: str) -> Union[str, dict, None]: + """尝试解析JSON格式的字符串""" + if value is None: + return None + try: + return json.loads(value) + except json.JSONDecodeError: + if "{" in value: + raise argparse.ArgumentTypeError(f"Invalid JSON: {value}. Hint: Use double quotes for JSON strings.") + return value + + +class BenchmarkEvalSubcommand(BenchmarkSubcommandBase): + """The `eval` subcommand for fastdeploy bench.""" + + name = "eval" + help = "Run evaluation using lm-evaluation-harness." + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: + parser.add_argument("--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`") + parser.add_argument( + "--tasks", + "-t", + default=None, + type=str, + metavar="task1,task2", + help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above", + ) + parser.add_argument( + "--model_args", + "-a", + default="", + type=try_parse_json, + help="""Comma separated string or JSON formatted arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32` or '{"pretrained":"EleutherAI/pythia-160m","dtype":"float32"}'""", + ) + parser.add_argument( + "--num_fewshot", + "-f", + type=int, + default=None, + metavar="N", + help="Number of examples in few-shot context", + ) + parser.add_argument( + "--batch_size", + "-b", + type=str, + default=1, + metavar="auto|auto:N|N", + help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. 
Default 1.", + ) + parser.add_argument( + "--max_batch_size", + type=int, + default=None, + metavar="N", + help="Maximal batch size to try with --batch_size auto.", + ) + parser.add_argument( + "--device", + type=str, + default=None, + help="Device to use (e.g. cuda, cuda:0, cpu).", + ) + parser.add_argument( + "--output_path", + "-o", + default=None, + type=str, + metavar="DIR|DIR/file.json", + help="Path where result metrics will be saved. Can be either a directory or a .json file. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.", + ) + parser.add_argument( + "--limit", + "-L", + type=float, + default=None, + metavar="N|0 None: + """构建并执行lm-eval命令""" + # 检查lm_eval版本是否为0.4.9.1 + try: + version = pkg_resources.get_distribution("lm_eval").version + if version != "0.4.9.1": + print( + f"Warning: lm_eval version {version} is installed, but version 0.4.9.1 is required.\n" + "Please install the correct version with:\n" + "pip install lm_eval==0.4.9.1", + file=sys.stderr, + ) + sys.exit(1) + except pkg_resources.DistributionNotFound: + print( + "Error: lm_eval is not installed. 
Please install version 0.4.9.1 with:\n" + "pip install lm_eval==0.4.9.1", + file=sys.stderr, + ) + sys.exit(1) + + cmd = ["lm-eval"] + if args.model: + cmd.extend(["--model", args.model]) + + if args.tasks: + cmd.extend(["--tasks", args.tasks]) + + if args.model_args: + if isinstance(args.model_args, dict): + model_args = ",".join(f"{k}={v}" for k, v in args.model_args.items()) + else: + model_args = args.model_args + cmd.extend(["--model_args", model_args]) + + if args.gen_kwargs: + if isinstance(args.gen_kwargs, dict): + gen_args = ",".join(f"{k}={v}" for k, v in args.gen_kwargs.items()) + else: + gen_args = args.gen_kwargs + cmd.extend(["--gen_kwargs", gen_args]) + + if args.batch_size: + cmd.extend(["--batch_size", str(args.batch_size)]) + + if args.output_path: + cmd.extend(["--output_path", args.output_path]) + + if args.write_out: + cmd.append("--write_out") + if args.num_fewshot is not None: + cmd.extend(["--num_fewshot", str(args.num_fewshot)]) + if args.max_batch_size is not None: + cmd.extend(["--max_batch_size", str(args.max_batch_size)]) + if args.device: + cmd.extend(["--device", args.device]) + if args.limit is not None: + cmd.extend(["--limit", str(args.limit)]) + if args.samples: + cmd.extend(["--samples", args.samples]) + if args.use_cache: + cmd.extend(["--use_cache", args.use_cache]) + if args.cache_requests: + cmd.extend(["--cache_requests", args.cache_requests]) + if args.check_integrity: + cmd.append("--check_integrity") + if args.log_samples: + cmd.append("--log_samples") + if args.system_instruction: + cmd.extend(["--system_instruction", args.system_instruction]) + if args.apply_chat_template: + if args.apply_chat_template is True: + cmd.append("--apply_chat_template") + else: + cmd.extend(["--apply_chat_template", args.apply_chat_template]) + if args.fewshot_as_multiturn: + cmd.append("--fewshot_as_multiturn") + if args.show_config: + cmd.append("--show_config") + if args.include_path: + 
cmd.extend(["--include_path", args.include_path]) + if args.verbosity: + cmd.extend(["--verbosity", args.verbosity]) + if args.wandb_args: + cmd.extend(["--wandb_args", args.wandb_args]) + if args.wandb_config_args: + cmd.extend(["--wandb_config_args", args.wandb_config_args]) + if args.hf_hub_log_args: + cmd.extend(["--hf_hub_log_args", args.hf_hub_log_args]) + if args.predict_only: + cmd.append("--predict_only") + if args.seed: + if isinstance(args.seed, list): + seed_arg = ",".join(str(x) for x in args.seed) + else: + seed_arg = str(args.seed) + cmd.extend(["--seed", seed_arg]) + if args.trust_remote_code: + cmd.append("--trust_remote_code") + if args.confirm_run_unsafe_code: + cmd.append("--confirm_run_unsafe_code") + if args.metadata: + if isinstance(args.metadata, dict): + metadata_arg = json.dumps(args.metadata) + else: + metadata_arg = str(args.metadata) + cmd.extend(["--metadata", metadata_arg]) + # 打印执行的命令 + print("Executing command:", " ".join(cmd)) + + try: + subprocess.run(cmd, check=True) + except subprocess.CalledProcessError as e: + print(f"Error running lm-eval: {e}", file=sys.stderr) + sys.exit(e.returncode) + except FileNotFoundError: + print("Error: lm-eval not found. 
Please install lm-evaluation-harness first.", file=sys.stderr) + sys.exit(1) diff --git a/fastdeploy/entrypoints/cli/benchmark/latency.py b/fastdeploy/entrypoints/cli/benchmark/latency.py index 0c1aa3142f..0423ddbf8b 100644 --- a/fastdeploy/entrypoints/cli/benchmark/latency.py +++ b/fastdeploy/entrypoints/cli/benchmark/latency.py @@ -17,127 +17,11 @@ # This file is modified from https://github.com/vllm-project/vllm/blob/main/vllm/benchmarks/latency.py import argparse -import dataclasses -import json -import time -import numpy as np -from tqdm import tqdm - -import fastdeploy.envs as envs -from fastdeploy.engine.args_utils import EngineArgs +from fastdeploy.benchmarks.latency import add_cli_args, main from fastdeploy.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase -def add_cli_args(parser: argparse.ArgumentParser): - parser.add_argument("--input-len", type=int, default=32) - parser.add_argument("--output-len", type=int, default=128) - parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument( - "--n", - type=int, - default=1, - help="Number of generated sequences per prompt.", - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--num-iters-warmup", - type=int, - default=10, - help="Number of iterations to run for warmup.", - ) - parser.add_argument("--num-iters", type=int, default=30, help="Number of iterations to run.") - parser.add_argument( - "--profile", - action="store_true", - help="profile the generation process of a single batch", - ) - parser.add_argument( - "--output-json", - type=str, - default=None, - help="Path to save the latency results in JSON format.", - ) - parser.add_argument( - "--disable-detokenize", - action="store_true", - help=("Do not detokenize responses (i.e. do not include " "detokenization time in the latency measurement)"), - ) - - parser = EngineArgs.add_cli_args(parser) - # V1 enables prefix caching by default which skews the latency - # numbers. 
We need to disable prefix caching by default. - parser.set_defaults(enable_prefix_caching=False) - - -def main(args: argparse.Namespace): - if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: - raise OSError( - "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " - "Please set it to a valid path to use torch profiler." - ) - engine_args = EngineArgs.from_cli_args(args) - - # Lazy import to avoid importing LLM when the bench command is not selected. - from fastdeploy import LLM, SamplingParams - - # NOTE(woosuk): If the request cannot be processed in a single batch, - # the engine will automatically process the request in multiple batches. - llm = LLM(**dataclasses.asdict(engine_args)) - assert llm.llm_engine.cfg.max_model_len >= (args.input_len + args.output_len), ( - "Please ensure that max_model_len is greater than" " the sum of input_len and output_len." - ) - - sampling_params = SamplingParams( - n=args.n, - temperature=1.0, - top_p=1.0, - max_tokens=args.output_len, - ) - dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_prompts = [{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()] - - def llm_generate(): - llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False, stream=True) - - def run_to_completion(): - start_time = time.perf_counter() - llm_generate() - end_time = time.perf_counter() - latency = end_time - start_time - return latency - - print("Warming up...") - for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - run_to_completion() - - if args.profile: - print("Profiling...") - run_to_completion() - return - - # Benchmark. 
- latencies = [] - for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion()) - latencies = np.array(latencies) - percentages = [10, 25, 50, 75, 90, 99] - percentiles = np.percentile(latencies, percentages) - print(f"Avg latency: {np.mean(latencies)} seconds") - for percentage, percentile in zip(percentages, percentiles): - print(f"{percentage}% percentile latency: {percentile} seconds") - - # Output JSON results if specified - if args.output_json: - results = { - "avg_latency": np.mean(latencies), - "latencies": latencies.tolist(), - "percentiles": dict(zip(percentages, percentiles.tolist())), - } - with open(args.output_json, "w") as f: - json.dump(results, f, indent=4) - - class BenchmarkLatencySubcommand(BenchmarkSubcommandBase): """The `latency` subcommand for fastdeploy bench.""" diff --git a/fastdeploy/entrypoints/cli/benchmark/serve.py b/fastdeploy/entrypoints/cli/benchmark/serve.py index 6bd995fb20..61cbb06ee8 100644 --- a/fastdeploy/entrypoints/cli/benchmark/serve.py +++ b/fastdeploy/entrypoints/cli/benchmark/serve.py @@ -17,1201 +17,9 @@ # This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_serving.py import argparse -import asyncio -import gc -import json -import math -import os -import random -import time -import warnings -from collections.abc import AsyncGenerator, Iterable -from dataclasses import dataclass -from datetime import datetime -from typing import Any, Optional - -import numpy as np -import yaml -from tqdm.asyncio import tqdm +from fastdeploy.benchmarks.serve import add_cli_args, main from fastdeploy.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase -from fastdeploy.entrypoints.cli.benchmark.datasets import ( - SampleRequest, - add_dataset_parser, - get_samples, -) -from fastdeploy.entrypoints.cli.benchmark.endpoint_request_func import ( - ASYNC_REQUEST_FUNCS, - OPENAI_COMPATIBLE_BACKENDS, - RequestFuncInput, - RequestFuncOutput, -) - 
-MILLISECONDS_TO_SECONDS_CONVERSION = 1000 - - -@dataclass -class BenchmarkMetrics: - """Class containing all metrics that are used in this script""" - - completed: int - total_input: int - total_output: int - request_throughput: float - request_goodput: float - output_throughput: float - total_token_throughput: float - mean_s_decode: float - median_s_decode: float - std_s_decode: float - percentiles_s_decode: list[tuple[float, float]] - mean_ttft_ms: float - median_ttft_ms: float - std_ttft_ms: float - percentiles_ttft_ms: list[tuple[float, float]] - mean_s_ttft_ms: float - median_s_ttft_ms: float - std_s_ttft_ms: float - percentiles_s_ttft_ms: list[tuple[float, float]] - mean_tpot_ms: float - median_tpot_ms: float - std_tpot_ms: float - percentiles_tpot_ms: list[tuple[float, float]] - mean_itl_ms: float - median_itl_ms: float - std_itl_ms: float - percentiles_itl_ms: list[tuple[float, float]] - mean_s_itl_ms: float - median_s_itl_ms: float - std_s_itl_ms: float - percentiles_s_itl_ms: list[tuple[float, float]] - # E2EL stands for end-to-end latency per request. - # It is the time taken on the client side from sending - # a request to receiving a complete response. 
- mean_e2el_ms: float - median_e2el_ms: float - std_e2el_ms: float - percentiles_e2el_ms: list[tuple[float, float]] - mean_s_e2el_ms: float - median_s_e2el_ms: float - std_s_e2el_ms: float - percentiles_s_e2el_ms: list[tuple[float, float]] - mean_input_len: float - median_input_len: float - std_input_len: float - percentiles_input_len: list[tuple[float, float]] - mean_s_input_len: float - median_s_input_len: float - std_s_input_len: float - percentiles_s_input_len: list[tuple[float, float]] - mean_output_len: float - median_output_len: float - std_output_len: float - percentiles_output_len: list[tuple[float, float]] - - -def add_cli_args(parser: argparse.ArgumentParser): - add_dataset_parser(parser) - parser.add_argument( - "--label", - type=str, - default=None, - help="The label (prefix) of the benchmark results. If not specified, " - "the endpoint type will be used as the label.", - ) - parser.add_argument( - "--backend", - type=str, - default="openai-chat", - choices=list(ASYNC_REQUEST_FUNCS.keys()), - ) - parser.add_argument( - "--base-url", - type=str, - default=None, - help="Server or API base url if not using http host and port.", - ) - # Use 127.0.0.1 here instead of localhost to force the use of ipv4 - parser.add_argument("--host", type=str, default="127.0.0.1") - parser.add_argument("--port", type=int, default=8000) - parser.add_argument( - "--endpoint", - type=str, - default="/v1/chat/completions", - help="API endpoint.", - ) - parser.add_argument( - "--header", - metavar="KEY=VALUE", - nargs="*", - help="Key-value pairs (e.g, --header x-additional-info=0.3.3) " - "for headers to be passed with each request. These headers override " - "per backend constants and values set via environment variable, and " - "will be overriden by other arguments (such as request ids).", - ) - parser.add_argument( - "--max-concurrency", - type=int, - default=None, - help="Maximum number of concurrent requests. 
This can be used " - "to help simulate an environment where a higher level component " - "is enforcing a maximum number of concurrent requests. While the " - "--request-rate argument controls the rate at which requests are " - "initiated, this argument will control how many are actually allowed " - "to execute at a time. This means that when used in combination, the " - "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.", - ) - - parser.add_argument( - "--model", - type=str, - required=True, - help="Name of the model.", - ) - parser.add_argument( - "--tokenizer", - type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 - ) - parser.add_argument("--use-beam-search", action="store_true") - parser.add_argument( - "--logprobs", - type=int, - default=None, - help=( - "Number of logprobs-per-token to compute & return as part of " - "the request. If unspecified, then either (1) if beam search " - "is disabled, no logprobs are computed & a single dummy " - "logprob is returned for each token; or (2) if beam search " - "is enabled 1 logprob per token is computed" - ), - ) - parser.add_argument( - "--request-rate", - type=float, - default=float("inf"), - help="Number of requests per second. If this is inf, " - "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process or gamma distribution " - "to synthesize the request arrival times.", - ) - parser.add_argument( - "--burstiness", - type=float, - default=1.0, - help="Burstiness factor of the request generation. " - "Only take effect when request_rate is not inf. " - "Default value is 1, which follows Poisson process. " - "Otherwise, the request intervals follow a gamma distribution. " - "A lower burstiness value (0 < burstiness < 1) results in more " - "bursty requests. 
A higher burstiness value (burstiness > 1) " - "results in a more uniform arrival of requests.", - ) - parser.add_argument( - "--trust-remote-code", - action="store_true", - help="Trust remote code from huggingface", - ) - parser.add_argument( - "--disable-tqdm", - action="store_true", - help="Specify to disable tqdm progress bar.", - ) - parser.add_argument( - "--profile", - action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " "VLLM_TORCH_PROFILER_DIR to enable profiler.", - ) - parser.add_argument( - "--save-result", - action="store_true", - help="Specify to save benchmark results to a json file", - ) - parser.add_argument( - "--save-detailed", - action="store_true", - help="When saving the results, whether to include per request " - "information such as response, error, ttfs, tpots, etc.", - ) - parser.add_argument( - "--append-result", - action="store_true", - help="Append the benchmark result to the existing json file.", - ) - parser.add_argument( - "--metadata", - metavar="KEY=VALUE", - nargs="*", - help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) " - "for metadata of this run to be saved in the result JSON file " - "for record keeping purposes.", - ) - parser.add_argument( - "--result-dir", - type=str, - default=None, - help="Specify directory to save benchmark json results." - "If not specified, results are saved in the current directory.", - ) - parser.add_argument( - "--result-filename", - type=str, - default=None, - help="Specify the filename to save benchmark json results." - "If not specified, results will be saved in " - "{label}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" # noqa - " format.", - ) - parser.add_argument( - "--ignore-eos", - action="store_true", - help="Set ignore_eos flag when sending the benchmark request." 
- "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", - ) - parser.add_argument( - "--percentile-metrics", - type=str, - default="ttft,tpot,itl", - help="Comma-separated list of selected metrics to report percentils. " - "This argument specifies the metrics to report percentiles. " - 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ', - ) - parser.add_argument( - "--metric-percentiles", - type=str, - default="99", - help="Comma-separated list of percentiles for selected metrics. " - 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' - 'Default value is "99".' - 'Use "--percentile-metrics" to select metrics.', - ) - parser.add_argument( - "--goodput", - nargs="+", - required=False, - help='Specify service level objectives for goodput as "KEY:VALUE" ' - "pairs, where the key is a metric name, and the value is in " - 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' - "separated by spaces. Allowed request level metric names are " - '"ttft", "tpot", "e2el". For more context on the definition of ' - "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve", - ) - parser.add_argument( - "--request-id-prefix", - type=str, - required=False, - default="benchmark-serving", - help="Specify the prefix of request id.", - ) - - sampling_group = parser.add_argument_group("sampling parameters") - sampling_group.add_argument( - "--top-p", - type=float, - default=None, - help="Top-p sampling parameter. Only has effect on " "openai-compatible backends.", - ) - sampling_group.add_argument( - "--top-k", - type=int, - default=None, - help="Top-k sampling parameter. Only has effect on " "openai-compatible backends.", - ) - sampling_group.add_argument( - "--min-p", - type=float, - default=None, - help="Min-p sampling parameter. 
Only has effect on " "openai-compatible backends.", - ) - sampling_group.add_argument( - "--temperature", - type=float, - default=None, - help="Temperature sampling parameter. Only has effect on " - "openai-compatible backends. If not specified, default to greedy " - "decoding (i.e. temperature==0.0).", - ) - parser.add_argument( - "--debug", - action="store_true", - help="print debug information (output)", - ) - parser.add_argument( - "--tokenizer-mode", - type=str, - default="auto", - choices=["auto", "slow", "mistral", "custom"], - help='The tokenizer mode.\n\n* "auto" will use the ' - 'fast tokenizer if available.\n* "slow" will ' - "always use the slow tokenizer. \n* " - '"mistral" will always use the `mistral_common` tokenizer. \n*' - '"custom" will use --tokenizer to select the preregistered tokenizer.', - ) - parser.add_argument( - "--shuffle", - action="store_true", - help="shuffle dataset", - ) - parser.add_argument( - "--hyperparameter-path", - type=str, - default=None, - help="Path to the hyperparameter. ", - ) - - parser.add_argument( - "--served-model-name", - type=str, - default=None, - help="The model name used in the API. " - "If not specified, the model name will be the " - "same as the ``--model`` argument. ", - ) - - parser.add_argument( - "--lora-modules", - nargs="+", - default=None, - help="A subset of LoRA module names passed in when " - "launching the server. For each request, the " - "script chooses a LoRA module at random.", - ) - - parser.add_argument( - "--ramp-up-strategy", - type=str, - default=None, - choices=["linear", "exponential"], - help="The ramp-up strategy. This would be used to " - "ramp up the request rate from initial RPS to final " - "RPS rate (specified by --ramp-up-start-rps and " - "--ramp-up-end-rps.) over the duration of the benchmark.", - ) - parser.add_argument( - "--ramp-up-start-rps", - type=int, - default=None, - help="The starting request rate for ramp-up (RPS). 
async def get_request(
    input_requests: list[SampleRequest],
    request_rate: float,
    burstiness: float = 1.0,
) -> AsyncGenerator[SampleRequest, None]:
    """
    Asynchronously yield requests at a specified rate with optional burstiness.

    Args:
        input_requests:
            A list of input requests, each represented as a SampleRequest.
        request_rate:
            The rate at which requests are generated (requests/s).
            ``float("inf")`` disables pacing entirely.
        burstiness (optional):
            The burstiness factor of the request generation.
            Only takes effect when request_rate is not inf.
            Default value is 1, which follows a Poisson process.
            Otherwise, the request intervals follow a gamma distribution.
            A lower burstiness value (0 < burstiness < 1) results in more
            bursty requests; a higher value (burstiness > 1) results in a
            more uniform arrival of requests.

    Raises:
        ValueError: if ``burstiness`` is not strictly positive.
    """
    # Validate with an explicit exception rather than `assert`, which is
    # silently stripped when Python runs with -O.
    if burstiness <= 0:
        raise ValueError(f"A positive burstiness factor is expected, but given {burstiness}.")

    # Scale parameter theta keeps the mean inter-arrival time equal to
    # 1/request_rate regardless of the burstiness shape parameter.
    theta = 1.0 / (request_rate * burstiness)

    # Iterate a fresh iterator instead of rebinding the parameter name.
    requests_iter: Iterable[SampleRequest] = iter(input_requests)
    for request in requests_iter:
        yield request

        if request_rate == float("inf"):
            # No pacing requested: fire requests back-to-back.
            continue

        # Gamma with shape == 1 reduces to the exponential distribution,
        # i.e. a Poisson arrival process.
        interval = np.random.gamma(shape=burstiness, scale=theta)
        await asyncio.sleep(interval)
def calculate_metrics(
    input_requests: list[SampleRequest],
    outputs: list[RequestFuncOutput],
    dur_s: float,
    selected_percentiles: list[float],
    goodput_config_dict: dict[str, float],
) -> tuple[BenchmarkMetrics, list[int]]:
    """Calculates various performance metrics based on the inputs and outputs."""
    input_lens: list[int] = []
    infer_input_lens: list[int] = []  # input token counts as seen by the inference side
    actual_output_lens: list[int] = []
    total_input = 0
    completed = 0
    good_completed = 0
    itls: list[float] = []
    s_itls: list[float] = []
    tpots: list[float] = []
    all_tpots: list[float] = []
    ttfts: list[float] = []
    s_ttfts: list[float] = []
    e2els: list[float] = []
    s_e2els: list[float] = []
    s_decodes: list[float] = []

    for output in outputs:
        if not output.success:
            # Failed requests contribute zero-length entries only.
            actual_output_lens.append(0)
            input_lens.append(0)
            infer_input_lens.append(0)
            continue

        output_len = output.output_tokens
        if not output_len:
            # Some serving backends do not report an output token count;
            # such responses are skipped entirely (they add nothing, not
            # even zeros, to the per-request lists).
            print("no output_len")
            continue

        actual_output_lens.append(output_len)
        input_lens.append(output.prompt_len)
        infer_input_lens.append(output.prompt_tokens)
        total_input += output.prompt_tokens

        tpot = 0
        if output_len > 1:
            tpot = (output.latency - output.ttft) / (output_len - 1)
            tpots.append(tpot)
        # If output_len <= 1, tpot stays 0 here so goodput still counts it.
        all_tpots.append(tpot)

        itls += output.itl

        # Inference-side ITL, derived from per-token arrival timestamps.
        arrivals = output.arrival_time[1:]
        # NOTE(review): `len(arrivals) - 2` skips the final gap between the
        # last two arrivals — looks like an off-by-one; preserved here for
        # identical behavior. Confirm against the metric's intent.
        for j in range(len(arrivals) - 2):
            s_itls.append(arrivals[j + 1] - arrivals[j])

        ttfts.append(output.ttft)
        # Inference-side TTFT.
        s_ttfts.append(output.arrival_time[1])
        e2els.append(output.latency)
        # Inference-side end-to-end latency.
        s_e2els.append(output.arrival_time[-1])
        # Decode speed, excluding the first token.
        if len(output.arrival_time) > 2:
            s_decodes.append((output.output_tokens - 1) / (output.arrival_time[-1] - output.arrival_time[1]))
        else:
            print("len(outputs[i].arrival_time) <= 2")
        completed += 1

    if goodput_config_dict:
        valid_metrics = []
        slo_values = []
        # Pair each configured SLO with its per-request series (ms -> s).
        for key, series in (("ttft", ttfts), ("tpot", all_tpots), ("e2el", e2els)):
            if key in goodput_config_dict:
                valid_metrics.append(series)
                slo_values.append(goodput_config_dict[key] / MILLISECONDS_TO_SECONDS_CONVERSION)
        for req_metric in zip(*valid_metrics):
            if all(s >= r for s, r in zip(slo_values, req_metric)):
                good_completed += 1

    if completed == 0:
        warnings.warn(
            "All requests failed. This is likely due to a misconfiguration " "on the benchmark arguments.",
            stacklevel=2,
        )

    def _dist(field: str, samples: list, scale: float) -> dict:
        # mean/std/median/percentile fields for one metric family; the
        # `or 0` fallback mirrors the original handling of empty series
        # (e.g. when streaming is not supported by the backend).
        data = samples or 0
        return {
            f"mean_{field}": np.mean(data) * scale,
            f"std_{field}": np.std(data) * scale,
            f"median_{field}": np.median(data) * scale,
            f"percentiles_{field}": [(p, np.percentile(data, p) * scale) for p in selected_percentiles],
        }

    metrics = BenchmarkMetrics(
        completed=completed,
        total_input=total_input,
        total_output=sum(actual_output_lens),
        request_throughput=completed / dur_s,
        request_goodput=good_completed / dur_s,
        output_throughput=sum(actual_output_lens) / dur_s,
        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
        **_dist("s_decode", s_decodes, 1),
        **_dist("ttft_ms", ttfts, 1000),
        **_dist("s_ttft_ms", s_ttfts, 1000),
        **_dist("tpot_ms", tpots, 1000),
        **_dist("itl_ms", itls, 1000),
        **_dist("s_itl_ms", s_itls, 1000),
        **_dist("e2el_ms", e2els, 1000),
        **_dist("s_e2el_ms", s_e2els, 1000),
        **_dist("input_len", input_lens, 1),
        **_dist("s_input_len", infer_input_lens, 1),
        **_dist("output_len", actual_output_lens, 1),
    )

    return metrics, actual_output_lens
async def benchmark(
    backend: str,
    api_url: str,
    base_url: str,
    model_id: str,
    model_name: str,
    input_requests: list[SampleRequest],
    hyper_parameters: dict,
    logprobs: Optional[int],
    request_rate: float,
    burstiness: float,
    disable_tqdm: bool,
    profile: bool,
    selected_percentile_metrics: list[str],
    selected_percentiles: list[float],
    ignore_eos: bool,
    debug: bool,
    goodput_config_dict: dict[str, float],
    max_concurrency: Optional[int],
    lora_modules: Optional[Iterable[str]],
    extra_body: Optional[dict],
):
    """Benchmarks an API endpoint using a given set of sample inputs and returns"""
    if backend not in ASYNC_REQUEST_FUNCS:
        raise ValueError(f"Unknown backend: {backend}")
    request_func = ASYNC_REQUEST_FUNCS[backend]

    # Smoke-test the endpoint with a single request before the real run.
    print("Starting initial single prompt test run...")
    first = input_requests[0]
    test_input = RequestFuncInput(
        model=model_id,
        model_name=model_name,
        prompt=first.prompt,
        no=first.no,
        prompt_len=0,
        history_QA=first.history_QA,
        hyper_parameters=hyper_parameters,
        api_url=api_url,
        output_len=first.expected_output_len,
        logprobs=logprobs,
        ignore_eos=ignore_eos,
        debug=debug,
        extra_body=extra_body,
    )
    print("test_input:", test_input)
    test_output = await request_func(request_func_input=test_input)
    print("test_output:", test_output)
    if not test_output.success:
        raise ValueError(
            f"Initial test run failed - Please make sure that 1. benchmark arguments are correctly specified and 2. the http_proxy and https_proxy are turned off. Error: {test_output.error}"
        )
    print("Initial test run completed. Starting main benchmark run...")

    if lora_modules:
        # Pre-draw one randomly chosen LoRA module per request.
        lora_modules = iter([random.choice(lora_modules) for _ in range(len(input_requests))])

    if profile:
        print("Starting profiler...")
        profile_input = RequestFuncInput(
            model=model_id,
            model_name=model_name,
            prompt=first.prompt,
            no=first.no,
            api_url=base_url + "/start_profile",
            output_len=first.expected_output_len,
            logprobs=logprobs,
            ignore_eos=ignore_eos,
            extra_body=extra_body,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler started")

    # burstiness == 1 makes the gamma inter-arrival sampling a Poisson process.
    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
    print(f"Traffic request rate: {request_rate}")
    print(f"Burstiness factor: {burstiness} ({distribution})")
    print(f"Maximum request concurrency: {max_concurrency}")

    pbar = None if disable_tqdm else tqdm(total=len(input_requests))

    # A plain None stands in for contextlib.nullcontext() so the helper can
    # branch; revisit once the minimum Python version is 3.10+.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited_request_func(request_func_input, pbar):
        # Run the request directly when no concurrency cap was given.
        if semaphore is None:
            return await request_func(request_func_input=request_func_input, pbar=pbar)
        async with semaphore:
            return await request_func(request_func_input=request_func_input, pbar=pbar)

    benchmark_start_time = time.perf_counter()
    tasks: list[asyncio.Task] = []
    async for request in get_request(input_requests, request_rate, burstiness):
        req_model_id, req_model_name = model_id, model_name
        if lora_modules:
            lora = next(lora_modules)
            req_model_id, req_model_name = lora, lora

        request_func_input = RequestFuncInput(
            model=req_model_id,
            model_name=req_model_name,
            prompt=request.prompt,
            no=request.no,
            prompt_len=0,
            history_QA=request.history_QA,
            hyper_parameters=hyper_parameters,
            api_url=api_url,
            output_len=request.expected_output_len,
            logprobs=logprobs,
            debug=debug,
            ignore_eos=ignore_eos,
            extra_body=extra_body,
        )
        tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

    if profile:
        print("Stopping profiler...")
        profile_input = RequestFuncInput(
            model=model_id,
            prompt=first.prompt,
            no=first.no,
            api_url=base_url + "/stop_profile",
            output_len=first.expected_output_len,
            logprobs=logprobs,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
            print("Profiler stopped")

    if pbar is not None:
        pbar.close()

    benchmark_duration = time.perf_counter() - benchmark_start_time
    print("benchmark_duration:", benchmark_duration)

    metrics, actual_output_lens = calculate_metrics(
        input_requests=input_requests,
        outputs=outputs,
        dur_s=benchmark_duration,
        selected_percentiles=selected_percentiles,
        goodput_config_dict=goodput_config_dict,
    )

    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
    print("{:<40} {:<10.3f}".format("Request throughput (req/s):", metrics.request_throughput))
    if goodput_config_dict:
        print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput))
    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput))
    print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput))

    result = {
        "duration": benchmark_duration,
        "completed": metrics.completed,
        "total_input_tokens": metrics.total_input,
        "total_output_tokens": metrics.total_output,
        "request_throughput": metrics.request_throughput,
        # NOTE(review): the trailing colon in this key looks accidental but is
        # preserved because downstream consumers may already rely on it.
        "request_goodput:": (metrics.request_goodput if goodput_config_dict else None),
        "output_throughput": metrics.output_throughput,
        "total_token_throughput": metrics.total_token_throughput,
        "input_lens": [output.prompt_len for output in outputs],
        "infer_input_lens": [output.prompt_tokens for output in outputs],
        "output_lens": actual_output_lens,
        "ttfts": [output.ttft for output in outputs],
        "itls": [output.itl for output in outputs],
        "input_texts": [req.prompt for req in input_requests],
        "generated_texts": [output.generated_text for output in outputs],
        "reasoning_contents": [output.reasoning_content for output in outputs],
        "errors": [output.error for output in outputs],
    }

    def _report(metric_attribute_name: str, metric_name: str, metric_header: str, unit_ms: bool) -> None:
        # Print and record mean/median/std/percentile statistics of one
        # selected metric family; unit_ms toggles the "_ms" field suffix
        # and the " (ms)" label used by time-based metrics.
        if metric_attribute_name not in selected_percentile_metrics:
            return
        suffix = "_ms" if unit_ms else ""
        label_unit = " (ms)" if unit_ms else ""
        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
        print(
            "{:<40} {:<10.2f}".format(
                f"Mean {metric_name}{label_unit}:",
                getattr(metrics, f"mean_{metric_attribute_name}{suffix}"),
            )
        )
        print(
            "{:<40} {:<10.2f}".format(
                f"Median {metric_name}{label_unit}:",
                getattr(metrics, f"median_{metric_attribute_name}{suffix}"),
            )
        )
        result[f"mean_{metric_attribute_name}{suffix}"] = getattr(metrics, f"mean_{metric_attribute_name}{suffix}")
        result[f"median_{metric_attribute_name}{suffix}"] = getattr(metrics, f"median_{metric_attribute_name}{suffix}")
        result[f"std_{metric_attribute_name}{suffix}"] = getattr(metrics, f"std_{metric_attribute_name}{suffix}")
        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}{suffix}"):
            p_word = str(int(p)) if int(p) == p else str(p)
            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}{label_unit}:", value))
            result[f"p{p_word}_{metric_attribute_name}{suffix}"] = value

    _report("s_decode", "Decode", "解码速度(tok/s)", unit_ms=False)
    _report("ttft", "TTFT", "Time to First Token", unit_ms=True)
    _report("s_ttft", "S_TTFT", "Infer Time to First Token", unit_ms=True)
    _report("tpot", "TPOT", "Time per Output Token (excl. 1st token)", unit_ms=True)
    _report("itl", "ITL", "Inter-token Latency", unit_ms=True)
    _report("s_itl", "S_ITL", "Infer Inter-token Latency", unit_ms=True)
    _report("e2el", "E2EL", "End-to-end Latency", unit_ms=True)
    _report("s_e2el", "S_E2EL", "Infer End-to-end Latency", unit_ms=True)
    _report("input_len", "Cached Tokens", "Cached Tokens", unit_ms=False)
    _report("s_input_len", "Input Length", "Infer Input Length", unit_ms=False)
    _report("output_len", "Output Length", "Output Length", unit_ms=False)

    print("=" * 50)

    return result
def check_goodput_args(args):
    """Validate --goodput SLO pairs and return them as a metric-name -> ms dict."""
    if not args.goodput:
        return {}
    valid_names = ["ttft", "tpot", "e2el"]
    goodput_config_dict = parse_goodput(args.goodput)
    for slo_name, slo_val in goodput_config_dict.items():
        if slo_name not in valid_names:
            raise ValueError(
                f"Invalid metric name found, {slo_name}: {slo_val}. "
                "The service level objective name should be one of "
                f"{str(valid_names)}. "
            )
        if slo_val < 0:
            raise ValueError(
                f"Invalid value found, {slo_name}: {slo_val}. "
                "The service level objective value should be "
                "non-negative."
            )
    return goodput_config_dict


def convert_to_pytorch_benchmark_format(
    args: argparse.Namespace,
    metrics: dict[str, list],
    extra_info: dict[str, Any],
) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    on metric per record
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    # Opt-in only: without the env var this is a no-op returning [].
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return []

    records = []
    for name, benchmark_values in metrics.items():
        record = {
            "benchmark": {
                "name": "fastdeploy benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }
        # Backfill tensor_parallel_size from metadata when the CLI args
        # did not carry it.
        args_info = record["benchmark"]["extra_info"]["args"]
        if not args_info.get("tensor_parallel_size") and "tensor_parallel_size" in extra_info:
            args_info["tensor_parallel_size"] = extra_info["tensor_parallel_size"]
        records.append(record)

    return records


class InfEncoder(json.JSONEncoder):
    """JSON encoder that serializes infinite floats as the string "inf"."""

    def clear_inf(self, o: Any):
        """Recursively replace infinite floats in dicts/lists with "inf"."""
        if isinstance(o, dict):
            return {key: self.clear_inf(value) for key, value in o.items()}
        if isinstance(o, list):
            return [self.clear_inf(item) for item in o]
        if isinstance(o, float) and math.isinf(o):
            return "inf"
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        """Encode after sanitizing infinities so json.dump never emits Infinity."""
        return super().iterencode(self.clear_inf(o), *args, **kwargs)


def write_to_json(filename: str, records: list) -> None:
    """Serialize *records* to *filename*, mapping inf to a JSON-safe string."""
    with open(filename, "w") as fp:
        json.dump(records, fp, cls=InfEncoder)
def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str, Any], file_name: str) -> None:
    """Save the benchmarking results to PyTorch Benchmark Format JSON file"""
    metrics = [
        "median_ttft_ms",
        "mean_ttft_ms",
        "std_ttft_ms",
        "p99_ttft_ms",
        "mean_tpot_ms",
        "median_tpot_ms",
        "std_tpot_ms",
        "p99_tpot_ms",
        "median_itl_ms",
        "mean_itl_ms",
        "std_itl_ms",
        "p99_itl_ms",
    ]
    # Raw per-request series are useful but large; keep them out of
    # extra_info (they can be added later if needed).
    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
    pt_records = convert_to_pytorch_benchmark_format(
        args=args,
        metrics={k: [results[k]] for k in metrics},
        extra_info={k: results[k] for k in results if k not in metrics and k not in ignored_metrics},
    )
    if pt_records:
        # Don't use a plain .json suffix here as we don't want CI to pick it up.
        write_to_json(f"{os.path.splitext(file_name)[0]}.pytorch.json", pt_records)


def parse_goodput(slo_pairs):
    """Parse "KEY:VALUE" SLO strings into a metric-name -> float(ms) dict."""
    goodput_config_dict = {}
    try:
        for slo_pair in slo_pairs:
            slo_name, slo_val = slo_pair.split(":")
            goodput_config_dict[slo_name] = float(slo_val)
    except ValueError as err:
        raise argparse.ArgumentTypeError(
            "Invalid format found for service level objectives. "
            'Specify service level objectives for goodput as "KEY:VALUE" '
            "pairs, where the key is a metric name, and the value is a "
            "number in milliseconds."
        ) from err
    return goodput_config_dict


async def main_async(args: argparse.Namespace):
    """Validate CLI arguments, run the serving benchmark, and persist results."""
    print(args)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # Ramp-up and a fixed --request-rate are mutually exclusive.
    if args.ramp_up_strategy is not None:
        if args.request_rate != float("inf"):
            raise ValueError(
                "When using ramp-up, do not specify --request-rate. "
                "The request rate will be controlled by ramp-up parameters. "
                "Please remove the --request-rate argument."
            )
        if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
            raise ValueError(
                "When using --ramp-up-strategy, both --ramp-up-start-rps and " "--ramp-up-end-rps must be specified"
            )
        if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
            raise ValueError("Ramp-up start and end RPS must be non-negative")
        if args.ramp_up_start_rps > args.ramp_up_end_rps:
            raise ValueError("Ramp-up start RPS must be less than end RPS")
        if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
            raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")

    endpoint_type = args.backend
    backend = args.backend
    label = args.label
    model_id = args.model
    model_name = args.served_model_name
    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model

    if args.base_url is not None:
        api_url = f"{args.base_url}{args.endpoint}"
        base_url = f"{args.base_url}"
    else:
        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
        base_url = f"http://{args.host}:{args.port}"
    print(f"API URL: {api_url}")
    print(f"base URL: {base_url}")

    # NOTE(review): headers is built but not passed on within this function —
    # confirm whether it should be forwarded to the request functions.
    headers = None
    if args.header:
        headers = {}
        for item in args.header:
            if "=" not in item:
                raise ValueError("Invalid header format. Please use KEY=VALUE format.")
            key, value = item.split("=", 1)
            headers[key.strip()] = value.strip()

    if args.dataset_name is None:
        raise ValueError("Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.")

    # Load the dataset.
    input_requests = get_samples(args)
    goodput_config_dict = check_goodput_args(args)

    # Collect only the sampling parameters the user actually set.
    sampling_params = {
        k: v
        for k, v in {
            "top_p": args.top_p,
            "top_k": args.top_k,
            "min_p": args.min_p,
            "temperature": args.temperature,
        }.items()
        if v is not None
    }
    # Sampling parameters are only supported by openai-compatible backends.
    if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
        raise ValueError("Sampling parameters are only supported by " "openai-compatible backends.")
    sampling_params.setdefault("temperature", 0.0)  # Default to greedy decoding.

    # Avoid GC processing "static" data - reduce pause times.
    gc.collect()
    gc.freeze()

    # Hyper-parameters are supplied via a YAML file.
    if args.hyperparameter_path:
        with open(args.hyperparameter_path, "r") as f:
            hyper_parameters = yaml.safe_load(f)
    else:
        hyper_parameters = {}

    benchmark_result = await benchmark(
        backend=backend,
        api_url=api_url,
        base_url=base_url,
        model_id=model_id,
        model_name=model_name,
        input_requests=input_requests,
        hyper_parameters=hyper_parameters,
        logprobs=args.logprobs,
        request_rate=args.request_rate,
        burstiness=args.burstiness,
        disable_tqdm=args.disable_tqdm,
        profile=args.profile,
        selected_percentile_metrics=args.percentile_metrics.split(","),
        selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
        ignore_eos=args.ignore_eos,
        debug=args.debug,
        goodput_config_dict=goodput_config_dict,
        max_concurrency=args.max_concurrency,
        lora_modules=args.lora_modules,
        extra_body=sampling_params,
    )

    # Assemble the JSON record: run metadata first, then benchmark output.
    result_json: dict[str, Any] = {}
    current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
    result_json["date"] = current_dt
    result_json["endpoint_type"] = args.backend
    result_json["label"] = label
    result_json["model_id"] = model_id
    result_json["tokenizer_id"] = tokenizer_id
    result_json["num_prompts"] = args.num_prompts

    # User-supplied KEY=VALUE metadata.
    if args.metadata:
        for item in args.metadata:
            if "=" not in item:
                raise ValueError("Invalid metadata format. Please use KEY=VALUE format.")
            key, value = item.split("=", 1)
            result_json[key.strip()] = value.strip()

    # Traffic configuration.
    result_json["request_rate"] = args.request_rate if args.request_rate < float("inf") else "inf"
    result_json["burstiness"] = args.burstiness
    result_json["max_concurrency"] = args.max_concurrency

    if args.ramp_up_strategy is not None:
        result_json["ramp_up_strategy"] = args.ramp_up_strategy
        result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
        result_json["ramp_up_end_rps"] = args.ramp_up_end_rps

    # Merge with benchmark result.
    result_json = {**result_json, **benchmark_result}

    if not args.save_detailed:
        # Remove fields with too many data points.
        for field in ("input_lens", "output_lens", "ttfts", "itls", "generated_texts", "errors"):
            result_json.pop(field, None)
            benchmark_result.pop(field, None)

    # Save to file.
    if args.save_result or args.append_result:
        base_model_id = model_id.split("/")[-1]
        max_concurrency_str = f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else ""
        label = label or endpoint_type
        if args.ramp_up_strategy is not None:
            file_name = f"{label}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
        else:
            file_name = (
                f"{label}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
            )
        if args.result_filename:
            file_name = args.result_filename
        if args.result_dir:
            os.makedirs(args.result_dir, exist_ok=True)
            file_name = os.path.join(args.result_dir, file_name)
        with open(file_name, mode="a+" if args.append_result else "w", encoding="utf-8") as outfile:
            # Separate appended records with a newline.
            if args.append_result and outfile.tell() != 0:
                outfile.write("\n")
            json.dump(result_json, outfile)
        save_to_pytorch_benchmark_format(args, result_json, file_name)

    return result_json
class BenchmarkThroughputSubcommand(BenchmarkSubcommandBase):
    """The `throughput` subcommand for fastdeploy bench."""

    # Registered as `fastdeploy bench throughput`.
    name = "throughput"
    # This subcommand wraps fastdeploy.benchmarks.throughput (offline batch
    # inference, modeled on vLLM's throughput benchmark); the previous help
    # text said "online serving", which describes the `serve` subcommand.
    help = "Benchmark offline inference throughput."

    @classmethod
    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
        """Register the throughput benchmark's CLI arguments on *parser*."""
        add_cli_args(parser)

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Run the throughput benchmark with the parsed *args*."""
        main(args)
@pytest.fixture
def mock_request_input():
    """A baseline RequestFuncInput shared by the endpoint tests."""
    return RequestFuncInput(
        no=1,
        prompt="test prompt",
        history_QA=None,
        hyper_parameters={},
        api_url="http://test.com/completions",
        prompt_len=10,
        output_len=20,
        model="test-model",
        debug=True,
    )


def _streaming_response(chunks):
    """Build a mock aiohttp response that streams *chunks* with HTTP 200."""
    response = MagicMock()
    response.status = 200
    response.reason = "OK"
    response.__aenter__.return_value = response
    response.content.__aiter__.return_value = chunks
    return response


@pytest.mark.asyncio
async def test_async_request_eb_openai_chat_completions(mock_request_input):
    """Test async_request_eb_openai_chat_completions with mock response"""
    response = _streaming_response(
        [
            b'data: {"choices": [{"delta": {"content": "Hello"}}], "usage": {"prompt_tokens_details": {"cached_tokens": 5}}}\n\n',
            b'data: {"choices": [{"delta": {"content": " World"}}]}\n\n',
            b"data: [DONE]\n\n",
        ]
    )
    with patch("aiohttp.ClientSession.post", return_value=response):
        output = await async_request_eb_openai_chat_completions(mock_request_input)
    assert output.success is True
    assert "Hello World" in output.generated_text
    assert output.ttft > 0


@pytest.mark.asyncio
async def test_async_request_eb_openai_completions(mock_request_input):
    """Test async_request_eb_openai_completions with mock response"""
    response = _streaming_response(
        [
            b'data: {"choices": [{"text": "Test"}]}\n\n',
            b'data: {"choices": [{"text": " response"}]}\n\n',
            b"data: [DONE]\n\n",
        ]
    )
    with patch("aiohttp.ClientSession.post", return_value=response):
        output = await async_request_eb_openai_completions(mock_request_input)
    assert output.success is True
    assert "Test response" in output.generated_text


@pytest.mark.asyncio
async def test_async_request_tgi(mock_request_input):
    """Test async_request_tgi with mock response"""
    mock_request_input.api_url = "http://test.com/generate_stream"
    response = _streaming_response(
        [b'data: {"generated_text": "TGI response", "arrival_time": 1234567890}\n\n', b"data: [DONE]\n\n"]
    )
    with patch("aiohttp.ClientSession.post", return_value=response):
        output = await async_request_tgi(mock_request_input)
    assert output.success is False


@pytest.mark.asyncio
async def test_async_request_trt_llm(mock_request_input):
    """Test async_request_trt_llm with mock response"""
    mock_request_input.api_url = "http://test.com/generate_stream"
    response = _streaming_response([b'data: {"text_output": "TRT LLM response"}\n\n', b"data: [DONE]\n\n"])
    with patch("aiohttp.ClientSession.post", return_value=response):
        output = await async_request_trt_llm(mock_request_input)
    assert output.success is False


@pytest.mark.asyncio
async def test_async_request_openai_completions(mock_request_input):
    """Test async_request_openai_completions with mock response"""
    mock_request_input.api_url = "http://test.com/completions"
    response = _streaming_response(
        [
            b'data: {"choices": [{"text": "OpenAI"}]}\n\n',
            b'data: {"choices": [{"text": " Completions"}]}\n\n',
            b'data: {"usage": {"completion_tokens": 2}}\n\n',
            b"data: [DONE]\n\n",
        ]
    )
    with patch("aiohttp.ClientSession.post", return_value=response):
        output = await async_request_openai_completions(mock_request_input)
    assert output.success is True
    assert "OpenAI Completions" in output.generated_text
    assert output.output_tokens == 2


@pytest.mark.asyncio
async def test_async_request_deepspeed_mii(mock_request_input):
    """Test async_request_deepspeed_mii with mock response"""
    response = MagicMock()
    response.status = 200
    response.__aenter__.return_value = response
    response.json = AsyncMock(return_value={"choices": [{"text": "DeepSpeed MII response"}]})
    with patch("aiohttp.ClientSession.post", return_value=response):
        output = await async_request_deepspeed_mii(mock_request_input)
    assert output.success is True
    assert "DeepSpeed MII response" in output.generated_text


@pytest.mark.asyncio
async def test_async_request_openai_audio(mock_request_input):
    """Test async_request_openai_audio with mock response"""
    pytest.skip("Skipping audio test due to soundfile dependency")

    # Test structure kept for reference; never executed because of the skip.
    mock_request_input.multi_modal_content = {"audio": (b"test", 16000)}
    mock_request_input.api_url = "http://test.com/transcriptions"
    response = _streaming_response([b'data: {"choices": [{"delta": {"content": "test"}}]}\n\n'])
    with patch("aiohttp.ClientSession.post", return_value=response):
        output = await async_request_openai_audio(mock_request_input)
    assert output.success is True


@pytest.mark.asyncio
async def test_async_request_functions_dict():
    """Test ASYNC_REQUEST_FUNCS contains all expected functions"""
    assert len(ASYNC_REQUEST_FUNCS) >= 8
    for backend in ("tgi", "openai-chat", "openai", "tensorrt-llm", "deepspeed-mii", "openai-audio"):
        assert backend in ASYNC_REQUEST_FUNCS
len(OPENAI_COMPATIBLE_BACKENDS) >= 2 + assert "openai-chat" in OPENAI_COMPATIBLE_BACKENDS + assert "vllm" in OPENAI_COMPATIBLE_BACKENDS + + +@pytest.mark.asyncio +async def test_request_func_output_defaults(): + """Test RequestFuncOutput default values""" + output = RequestFuncOutput() + assert output.no == 0 + assert output.generated_text == "" + assert output.success is False + assert output.latency == 0.0 diff --git a/tests/benchmarks/lib/test_utils_benchmarks.py b/tests/benchmarks/lib/test_utils_benchmarks.py new file mode 100644 index 0000000000..c35aaa6b69 --- /dev/null +++ b/tests/benchmarks/lib/test_utils_benchmarks.py @@ -0,0 +1,104 @@ +import json +import os +import tempfile +import unittest +from unittest.mock import MagicMock, patch + +from fastdeploy.benchmarks.lib import utils + + +class TestConvertToPytorchBenchmarkFormat(unittest.TestCase): + def test_empty_metrics(self): + args = MagicMock() + args.model = "test_model" + metrics = {} + extra_info = {} + result = utils.convert_to_pytorch_benchmark_format(args, metrics, extra_info) + self.assertEqual(result, []) + + def test_with_metrics_no_save_env(self): + args = MagicMock() + args.model = "test_model" + args.tensor_parallel_size = 2 + metrics = {"latency": [100, 200]} + extra_info = {"batch_size": 32} + + with patch.dict(os.environ, {"SAVE_TO_PYTORCH_BENCHMARK_FORMAT": "False"}): + with patch.object(utils, "os") as mock_os: + mock_os.environ.get.return_value = False + result = utils.convert_to_pytorch_benchmark_format(args, metrics, extra_info) + self.assertEqual(result, []) + + def test_with_metrics_and_save_env(self): + args = MagicMock() + args.model = "test_model" + args.tensor_parallel_size = 2 + metrics = {"latency": [100, 200]} + extra_info = {"batch_size": 32} + + with patch.dict(os.environ, {"SAVE_TO_PYTORCH_BENCHMARK_FORMAT": "True"}): + result = utils.convert_to_pytorch_benchmark_format(args, metrics, extra_info) + self.assertEqual(len(result), 1) + 
self.assertEqual(result[0]["model"]["name"], "test_model") + self.assertEqual(result[0]["metric"]["name"], "latency") + self.assertEqual(result[0]["metric"]["benchmark_values"], [100, 200]) + + +class TestInfEncoder(unittest.TestCase): + def test_clear_inf_with_dict(self): + encoder = utils.InfEncoder() + data = {"a": float("inf"), "b": 1.0} + result = encoder.clear_inf(data) + self.assertEqual(result, {"a": "inf", "b": 1.0}) + + def test_clear_inf_with_list(self): + encoder = utils.InfEncoder() + data = [float("inf"), 1.0] + result = encoder.clear_inf(data) + self.assertEqual(result, ["inf", 1.0]) + + def test_clear_inf_with_other_types(self): + encoder = utils.InfEncoder() + self.assertEqual(encoder.clear_inf("test"), "test") + self.assertEqual(encoder.clear_inf(123), 123) + self.assertEqual(encoder.clear_inf(None), None) + + +class TestWriteToJson(unittest.TestCase): + def test_write_to_json(self): + test_data = [{"key": "value"}, {"key2": 123}] + + with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file: + temp_file_path = temp_file.name + + try: + utils.write_to_json(temp_file_path, test_data) + + with open(temp_file_path, "r") as f: + loaded_data = json.load(f) + + self.assertEqual(loaded_data, test_data) + finally: + if os.path.exists(temp_file_path): + os.remove(temp_file_path) + + def test_write_to_json_with_inf(self): + test_data = [{"key": float("inf")}] + + with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file: + temp_file_path = temp_file.name + + try: + utils.write_to_json(temp_file_path, test_data) + + with open(temp_file_path, "r") as f: + loaded_data = json.load(f) + + self.assertEqual(loaded_data, [{"key": "inf"}]) + finally: + if os.path.exists(temp_file_path): + os.remove(temp_file_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/benchmarks/test_datasets_benchmarks.py b/tests/benchmarks/test_datasets_benchmarks.py new file mode 100644 index 0000000000..75d3451b3b --- /dev/null +++ 
b/tests/benchmarks/test_datasets_benchmarks.py @@ -0,0 +1,151 @@ +import io +import json +from argparse import ArgumentParser, Namespace + +import pytest +from PIL import Image + +import fastdeploy.benchmarks.datasets as bd + + +class DummyTokenizer: + vocab_size = 100 + + def num_special_tokens_to_add(self): + return 1 + + def decode(self, ids): + return "dummy_text" + + def encode(self, text, add_special_tokens=False): + return list(range(len(text))) + + +def make_temp_json(tmp_path, content): + fpath = tmp_path / "data.json" + with open(fpath, "w", encoding="utf-8") as f: + for line in content: + f.write(json.dumps(line) + "\n") + return str(fpath) + + +def test_is_valid_sequence_variants(): + assert bd.is_valid_sequence(10, 10) + assert not bd.is_valid_sequence(1, 10) # prompt too short + assert not bd.is_valid_sequence(10, 1) # output too short + assert not bd.is_valid_sequence(2000, 10, max_prompt_len=100) + assert not bd.is_valid_sequence(2000, 100, max_total_len=200) + # skip min output len + assert bd.is_valid_sequence(10, 1, skip_min_output_len_check=True) + + +def test_process_image_with_pil_and_str(tmp_path): + # dict input with raw bytes + img = Image.new("RGB", (10, 10), color="red") + buf = io.BytesIO() + img.save(buf, format="PNG") + raw_dict = {"bytes": buf.getvalue()} + out = bd.process_image(raw_dict) + assert "image_url" in out + + # PIL image input + out2 = bd.process_image(img) + assert out2["type"] == "image_url" + assert out2["image_url"]["url"].startswith("data:image/jpeg;base64,") + + # str input + out3 = bd.process_image("path/to/file") + assert out3["image_url"]["url"].startswith("file://") + + out4 = bd.process_image("http://abc.com/img.png") + assert out4["image_url"]["url"].startswith("http://") + + # invalid input + with pytest.raises(ValueError): + bd.process_image(123) + + +def test_maybe_oversample_requests(caplog): + dataset = bd.RandomDataset() + requests = [bd.SampleRequest(1, "a", [], None, 10, 20)] + 
dataset.maybe_oversample_requests(requests, 3)
+    assert len(requests) >= 3
+
+def test_EBDataset_and_EBChatDataset(tmp_path):
+    eb_content = [
+        {
+            "text": "hello",
+            "temperature": 0.7,
+            "penalty_score": 1.0,
+            "frequency_score": 1.0,
+            "presence_score": 1.0,
+            "topp": 0.9,
+            "input_token_num": 5,
+            "max_dec_len": 10,
+        }
+    ]
+    eb_file = make_temp_json(tmp_path, eb_content)
+    eb = bd.EBDataset(dataset_path=eb_file, shuffle=True)
+    samples = eb.sample(2)
+    assert all(isinstance(s, bd.SampleRequest) for s in samples)
+    assert all(s.json_data is not None for s in samples)
+
+    chat_content = [{"messages": [{"role": "user", "content": "hi"}], "max_tokens": 20}]
+    chat_file = make_temp_json(tmp_path, chat_content)
+    chat = bd.EBChatDataset(dataset_path=chat_file, shuffle=True)
+    samples2 = chat.sample(2, enable_multimodal_chat=False)
+    assert all(isinstance(s, bd.SampleRequest) for s in samples2)
+    assert all(s.json_data is not None for s in samples2)
+
+
+def test_RandomDataset_sample():
+    tok = DummyTokenizer()
+    dataset = bd.RandomDataset(random_seed=123)
+    samples = dataset.sample(tok, 2, prefix_len=2, range_ratio=0.1)
+    assert len(samples) == 2
+    assert all(isinstance(s, bd.SampleRequest) for s in samples)
+
+    # range_ratio >= 1 should raise
+    with pytest.raises(AssertionError):
+        dataset.sample(tok, 1, range_ratio=1.0)
+
+
+def test__ValidateDatasetArgs_and_get_samples(tmp_path):
+    parser = ArgumentParser()
+    parser.add_argument("--dataset-name", default="random")
+    parser.add_argument("--dataset-path", action=bd._ValidateDatasetArgs)
+
+    # invalid: random + dataset-path
+    with pytest.raises(SystemExit):
+        parser.parse_args(["--dataset-path", "abc.json"])
+
+    # test get_samples with EBChat
+    chat_content = [
+        {
+            "messages": [
+                {"role": "user", "content": "hello"},
+                {"role": "assistant", "content": "hi there"},
+                {"role": "user", "content": "how are you?"},
+            ],
+            "max_tokens": 10,
+        }
+    ]
+    chat_file = make_temp_json(tmp_path, chat_content)
+    args = 
Namespace( + dataset_name="EBChat", dataset_path=chat_file, seed=0, shuffle=False, num_prompts=1, sharegpt_output_len=10 + ) + out = bd.get_samples(args) + assert isinstance(out, list) + + # unknown dataset + args.dataset_name = "unknown" + with pytest.raises(ValueError): + bd.get_samples(args) + + +def test_add_dataset_parser(): + parser = bd.FlexibleArgumentParser() + bd.add_dataset_parser(parser) + args = parser.parse_args([]) + assert hasattr(args, "seed") + assert hasattr(args, "num_prompts") diff --git a/tests/benchmarks/test_latency_benchmarks.py b/tests/benchmarks/test_latency_benchmarks.py new file mode 100644 index 0000000000..6d92b9366c --- /dev/null +++ b/tests/benchmarks/test_latency_benchmarks.py @@ -0,0 +1,102 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import unittest +from unittest.mock import MagicMock, patch + +import numpy as np + +from fastdeploy.benchmarks.latency import add_cli_args, main + + +class TestLatency(unittest.TestCase): + def test_add_cli_args(self): + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + self.assertEqual(args.input_len, 32) + self.assertEqual(args.output_len, 128) + self.assertEqual(args.batch_size, 8) + + @patch("fastdeploy.LLM") + @patch("numpy.random.randint") + @patch("tqdm.tqdm") + def test_main(self, mock_tqdm, mock_randint, mock_llm): + # Setup mocks + mock_llm_instance = MagicMock() + mock_llm.return_value = mock_llm_instance + mock_cfg = MagicMock() + mock_cfg.max_model_len = 2048 + mock_llm_instance.llm_engine.cfg = mock_cfg + + mock_randint.return_value = np.zeros((8, 32)) + mock_tqdm.return_value = range(10) + + # Build args using parser + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + + # Set required args + args.input_len = 32 + args.output_len = 128 + args.batch_size = 8 + args.n = 1 + args.num_iters_warmup = 2 + args.num_iters = 3 + args.model = "test_model" + args.served_model_name = "test_model" + args.tokenizer = "test_tokenizer" + + # Run test + main(args) + + # Verify calls + mock_llm.assert_called_once() + mock_llm_instance.generate.assert_called() + + @patch("fastdeploy.LLM") + @patch("sys.exit") + def test_main_profile_error(self, mock_exit, mock_llm): + # Setup mocks + mock_llm_instance = MagicMock() + mock_llm.return_value = mock_llm_instance + mock_cfg = MagicMock() + mock_cfg.max_model_len = 2048 + mock_llm_instance.llm_engine.cfg = mock_cfg + + # Build args using parser + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + + # Set required args + args.input_len = 32 + args.output_len = 128 + args.batch_size = 8 + args.n = 1 + args.num_iters_warmup = 2 + args.num_iters = 3 + args.profile = False + args.model = 
"test_model" + args.served_model_name = "test_model" + args.tokenizer = "test_tokenizer" + + main(args) + mock_exit.assert_not_called() # Since profile=False, exit should not be called + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/benchmarks/test_serve_benchmarks.py b/tests/benchmarks/test_serve_benchmarks.py new file mode 100644 index 0000000000..e75c28af15 --- /dev/null +++ b/tests/benchmarks/test_serve_benchmarks.py @@ -0,0 +1,397 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import unittest +from unittest import IsolatedAsyncioTestCase +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from fastdeploy.benchmarks.serve import ( + BenchmarkMetrics, + add_cli_args, + benchmark, + calculate_metrics, + check_goodput_args, + convert_to_pytorch_benchmark_format, + get_request, + save_to_pytorch_benchmark_format, + write_to_json, +) + + +class TestServe(IsolatedAsyncioTestCase): + def test_add_cli_args(self): + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args(["--model", "test_model"]) + self.assertEqual(args.backend, "openai-chat") + self.assertEqual(args.host, "127.0.0.1") + self.assertEqual(args.port, 8000) + self.assertEqual(args.model, "test_model") + + def test_benchmark_metrics_init(self): + metrics = BenchmarkMetrics( + completed=10, + total_input=100, + total_output=200, + request_throughput=5.0, + request_goodput=4.0, + output_throughput=10.0, + total_token_throughput=15.0, + mean_s_decode=0.5, + median_s_decode=0.5, + std_s_decode=0.1, + percentiles_s_decode=[(99, 0.6)], + mean_ttft_ms=100.0, + median_ttft_ms=100.0, + std_ttft_ms=10.0, + percentiles_ttft_ms=[(99, 110.0)], + mean_s_ttft_ms=90.0, + median_s_ttft_ms=90.0, + std_s_ttft_ms=9.0, + percentiles_s_ttft_ms=[(99, 100.0)], + mean_tpot_ms=50.0, + median_tpot_ms=50.0, + std_tpot_ms=5.0, + percentiles_tpot_ms=[(99, 60.0)], + mean_itl_ms=20.0, + median_itl_ms=20.0, + std_itl_ms=2.0, + percentiles_itl_ms=[(99, 25.0)], + mean_s_itl_ms=18.0, + median_s_itl_ms=18.0, + std_s_itl_ms=1.8, + percentiles_s_itl_ms=[(99, 20.0)], + mean_e2el_ms=500.0, + median_e2el_ms=500.0, + std_e2el_ms=50.0, + percentiles_e2el_ms=[(99, 600.0)], + mean_s_e2el_ms=450.0, + median_s_e2el_ms=450.0, + std_s_e2el_ms=45.0, + percentiles_s_e2el_ms=[(99, 500.0)], + mean_input_len=10.0, + median_input_len=10.0, + std_input_len=1.0, + percentiles_input_len=[(99, 12.0)], + mean_s_input_len=9.0, + median_s_input_len=9.0, + 
std_s_input_len=0.9, + percentiles_s_input_len=[(99, 10.0)], + mean_output_len=20.0, + median_output_len=20.0, + std_output_len=2.0, + percentiles_output_len=[(99, 25.0)], + ) + self.assertEqual(metrics.completed, 10) + self.assertEqual(metrics.total_input, 100) + self.assertEqual(metrics.total_output, 200) + + def test_calculate_metrics(self): + from fastdeploy.benchmarks.datasets import SampleRequest + from fastdeploy.benchmarks.lib.endpoint_request_func import RequestFuncOutput + + input_requests = [ + SampleRequest(no=1, prompt="test1", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None) + ] + outputs = [ + RequestFuncOutput( + success=True, + prompt_len=10, + prompt_tokens=10, + output_tokens=20, + ttft=0.1, + itl=[0.02, 0.02, 0.02], + latency=0.5, + arrival_time=[0, 0.1, 0.12, 0.14, 0.16], + generated_text="test output", + reasoning_content=None, + error=None, + ) + ] + metrics, _ = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=1.0, + selected_percentiles=[99], + goodput_config_dict={}, + ) + self.assertEqual(metrics.completed, 1) + self.assertEqual(metrics.total_input, 10) + self.assertEqual(metrics.total_output, 20) + + @pytest.mark.asyncio + @patch("fastdeploy.benchmarks.serve.get_request") + @patch("asyncio.gather", new_callable=AsyncMock) + async def test_benchmark(self, mock_gather, mock_get_request): + # 直接在测试中设置ASYNC_REQUEST_FUNCS + from fastdeploy.benchmarks.serve import ASYNC_REQUEST_FUNCS + + mock_func = AsyncMock() + ASYNC_REQUEST_FUNCS["test_backend"] = mock_func + from fastdeploy.benchmarks.datasets import SampleRequest + + # 创建一个异步生成器函数来模拟get_request + async def mock_request_gen(): + yield SampleRequest( + no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None + ) + + mock_get_request.return_value = mock_request_gen() + mock_func.return_value = MagicMock( + success=True, + prompt_len=10, + prompt_tokens=10, + output_tokens=20, + ttft=0.1, + itl=[0.02, 0.02, 
0.02], + latency=0.5, + arrival_time=[0, 0.1, 0.12, 0.14, 0.16], + generated_text="test output", + reasoning_content=None, + error=None, + ) + + result = await benchmark( + backend="test_backend", + api_url="http://test", + base_url="http://test", + model_id="test_model", + model_name="test_model", + input_requests=[ + SampleRequest( + no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None + ) + ], + hyper_parameters={}, + logprobs=None, + request_rate=1.0, + burstiness=1.0, + disable_tqdm=True, + profile=False, + selected_percentile_metrics=["ttft", "tpot", "itl"], + selected_percentiles=[99], + ignore_eos=False, + debug=False, + goodput_config_dict={}, + max_concurrency=None, + lora_modules=None, + extra_body=None, + ) + self.assertEqual(result["total_input_tokens"], 0) + + @pytest.mark.asyncio + @patch("asyncio.sleep", new_callable=AsyncMock) + async def test_get_request(self, mock_sleep): + from fastdeploy.benchmarks.datasets import SampleRequest + + input_requests = [ + SampleRequest(no=1, prompt="test1", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None), + SampleRequest(no=2, prompt="test2", prompt_len=15, expected_output_len=25, history_QA=[], json_data=None), + ] + + # Test infinite request rate + count = 0 + async for _ in get_request(input_requests, float("inf")): + count += 1 + if count >= 2: + break + self.assertEqual(count, 2) + + # Test finite request rate + mock_sleep.return_value = None + count = 0 + async for _ in get_request(input_requests, 1.0, 1.0): + count += 1 + if count >= 2: + break + self.assertEqual(count, 2) + mock_sleep.assert_called() + + def test_check_goodput_args(self): + # Test valid goodput args + class Args: + goodput = ["ttft:100", "tpot:50"] + + goodput_config = check_goodput_args(Args()) + self.assertEqual(goodput_config["ttft"], 100) + self.assertEqual(goodput_config["tpot"], 50) + + # Test invalid goodput args + class InvalidArgs: + goodput = ["invalid:100"] + + with 
self.assertRaises(ValueError): + check_goodput_args(InvalidArgs()) + + @patch("os.environ.get", return_value="1") + def test_convert_to_pytorch_benchmark_format(self, mock_env): + class Args: + model = "test_model" + + metrics = {"mean_ttft_ms": [100.0], "median_ttft_ms": [100.0]} + extra_info = {"tensor_parallel_size": 1} + records = convert_to_pytorch_benchmark_format(Args(), metrics, extra_info) + self.assertEqual(len(records), 2) + self.assertEqual(records[0]["model"]["name"], "test_model") + + @patch("builtins.open", new_callable=MagicMock) + @patch("json.dump") + def test_write_to_json(self, mock_dump, mock_open): + records = [{"test": "data"}] + write_to_json("test.json", records) + mock_dump.assert_called_once() + + @patch("os.environ.get", return_value="1") + @patch("builtins.open", new_callable=MagicMock) + @patch("json.dump") + def test_save_to_pytorch_benchmark_format(self, mock_dump, mock_open, mock_env): + class Args: + model = "test_model" + + results = { + "mean_ttft_ms": 100.0, + "median_ttft_ms": 100.0, + "std_ttft_ms": 10.0, + "p99_ttft_ms": 110.0, + "mean_tpot_ms": 50.0, + "median_tpot_ms": 50.0, + "std_tpot_ms": 5.0, + "p99_tpot_ms": 60.0, + "median_itl_ms": 20.0, + "mean_itl_ms": 20.0, + "std_itl_ms": 2.0, + "p99_itl_ms": 25.0, + } + save_to_pytorch_benchmark_format(Args(), results, "test.json") + mock_dump.assert_called_once() + + @pytest.mark.asyncio + @patch("builtins.open", new_callable=MagicMock) + @patch("yaml.safe_load") + @patch("fastdeploy.benchmarks.serve.benchmark", new_callable=AsyncMock) + @patch("fastdeploy.benchmarks.serve.get_samples", new_callable=MagicMock) + @patch("fastdeploy.benchmarks.serve.add_cli_args") + @patch("argparse.ArgumentParser.parse_args") + async def test_main_async( + self, mock_parse_args, mock_add_cli_args, mock_get_samples, mock_benchmark, mock_safe_load, mock_open + ): + """Test main_async function with successful execution""" + from fastdeploy.benchmarks.datasets import SampleRequest + from 
fastdeploy.benchmarks.serve import main_async + + # Setup mock args + mock_args = MagicMock() + mock_args.backend = "openai-chat" # Use openai-compatible backend + mock_args.model = "test_model" + mock_args.request_rate = float("inf") + mock_args.burstiness = 1.0 + mock_args.disable_tqdm = True + mock_args.profile = False + mock_args.ignore_eos = False + mock_args.debug = False + mock_args.max_concurrency = None + mock_args.lora_modules = None + mock_args.extra_body = None + mock_args.percentile_metrics = "ttft,tpot,itl" + mock_args.metric_percentiles = "99" + mock_args.goodput = None + mock_args.ramp_up_strategy = "1" + mock_args.ramp_up_start_rps = 1 + mock_args.ramp_up_end_rps = 1 + mock_args.dataset_name = "EB" + mock_args.dataset_path = MagicMock() + mock_args.dataset_split = None + mock_args.dataset_sample_ratio = 1.0 + mock_args.dataset_shard_size = None + mock_args.dataset_shard_rank = None + mock_args.dataset_shuffle_seed = None + mock_args.top_p = 0.9 # Add sampling parameters for openai-compatible backend + mock_args.top_k = 50 + mock_args.temperature = 0.7 + mock_args.result_dir = MagicMock() # Mock result_dir + mock_args.result_filename = MagicMock() # Mock result_filename + mock_args.save_result = True # Enable file saving for test + mock_args.save_detailed = False + mock_args.append_result = False + mock_args.hyperparameter_path = "test_params.yaml" + mock_parse_args.return_value = mock_args + + # Mock YAML loading + mock_safe_load.return_value = {"param1": "value1", "param2": 42} + + # Mock file operations + mock_file = MagicMock() + mock_file.tell.return_value = 100 # Simulate non-empty file for append test + mock_open.return_value.__enter__.return_value = mock_file + + # Mock get_samples return value + mock_get_samples.return_value = [ + SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None) + ] + + # Mock benchmark return value with complete JSON-serializable data + mock_benchmark.return_value = { 
+ "completed": 1, + "total_input_tokens": 10, + "total_output_tokens": 20, + "request_throughput": 1.0, + "mean_ttft_ms": 100.0, + "median_ttft_ms": 100.0, + "std_ttft_ms": 10.0, + "p99_ttft_ms": 110.0, + "mean_tpot_ms": 50.0, + "median_tpot_ms": 50.0, + "std_tpot_ms": 5.0, + "p99_tpot_ms": 60.0, + "median_itl_ms": 20.0, + "mean_itl_ms": 20.0, + "std_itl_ms": 2.0, + "p99_itl_ms": 25.0, + "hyper_parameters": {"param1": "value1", "param2": 42}, + "input_requests": [ + { + "no": 1, + "prompt": "test", + "prompt_len": 10, + "expected_output_len": 20, + "history_QA": [], + "json_data": None, + } + ], + } + + # Mock json.dump to verify serialization + with patch("json.dump") as mock_json_dump: + # Call main_async with args + await main_async(mock_args) + + # Verify mocks were called + mock_get_samples.assert_called_once() + + # Verify YAML file was loaded + mock_open.assert_any_call("test_params.yaml", "r") + mock_safe_load.assert_called_once() + + # Verify json.dump was called with serializable data + mock_json_dump.assert_called_once() + args, _ = mock_json_dump.call_args + self.assertIsInstance(args[0], dict) # Verify data is dict (JSON-serializable) + self.assertIn("completed", args[0]) # Verify benchmark results are included + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/benchmarks/test_throughput_benchmarks.py b/tests/benchmarks/test_throughput_benchmarks.py new file mode 100644 index 0000000000..88a05c974c --- /dev/null +++ b/tests/benchmarks/test_throughput_benchmarks.py @@ -0,0 +1,485 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import unittest +from unittest.mock import MagicMock, patch + +try: + import torch + + TORCH_AVAILABLE = True +except (ImportError, NameError, AttributeError, OSError): + TORCH_AVAILABLE = False + +from fastdeploy.benchmarks.datasets import SampleRequest +from fastdeploy.benchmarks.throughput import ( + EngineArgs, + add_cli_args, + get_requests, + main, + run_fd, + run_fd_chat, + run_hf, + validate_args, +) + + +class TestThroughput(unittest.TestCase): + @patch("fastdeploy.LLM") + def test_run_fd(self, mock_llm): + mock_llm_instance = MagicMock() + mock_llm.return_value = mock_llm_instance + mock_llm_instance.generate.return_value = ["output1", "output2"] + # Mock cfg.max_model_len + mock_cfg = MagicMock() + mock_cfg.max_model_len = 2048 + mock_llm_instance.llm_engine.cfg = mock_cfg + + requests = [ + SampleRequest( + no=1, prompt="test prompt", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None + ) + ] + engine_args = EngineArgs(model="test_model") + + elapsed_time, outputs = run_fd(requests, n=1, engine_args=engine_args) + self.assertIsInstance(elapsed_time, float) + self.assertEqual(len(outputs), 2) + + @patch("fastdeploy.LLM") + def test_run_fd_chat(self, mock_llm): + mock_llm_instance = MagicMock() + mock_llm.return_value = mock_llm_instance + mock_llm_instance.chat.return_value = ["chat output1", "chat output2"] + # Mock cfg.max_model_len + mock_cfg = MagicMock() + mock_cfg.max_model_len = 2048 + mock_llm_instance.llm_engine.cfg = mock_cfg + + requests = [ + SampleRequest( + no=1, prompt="test chat prompt", 
prompt_len=10, expected_output_len=20, history_QA=[], json_data=None + ) + ] + engine_args = EngineArgs(model="test_model") + + elapsed_time, outputs = run_fd_chat(requests, n=1, engine_args=engine_args) + self.assertIsInstance(elapsed_time, float) + self.assertEqual(len(outputs), 2) + + @unittest.skipIf(not TORCH_AVAILABLE, "PyTorch is not available") + @patch("transformers.AutoModelForCausalLM.from_pretrained") + @patch("transformers.AutoTokenizer.from_pretrained") + def test_run_hf(self, mock_tokenizer, mock_model): + mock_model_instance = MagicMock() + mock_model.return_value = mock_model_instance + mock_model_instance.generate.return_value = torch.tensor([[1, 2, 3]]) if TORCH_AVAILABLE else None + + mock_tokenizer_instance = MagicMock() + mock_tokenizer.return_value = mock_tokenizer_instance + mock_tokenizer_instance.pad_token = "pad" + + requests = [ + SampleRequest( + no=1, prompt="test hf prompt", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None + ) + ] + + elapsed_time = run_hf( + requests, + model="test_model", + tokenizer=mock_tokenizer_instance, + n=1, + max_batch_size=4, + trust_remote_code=True, + ) + self.assertIsInstance(elapsed_time, float) + + @patch("fastdeploy.benchmarks.datasets.RandomDataset") + def test_get_requests(self, mock_dataset): + mock_dataset_instance = MagicMock() + mock_dataset.return_value = mock_dataset_instance + mock_dataset_instance.sample.return_value = [ + SampleRequest(no=1, prompt="test1", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None), + SampleRequest(no=2, prompt="test2", prompt_len=15, expected_output_len=25, history_QA=[], json_data=None), + ] + + args = argparse.Namespace( + dataset_name="random", + dataset_path=None, + seed=42, + input_len=10, + output_len=20, + num_prompts=2, + hf_max_batch_size=4, + lora_path=None, + random_range_ratio=0.0, + prefix_len=0, + ) + tokenizer = MagicMock() + tokenizer.vocab_size = 10000 # 设置合理的词汇表大小 + 
tokenizer.num_special_tokens_to_add.return_value = 0 # 设置特殊token数量 + + requests = get_requests(args, tokenizer) + self.assertEqual(len(requests), 2) + + def test_validate_args(self): + # Test basic validation + args = argparse.Namespace( + backend="fastdeploy", + dataset_name="random", + dataset=None, + dataset_path=None, + input_len=10, + output_len=20, + tokenizer=None, + model="test_model", + hf_max_batch_size=None, + trust_remote_code=False, + quantization=None, + ) + validate_args(args) + self.assertEqual(args.tokenizer, "test_model") + + def test_add_cli_args(self): + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + self.assertEqual(args.backend, "fastdeploy") + self.assertEqual(args.dataset_name, "random") + + @patch("fastdeploy.benchmarks.throughput.run_fd") + @patch("fastdeploy.benchmarks.throughput.get_requests") + @patch("transformers.AutoTokenizer.from_pretrained") + def test_main_fastdeploy(self, mock_tokenizer, mock_get_requests, mock_run_fd): + mock_get_requests.return_value = [ + SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None) + ] + mock_run_fd.return_value = (1.0, ["output1", "output2"]) + + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = "fastdeploy" + args.dataset_name = "random" + args.dataset_path = None + args.seed = 42 + args.input_len = 10 + args.output_len = 20 + args.num_prompts = 1 + args.tokenizer = "test_tokenizer" + args.model = "test_model" + args.n = 1 + args.hf_max_batch_size = None + args.trust_remote_code = False + args.output_json = None + args.disable_detokenize = False + args.tensor_parallel_size = 1 + + with patch("builtins.print") as mock_print: + main(args) + mock_print.assert_called() + + @unittest.skipIf(not TORCH_AVAILABLE, "PyTorch is not available") + @patch("fastdeploy.benchmarks.throughput.run_hf") + @patch("fastdeploy.benchmarks.throughput.get_requests") + 
@patch("transformers.AutoTokenizer.from_pretrained") + @patch("transformers.AutoModelForCausalLM.from_pretrained") + def test_main_hf(self, mock_model, mock_tokenizer, mock_get_requests, mock_run_hf): + mock_get_requests.return_value = [ + SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None) + ] + mock_run_hf.return_value = 1.0 + + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = "hf" + args.dataset_name = "random" + args.dataset_path = None + args.seed = 42 + args.input_len = 10 + args.output_len = 20 + args.num_prompts = 1 + args.tokenizer = "test_tokenizer" + args.model = "test_model" + args.n = 1 + args.hf_max_batch_size = 4 + args.trust_remote_code = True + args.output_json = None + args.disable_detokenize = False + args.tensor_parallel_size = 1 + + with patch("builtins.print") as mock_print: + main(args) + mock_print.assert_called() + + @patch("fastdeploy.benchmarks.throughput.run_fd_chat") + @patch("fastdeploy.benchmarks.throughput.get_requests") + @patch("transformers.AutoTokenizer.from_pretrained") + def test_main_fastdeploy_chat(self, mock_tokenizer, mock_get_requests, mock_run_fd_chat): + mock_get_requests.return_value = [ + SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None) + ] + mock_run_fd_chat.return_value = (1.0, ["output1", "output2"]) + + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = "fastdeploy-chat" + args.dataset_name = "random" + args.dataset_path = None + args.seed = 42 + args.input_len = 10 + args.output_len = 20 + args.num_prompts = 1 + args.tokenizer = "test_tokenizer" + args.model = "test_model" + args.n = 1 + args.hf_max_batch_size = None + args.trust_remote_code = False + args.output_json = None + args.disable_detokenize = False + args.tensor_parallel_size = 1 + + with patch("builtins.print") as mock_print: + 
main(args) + mock_print.assert_called() + + @patch("builtins.open") + @patch("json.dump") + @patch("fastdeploy.benchmarks.throughput.run_fd") + @patch("fastdeploy.benchmarks.throughput.get_requests") + def test_main_with_output_json(self, mock_get_requests, mock_run_fd, mock_json_dump, mock_open): + mock_get_requests.return_value = [ + SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None) + ] + mock_run_fd.return_value = (1.0, ["output1", "output2"]) + + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = "fastdeploy" + args.dataset_name = "random" + args.dataset_path = None + args.seed = 42 + args.input_len = 10 + args.output_len = 20 + args.num_prompts = 1 + args.tokenizer = "test_tokenizer" + args.model = "test_model" + args.n = 1 + args.hf_max_batch_size = None + args.trust_remote_code = False + args.output_json = "output.json" + args.disable_detokenize = False + args.tensor_parallel_size = 1 + + main(args) + mock_json_dump.assert_called() + + # 新增测试用例覆盖缺失的行 + def test_validate_args_with_lora(self): + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = "fastdeploy" # LoRA只支持vLLM后端 + args.dataset_name = "random" + args.enable_lora = True + args.lora_path = "/path/to/lora" + args.input_len = 10 + args.output_len = 20 + args.tokenizer = "test_tokenizer" + args.model = "test_model" + validate_args(args) + + def test_validate_args_with_hf_backend(self): + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = "hf" + args.dataset_name = "random" + args.hf_max_batch_size = 4 + args.input_len = 10 + args.output_len = 20 + args.tokenizer = "test_tokenizer" + args.model = "test_model" + validate_args(args) + + def test_validate_args_with_quantization(self): + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = 
"fastdeploy" + args.dataset_name = "random" + args.quantization = "w4a8" + args.input_len = 10 + args.output_len = 20 + args.tokenizer = "test_tokenizer" + args.model = "test_model" + validate_args(args) + + @patch("fastdeploy.benchmarks.throughput.write_to_json") + @patch("fastdeploy.benchmarks.throughput.convert_to_pytorch_benchmark_format") + def test_save_to_pytorch_benchmark_format(self, mock_convert, mock_write): + args = argparse.Namespace( + output_json="test.json", + model="test_model", + input_len=10, + output_len=20, + backend="fastdeploy", + ) + results = { + "elapsed_time": 1.0, + "num_requests": 10, + "total_num_tokens": 100, + "requests_per_second": 10.0, + "tokens_per_second": 100.0, + } + mock_convert.return_value = [{"metrics": {"requests_per_second": 10.0}}] + from fastdeploy.benchmarks.throughput import save_to_pytorch_benchmark_format + + save_to_pytorch_benchmark_format(args, results) + mock_write.assert_called() + + @patch("fastdeploy.benchmarks.throughput.run_fd") + @patch("fastdeploy.benchmarks.throughput.get_requests") + def test_main_with_disable_detokenize(self, mock_get_requests, mock_run_fd): + mock_get_requests.return_value = [ + SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None) + ] + mock_run_fd.return_value = (1.0, ["output1", "output2"]) + + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = "fastdeploy" + args.dataset_name = "random" + args.dataset_path = None + args.seed = 42 + args.input_len = 10 + args.output_len = 20 + args.num_prompts = 1 + args.tokenizer = "test_tokenizer" + args.model = "test_model" + args.n = 1 + args.hf_max_batch_size = None + args.trust_remote_code = False + args.output_json = None + args.disable_detokenize = True + args.tensor_parallel_size = 1 + + with patch("builtins.print") as mock_print: + main(args) + mock_print.assert_called() + + def test_validate_args_with_random_range_ratio(self): + 
parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = "fastdeploy" + args.dataset_name = "random" + args.random_range_ratio = 0.5 + args.input_len = 10 + args.output_len = 20 + args.tokenizer = "test_tokenizer" + args.model = "test_model" + validate_args(args) + + def test_validate_args_with_prefix_len(self): + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = "fastdeploy" + args.dataset_name = "random" + args.prefix_len = 5 + args.input_len = 10 + args.output_len = 20 + args.tokenizer = "test_tokenizer" + args.model = "test_model" + validate_args(args) + + def test_validate_args_with_eb_dataset(self): + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = "fastdeploy" + args.dataset_name = "EB" + args.dataset_path = "/path/to/eb" + args.tokenizer = "test_tokenizer" + args.model = "test_model" + validate_args(args) + + def test_validate_args_with_ebchat_dataset(self): + parser = argparse.ArgumentParser() + add_cli_args(parser) + args = parser.parse_args([]) + args.backend = "fastdeploy-chat" + args.dataset_name = "EBChat" + args.dataset_path = "/path/to/ebchat" + args.tokenizer = "test_tokenizer" + args.model = "test_model" + validate_args(args) + + def test_add_cli_args_with_all_options(self): + parser = argparse.ArgumentParser() + add_cli_args(parser) + # 使用parse_known_args避免未识别参数导致的SystemExit + args, _ = parser.parse_known_args( + [ + "--backend", + "fastdeploy-chat", + "--dataset-name", + "EBChat", + "--dataset-path", + "/path/to/dataset", + "--input-len", + "10", + "--output-len", + "20", + "--n", + "2", + "--num-prompts", + "50", + "--hf-max-batch-size", + "4", + "--output-json", + "output.json", + "--disable-detokenize", + "--lora-path", + "/path/to/lora", + "--prefix-len", + "5", + "--random-range-ratio", + "0.5", + ] + ) + self.assertEqual(args.backend, "fastdeploy-chat") + 
self.assertEqual(args.dataset_name, "EBChat") + self.assertEqual(args.dataset_path, "/path/to/dataset") + self.assertEqual(args.input_len, 10) + self.assertEqual(args.output_len, 20) + self.assertEqual(args.n, 2) + self.assertEqual(args.num_prompts, 50) + self.assertEqual(args.hf_max_batch_size, 4) + self.assertEqual(args.output_json, "output.json") + self.assertTrue(args.disable_detokenize) + self.assertEqual(args.lora_path, "/path/to/lora") + self.assertEqual(args.prefix_len, 5) + self.assertEqual(args.random_range_ratio, 0.5) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/entrypoints/cli/benchmark/test_eval.py b/tests/entrypoints/cli/benchmark/test_eval.py new file mode 100644 index 0000000000..010fdb45ad --- /dev/null +++ b/tests/entrypoints/cli/benchmark/test_eval.py @@ -0,0 +1,275 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import unittest +from unittest.mock import MagicMock, patch + +import pkg_resources + +from fastdeploy.entrypoints.cli.benchmark.eval import ( + BenchmarkEvalSubcommand, + _int_or_none_list_arg_type, + try_parse_json, +) + + +class TestIntOrNoneListArgType(unittest.TestCase): + def test_single_value(self): + result = _int_or_none_list_arg_type(3, 4, "1,2,3,4", "5") + self.assertEqual(result, [5, 5, 5, 5]) + + def test_multiple_values(self): + result = _int_or_none_list_arg_type(3, 4, "1,2,3,4", "5,6,7,8") + self.assertEqual(result, [5, 6, 7, 8]) + + def test_none_value(self): + result = _int_or_none_list_arg_type(3, 4, "1,2,3,4", "None,6,None,8") + self.assertEqual(result, [None, 6, None, 8]) + + def test_partial_values(self): + result = _int_or_none_list_arg_type(3, 4, "1,2,3,4", "5,6,7") + self.assertEqual(result, [5, 6, 7, 4]) + + def test_invalid_input(self): + with self.assertRaises(argparse.ArgumentTypeError): + _int_or_none_list_arg_type(3, 4, "1,2,3,4", "5,6,7,8,9") + + +class TestTryParseJson(unittest.TestCase): + def test_valid_json(self): + result = try_parse_json('{"key": "value"}') + self.assertEqual(result, {"key": "value"}) + + def test_invalid_json(self): + result = try_parse_json("not a json") + self.assertEqual(result, "not a json") + + def test_none_input(self): + result = try_parse_json(None) + self.assertIsNone(result) + + def test_invalid_json_with_braces(self): + with self.assertRaises(argparse.ArgumentTypeError): + try_parse_json("{invalid: json}") + + +class TestBenchmarkEvalSubcommand(unittest.TestCase): + def setUp(self): + self.parser = argparse.ArgumentParser() + BenchmarkEvalSubcommand.add_cli_args(self.parser) + self.mock_pkg_resources = MagicMock() + + def test_add_cli_args(self): + args = self.parser.parse_args(["--model", "test_model"]) + self.assertEqual(args.model, "test_model") + + @patch("subprocess.run") + @patch("pkg_resources.get_distribution") + def test_cmd_basic(self, mock_get_dist, mock_run): + 
mock_get_dist.return_value.version = "0.4.9.1" + mock_run.return_value = MagicMock(returncode=0) + + args = argparse.Namespace( + model="hf", + tasks="test_task", + model_args="pretrained=test_model", + batch_size="1", + output_path=None, + write_out=False, + num_fewshot=None, + max_batch_size=None, + device=None, + limit=None, + samples=None, + use_cache=None, + cache_requests=None, + check_integrity=False, + log_samples=False, + system_instruction=None, + apply_chat_template=False, + fewshot_as_multiturn=False, + show_config=False, + include_path=None, + verbosity=None, + wandb_args="", + wandb_config_args="", + hf_hub_log_args="", + predict_only=False, + seed="0,1234,1234,1234", + trust_remote_code=False, + confirm_run_unsafe_code=False, + metadata=None, + gen_kwargs=None, + ) + BenchmarkEvalSubcommand.cmd(args) + mock_run.assert_called_once() + + @patch("subprocess.run") + @patch("pkg_resources.get_distribution") + def test_cmd_with_complex_args(self, mock_get_dist, mock_run): + mock_get_dist.return_value.version = "0.4.9.1" + mock_run.return_value = MagicMock(returncode=0) + args = argparse.Namespace( + model="hf", + tasks="test_task", + model_args='{"pretrained":"test_model","dtype":"float32"}', + batch_size="auto:32", + output_path="/tmp/output", + write_out=True, + num_fewshot=5, + max_batch_size=64, + device="cuda:0", + limit=0.5, + samples='{"task1":[1,2,3]}', + use_cache="/tmp/cache", + cache_requests="refresh", + check_integrity=True, + log_samples=True, + system_instruction="Test instruction", + apply_chat_template="template_name", + fewshot_as_multiturn=True, + show_config=True, + include_path="/tmp/include", + verbosity="DEBUG", + wandb_args="project=test", + wandb_config_args="lr=0.01", + hf_hub_log_args="repo=test", + predict_only=True, + seed="1,2,3,4", + trust_remote_code=True, + confirm_run_unsafe_code=True, + metadata='{"max_seq_length":4096}', + gen_kwargs='{"temperature":0.7}', + ) + BenchmarkEvalSubcommand.cmd(args) + 
mock_run.assert_called_once() + + @patch("subprocess.run", side_effect=FileNotFoundError()) + @patch("pkg_resources.get_distribution") + def test_cmd_lm_eval_not_found(self, mock_get_dist, mock_run): + mock_get_dist.return_value.version = "0.4.9.1" + args = argparse.Namespace( + model="hf", + tasks="test_task", + model_args="pretrained=test_model", + batch_size="1", + output_path=None, + write_out=False, + num_fewshot=None, + max_batch_size=None, + device=None, + limit=None, + samples=None, + use_cache=None, + cache_requests=None, + check_integrity=False, + log_samples=False, + system_instruction=None, + apply_chat_template=False, + fewshot_as_multiturn=False, + show_config=False, + include_path=None, + verbosity=None, + wandb_args="", + wandb_config_args="", + hf_hub_log_args="", + predict_only=False, + seed="0,1234,1234,1234", + trust_remote_code=False, + confirm_run_unsafe_code=False, + metadata=None, + gen_kwargs=None, + ) + with self.assertRaises(SystemExit): + BenchmarkEvalSubcommand.cmd(args) + + @patch("pkg_resources.get_distribution") + def test_cmd_wrong_lm_eval_version(self, mock_get_dist): + mock_get_dist.return_value.version = "0.4.8" + args = argparse.Namespace( + model="hf", + tasks="test_task", + model_args="pretrained=test_model", + batch_size="1", + output_path=None, + write_out=False, + num_fewshot=None, + max_batch_size=None, + device=None, + limit=None, + samples=None, + use_cache=None, + cache_requests=None, + check_integrity=False, + log_samples=False, + system_instruction=None, + apply_chat_template=False, + fewshot_as_multiturn=False, + show_config=False, + include_path=None, + verbosity=None, + wandb_args="", + wandb_config_args="", + hf_hub_log_args="", + predict_only=False, + seed="0,1234,1234,1234", + trust_remote_code=False, + confirm_run_unsafe_code=False, + metadata=None, + gen_kwargs=None, + ) + with self.assertRaises(SystemExit): + BenchmarkEvalSubcommand.cmd(args) + + @patch("pkg_resources.get_distribution", 
side_effect=pkg_resources.DistributionNotFound) + def test_cmd_lm_eval_not_installed(self, mock_get_dist): + args = argparse.Namespace( + model="hf", + tasks="test_task", + model_args="pretrained=test_model", + batch_size="1", + output_path=None, + write_out=False, + num_fewshot=None, + max_batch_size=None, + device=None, + limit=None, + samples=None, + use_cache=None, + cache_requests=None, + check_integrity=False, + log_samples=False, + system_instruction=None, + apply_chat_template=False, + fewshot_as_multiturn=False, + show_config=False, + include_path=None, + verbosity=None, + wandb_args="", + wandb_config_args="", + hf_hub_log_args="", + predict_only=False, + seed="0,1234,1234,1234", + trust_remote_code=False, + confirm_run_unsafe_code=False, + metadata=None, + gen_kwargs=None, + ) + with self.assertRaises(SystemExit): + BenchmarkEvalSubcommand.cmd(args) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/entrypoints/cli/benchmark/test_throughput.py b/tests/entrypoints/cli/benchmark/test_throughput.py new file mode 100644 index 0000000000..26eaf69bfe --- /dev/null +++ b/tests/entrypoints/cli/benchmark/test_throughput.py @@ -0,0 +1,57 @@ +""" +Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import argparse +import unittest + +from fastdeploy.entrypoints.cli.benchmark.throughput import ( + BenchmarkThroughputSubcommand, +) + + +class TestBenchmarkThroughputSubcommand(unittest.TestCase): + """ + 测试 BenchmarkThroughputSubcommand 类。 + """ + + def test_add_cli_args(self): + parser = argparse.ArgumentParser() + BenchmarkThroughputSubcommand.add_cli_args(parser) + + args = parser.parse_args( + [ + "--backend", + "fastdeploy", + "--dataset-name", + "random", + "--input-len", + "100", + "--output-len", + "50", + "--num-prompts", + "10", + ] + ) + self.assertEqual(args.backend, "fastdeploy") + self.assertEqual(args.dataset_name, "random") + self.assertEqual(args.input_len, 100) + self.assertEqual(args.output_len, 50) + self.assertEqual(args.num_prompts, 10) + + +# 如果你在命令行运行这个文件,下面的代码会执行测试 +if __name__ == "__main__": + unittest.main() diff --git a/tests/entrypoints/cli/test_collect_env_conmmand.py b/tests/entrypoints/cli/test_collect_env_conmmand.py index 179c73989e..f71184ea12 100644 --- a/tests/entrypoints/cli/test_collect_env_conmmand.py +++ b/tests/entrypoints/cli/test_collect_env_conmmand.py @@ -26,7 +26,7 @@ class TestCollectEnvSubcommand(unittest.TestCase): "collect-env", help="Start collecting environment information.", description="Start collecting environment information.", - usage="vllm collect-env", + usage="fastdeploy collect-env", )