mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Intel HPU] add example benchmark scripts for hpu (#5304)
* [Intel HPU] add example benchmark scripts for hpu * Revise the code based on the copilot comments * update code based on comments * update ci ops version
This commit is contained in:
@@ -0,0 +1,246 @@
|
|||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
"""Metric evaluation for Fastdeploy + ERNIE-4.5-Turbo"""
|
||||||
|
# adapted from https://github.com/sgl-project/sglang/blob/main/benchmark/gsm8k/bench_other.py
|
||||||
|
import argparse
|
||||||
|
import ast
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import requests
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
INVALID = -9999999
|
||||||
|
|
||||||
|
|
||||||
|
def call_generate(prompt, **kwargs):
    """
    Send one chat-completion request and return the generated text.

    Args:
        prompt (str): The input prompt text, sent as a single user message.
        **kwargs: Keyword arguments. ``ip`` and ``port`` are required and
            locate the server. ``timeout`` is optional (seconds, forwarded to
            ``requests.post``); when omitted the call waits indefinitely, as
            before.

    Returns:
        str: The message content of the first choice in the response.

    Raises:
        requests.HTTPError: If the server answers with an error status code.
    """
    url = f"http://{kwargs['ip']}:{kwargs['port']}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
        "messages": [
            {
                "role": "user",
                "content": prompt,
            }
        ],
        "temperature": 0.6,
        "max_tokens": 2047,
        "top_p": 0.95,
        "do_sample": True,
    }

    response = requests.post(
        url,
        headers=headers,
        data=json.dumps(data),
        timeout=kwargs.get("timeout"),
    )
    # Fail with the HTTP status instead of an opaque KeyError on "choices"
    # when the server returns an error body.
    response.raise_for_status()
    out = response.json()
    return out["choices"][0]["message"]["content"]
|
||||||
|
|
||||||
|
|
||||||
|
def get_one_example(lines, i, include_answer):
    """
    Format one record as a "Question: ...\\nAnswer:" prompt fragment.

    Args:
        lines (list of dict): Records with "question" and "answer" keys.
        i (int): Index of the record to format.
        include_answer (bool): When true, append a space and the record's
            answer after "Answer:".

    Returns:
        str: "Question: <question>\\nAnswer:" optionally followed by
        " <answer>".
    """
    record = lines[i]
    pieces = ["Question: ", record["question"], "\nAnswer:"]
    if include_answer:
        pieces.extend((" ", record["answer"]))
    return "".join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def get_few_shot_examples(lines, k):
    """
    Build the few-shot prefix from the first k records.

    Args:
        lines (list): Records accepted by ``get_one_example``.
        k (int): Number of leading records to use.

    Returns:
        str: The k worked examples (question plus answer), each followed by
        two newline characters; an empty string when k is 0.
    """
    return "".join(get_one_example(lines, idx, True) + "\n\n" for idx in range(k))
|
||||||
|
|
||||||
|
|
||||||
|
def get_answer_value(answer_str):
    """
    Extract the last integer that appears in an answer string.

    Commas are removed first, so "1,234" is read as 1234. Only runs of digits
    are considered: signs and decimal points are ignored, so "-5" yields 5 and
    "12.50" yields 50.

    Args:
        answer_str (str): The string containing the answer.

    Returns:
        int: The last run of digits as an integer, or the module constant
        ``INVALID`` when the input is not a string, contains no digits, or
        cannot be converted.
    """
    if not isinstance(answer_str, str):
        return INVALID
    numbers = re.findall(r"\d+", answer_str.replace(",", ""))
    if not numbers:
        return INVALID
    try:
        # int() instead of ast.literal_eval(): literal_eval raises SyntaxError
        # on zero-padded numbers such as "007", which made them INVALID.
        return int(numbers[-1])
    except ValueError:
        return INVALID
|
||||||
|
|
||||||
|
|
||||||
|
def read_jsonl(filename: str):
    """
    Read a JSONL file lazily.

    Lines starting with "#" and blank lines are skipped.

    Args:
        filename (str): Path to the JSONL file (read as UTF-8).

    Yields:
        dict: The decoded JSON object of each remaining line.
    """
    with open(filename, encoding="utf-8") as fin:
        for line in fin:
            # Blank lines (e.g. a trailing empty line) are not valid JSON and
            # would otherwise abort the whole read.
            if line.startswith("#") or not line.strip():
                continue
            yield json.loads(line)
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """
    Run the GSM8K accuracy benchmark against a running chat-completion server.

    Builds few-shot prompts, sends them sequentially or through a thread pool,
    writes a per-question log, prints accuracy / invalid ratio / latency, and
    appends one JSON summary line to the result file.

    Args:
        args (argparse.Namespace):
            - data_path (str): JSONL file with "question"/"answer" records
              (defaults to "test.jsonl" when the attribute is absent).
            - num_questions (int): Number of questions to process.
            - num_shots (int): Number of few-shot learning examples.
            - ip (str): IP address of the model service.
            - port (int): Port number of the model service.
            - parallel (int): Number of questions to process in parallel.
            - acc_log (str): File path for the per-question log.
            - result_file (str): File path to store the results.

    Returns:
        None

    Raises:
        ValueError: If a reference answer contains no parsable number.
    """
    # Read data. Honor --data-path instead of a hard-coded file name; getattr
    # keeps callers that build their own namespace without it working.
    filename = getattr(args, "data_path", "test.jsonl")
    lines = list(read_jsonl(filename))

    # Construct prompts
    num_questions = args.num_questions
    num_shots = args.num_shots
    # NOTE(review): the few-shot examples are the first `num_shots` records of
    # the same file the questions come from, so those questions appear with
    # their answers inside their own prompt. Kept as-is so results stay
    # comparable with earlier runs.
    few_shot_examples = get_few_shot_examples(lines, num_shots)

    selected = lines[:num_questions]
    questions = [get_one_example(selected, i, False) for i in range(len(selected))]
    labels = [get_answer_value(record["answer"]) for record in selected]
    if any(label == INVALID for label in labels):
        # Explicit check: an `assert` would be stripped under `python -O`.
        raise ValueError(f"{filename} contains an answer without a parsable number")

    states = [None] * len(labels)

    # Use thread pool
    def get_one_answer(i):
        states[i] = call_generate(
            prompt=few_shot_examples + questions[i],
            # stop=["Question", "Assistant:", "<|separator|>"],
            ip=args.ip,
            port=args.port,
        )

    tic = time.time()
    if args.parallel == 1:
        for i in tqdm(range(len(questions))):
            get_one_answer(i)
    else:
        with ThreadPoolExecutor(args.parallel) as executor:
            # list() drains the iterator so worker exceptions surface here.
            list(
                tqdm(
                    executor.map(get_one_answer, range(len(questions))),
                    total=len(questions),
                )
            )

    latency = time.time() - tic

    # Parse every model answer once and reuse it for the log and the metrics.
    preds = [get_answer_value(state) for state in states]

    with open(args.acc_log, "w", encoding="utf-8") as fout:
        for question, state, pred, label in zip(questions, states, preds, labels):
            fout.write("\n################################################################\n")
            fout.write("-----------prompt--------------\n")
            fout.write(f"{few_shot_examples + question}\n")
            fout.write("-----------answer--------------\n")
            fout.write(f"answer= {state}\n")
            fout.write("-----------accuracy--------------\n")
            fout.write(f"Correct={pred == label}, pred={pred}, label={label} \n")

    # Compute accuracy
    acc = np.mean(np.array(preds) == np.array(labels))
    invalid = np.mean(np.array(preds) == INVALID)

    # Print results
    print(f"Accuracy: {acc:.3f}")
    print(f"Invalid: {invalid:.3f}")
    print(f"Latency: {latency:.3f} s")

    with open(args.result_file, "a") as fout:
        value = {
            "task": "gsm8k",
            "backend": "paddlepaddle",
            "num_gpus": 1,
            "latency": round(latency, 3),
            "accuracy": float(round(acc, 3)),
            # Requests actually sent; smaller than --num-questions when the
            # data file has fewer records.
            "num_requests": len(questions),
            "other": {
                "num_questions": args.num_questions,
                "parallel": args.parallel,
            },
        }
        fout.write(json.dumps(value) + "\n")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # (flag, type, default) for every command-line option.
    cli_options = (
        ("--ip", str, "127.0.0.1"),
        ("--port", str, "8188"),
        ("--num-shots", int, 10),
        ("--data-path", str, "test.jsonl"),
        ("--num-questions", int, 1319),
        ("--result-file", str, "result.jsonl"),
        ("--parallel", int, 1),
        ("--acc-log", str, "accuracy.log"),
    )
    parser = argparse.ArgumentParser()
    for flag, value_type, default_value in cli_options:
        parser.add_argument(flag, type=value_type, default=default_value)
    args = parser.parse_args()
    main(args)
|
||||||
+72
@@ -0,0 +1,72 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# set -x
|
||||||
|
|
||||||
|
# Model under test: `model` is passed to the benchmark client, `model_log_name`
# is embedded in the log names, `model_yaml` selects the hyperparameter file.
model="ERNIE-4.5-21B-A3B-Paddle"
model_log_name="ERNIE-4.5-21B-A3B-Paddle"
model_yaml="yaml/eb45-21b-a3b-32k-bf16.yaml"
# model="ERNIE-4.5-300B-A47B-Paddle"
# model_log_name="ERNIE-4.5-300B-A47B-Paddle"
# model_yaml="yaml/eb45-300b-a47b-32k-bf16.yaml"

export SERVER_PORT=8188
export no_proxy=localhost,127.0.0.1,0.0.0.0,10.0.0.0/8,192.168.1.0/24

# Sweep: every input length x output length x batch size (max concurrency).
input_lengths=(1024 2048)
output_lengths=(1024)
batch_sizes=(1 2 4 8 16 32 64 128)

workspace=$(pwd)
# One log directory per invocation: <ISO week>_<timestamp>_<model>_FixedLen
log_home="${workspace}/benchmark_fastdeploy_logs/$(TZ='Asia/Shanghai' date '+WW%V')_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S)_${model_log_name}_FixedLen"

mkdir -p "${log_home}"

for input_length in "${input_lengths[@]}"
do
    for output_length in "${output_lengths[@]}"
    do
        for batch_size in "${batch_sizes[@]}"
        do
            # Empty the profile log (presumably written by the server started
            # from this directory) so the copy taken below only holds this run.
            : > log/hpu_model_runner_profile.log
            num_prompts=$(( batch_size * 3 ))
            log_name_prefix="benchmarkdata_${model_log_name}_inputlength_${input_length}_outputlength_${output_length}_batchsize_${batch_size}_numprompts_${num_prompts}"
            log_name="${log_name_prefix}_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S)"
            echo "running benchmark with input length ${input_length}, output length ${output_length}, batch size ${batch_size}, log name ${log_name}"
            # Build the command as an array: no eval, and arguments survive
            # spaces in paths or values.
            cmd=(python ../../benchmarks/benchmark_serving.py
                --backend openai-chat
                --model "$model"
                --endpoint /v1/chat/completions
                --host 0.0.0.0
                --port "${SERVER_PORT}"
                --dataset-name random
                --random-input-len "${input_length}"
                --random-output-len "${output_length}"
                --random-range-ratio 0
                --hyperparameter-path "../../benchmarks/${model_yaml}"
                --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len
                --metric-percentiles 80,95,99,99.9,99.95,99.99
                --num-prompts "${num_prompts}"
                --max-concurrency "${batch_size}"
                --ignore-eos)
            echo "${cmd[*]}" | tee -a "${log_home}/${log_name}.log"
            "${cmd[@]}" >> "${log_home}/${log_name}.log" 2>&1

            cp log/hpu_model_runner_profile.log "${log_home}/${log_name}_profile.log"
        done
    done
done
|
||||||
+64
@@ -0,0 +1,64 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
|
||||||
|
# set -x
|
||||||
|
|
||||||
|
# Model under test: `model` is passed to the benchmark client, `model_log_name`
# is embedded in the log names, `model_yaml` selects the hyperparameter file.
model="ERNIE-4.5-21B-A3B-Paddle"
model_log_name="ERNIE-4.5-21B-A3B-Paddle"
model_yaml="yaml/eb45-21b-a3b-32k-bf16.yaml"
# model="ERNIE-4.5-300B-A47B-Paddle"
# model_log_name="ERNIE-4.5-300B-A47B-Paddle"
# model_yaml="yaml/eb45-300b-a47b-32k-bf16.yaml"
export SERVER_PORT=8188
export no_proxy=.intel.com,intel.com,localhost,127.0.0.1,0.0.0.0,10.0.0.0/8,192.168.1.0/24

# First argument: number of cards the server was started with.
CARD_NUM=$1

# Max concurrency: 128 for a single card, 64 for anything else.
if [[ "$CARD_NUM" == "1" ]]; then
    batch_size=128
else
    batch_size=64
fi

num_prompts=2000

workspace=$(pwd)
# One log directory per invocation: <ISO week>_<timestamp>_<model>
log_home="${workspace}/benchmark_fastdeploy_logs/$(TZ='Asia/Shanghai' date '+WW%V')_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S)_${model_log_name}"

mkdir -p "${log_home}"

# The name carries prompts/concurrency so it matches the
# "..._sharegpt_prompts_<N>_concurrency_<M>_..." pattern that
# parse_benchmark_logs.py looks for.
log_name_prefix="benchmarkdata_${model_log_name}_sharegpt_prompts_${num_prompts}_concurrency_${batch_size}"
log_name="${log_name_prefix}_$(TZ='Asia/Shanghai' date +%F-%H-%M-%S)"
echo "running benchmark with sharegpt log name ${log_name}"
# Build the command as an array: no eval, and arguments survive spaces.
cmd=(python ../../benchmarks/benchmark_serving.py
    --backend openai-chat
    --model "$model"
    --endpoint /v1/chat/completions
    --host 0.0.0.0
    --port "${SERVER_PORT}"
    --dataset-name EBChat
    --dataset-path ./filtered_sharedgpt_2000_input_1136_output_200_fd.json
    --hyperparameter-path "../../benchmarks/${model_yaml}"
    --percentile-metrics ttft,tpot,itl,e2el,s_ttft,s_itl,s_e2el,s_decode,input_len,s_input_len,output_len
    --metric-percentiles 80,95,99,99.9,99.95,99.99
    --max-concurrency "${batch_size}"
    --num-prompts "${num_prompts}"
    --sharegpt-output-len 4096
    --save-result)
echo "${cmd[*]}" | tee -a "${log_home}/${log_name}.log"
"${cmd[@]}" >> "${log_home}/${log_name}.log" 2>&1
cp log/hpu_model_runner_profile.log "${log_home}/${log_name}_profile.log"
|
||||||
+49
@@ -0,0 +1,49 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# --- HPU runtime ------------------------------------------------------------
# TPC kernel libraries: the custom perf library is searched before the stock one.
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:/usr/lib/habanalabs/libtpc_kernels.so
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PADDLE_DISTRI_BACKEND=xccl
export PADDLE_XCCL_BACKEND=intel_hpu
# Limit for the HPU recipe cache number.
# export FLAGS_intel_hpu_recipe_cache_config=/tmp/recipe,false,10240
export FLAGS_intel_hpu_recipe_cache_num=20480
# Profiler switch (1 means enabled).
export HABANA_PROFILE=0

# --- Service ports ----------------------------------------------------------
export SERVER_PORT=8188
export METRICS_PORT=8001
export ENGINE_WORKER_QUEUE_PORT=8002
export CACHE_QUEUE_PORT=8003

# --- Single-card launch (ERNIE-4.5-21B-A3B) ---------------------------------
export HPU_VISIBLE_DEVICES=0
# Start from a clean log directory.
rm -rf log 2>/dev/null
# HPU_WARMUP_BUCKET=1 enables warmup; HPU_WARMUP_MODEL_LEN is the model length
# (input plus output) used for it.
FD_ENC_DEC_BLOCK_NUM=8 \
HPU_PERF_BREAKDOWN_SYNC_MODE=1 \
HPU_WARMUP_BUCKET=1 \
HPU_WARMUP_MODEL_LEN=4096 \
FD_ATTENTION_BACKEND=HPU_ATTN \
python -m fastdeploy.entrypoints.openai.api_server \
    --model ERNIE-4.5-21B-A3B-Paddle \
    --port ${SERVER_PORT} \
    --engine-worker-queue-port ${ENGINE_WORKER_QUEUE_PORT} \
    --metrics-port ${METRICS_PORT} \
    --cache-queue-port ${CACHE_QUEUE_PORT} \
    --tensor-parallel-size 1 \
    --max-model-len 32768 \
    --max-num-seqs 128 \
    --block-size 128 \
    --num-gpu-blocks-override 3100 \
    --kv-cache-ratio 0.991 \
    --no-enable-prefix-caching \
    --graph-optimization-config '{"use_cudagraph":false}'

# --- Eight-card alternative (ERNIE-4.5-300B-A47B), disabled -----------------
# (2k + 1k) / 128(block_size) * 128(batch) = 3072
# export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# rm -rf log 2>/dev/null
# FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=3072 FD_ATTENTION_BACKEND=HPU_ATTN \
# python -m fastdeploy.entrypoints.openai.api_server \
#     --model ERNIE-4.5-300B-A47B-Paddle \
#     --port ${SERVER_PORT} \
#     --engine-worker-queue-port ${ENGINE_WORKER_QUEUE_PORT} \
#     --metrics-port ${METRICS_PORT} \
#     --cache-queue-port ${CACHE_QUEUE_PORT} \
#     --tensor-parallel-size 8 \
#     --max-model-len 32768 \
#     --max-num-seqs 128 \
#     --block-size 128 \
#     --num-gpu-blocks-override 3100 \
#     --kv-cache-ratio 0.991 \
#     --no-enable-prefix-caching \
#     --graph-optimization-config '{"use_cudagraph":false}'
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# TPC kernel libraries: the custom perf library is prepended to the stock one.
export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PADDLE_DISTRI_BACKEND=xccl
export PADDLE_XCCL_BACKEND=intel_hpu
# export FLAGS_intel_hpu_recipe_cache_config=/tmp/recipe,false,10240
export FLAGS_intel_hpu_recipe_cache_num=20480
export SERVER_PORT=8188
export ENGINE_WORKER_QUEUE_PORT=8002
export METRICS_PORT=8001
export CACHE_QUEUE_PORT=8003
export HABANA_PROFILE=0

# First argument: number of cards (1 or 8). It selects the visible devices,
# the model, the KV block budget, and the tensor-parallel size.
CARD_NUM=$1

case "$CARD_NUM" in
    1)
        export HPU_VISIBLE_DEVICES=0
        export MODEL="ERNIE-4.5-21B-A3B-Paddle"
        export GPU_BLOCKS=5000
        ;;
    8)
        export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
        export MODEL="ERNIE-4.5-300B-A47B-Paddle"
        export GPU_BLOCKS=3000
        ;;
    *)
        # Report the problem instead of exiting silently with status 0, which
        # looked like a successful start to callers.
        echo "Usage: $0 <card_num: 1|8>" >&2
        exit 1
        ;;
esac

# Start from a clean log directory.
rm -rf log 2>/dev/null
FD_ENC_DEC_BLOCK_NUM=8 HPU_PERF_BREAKDOWN_SYNC_MODE=1 HPU_WARMUP_BUCKET=0 FD_ATTENTION_BACKEND=HPU_ATTN ENABLE_V1_KVCACHE_SCHEDULER=0 \
python -m fastdeploy.entrypoints.openai.api_server --model "${MODEL}" --port "${SERVER_PORT}" \
    --engine-worker-queue-port "${ENGINE_WORKER_QUEUE_PORT}" --metrics-port "${METRICS_PORT}" \
    --cache-queue-port "${CACHE_QUEUE_PORT}" --tensor-parallel-size "${CARD_NUM}" --max-model-len 16384 \
    --max-num-seqs 128 --block-size 128 --kv-cache-ratio 0.5 --num-gpu-blocks-override "${GPU_BLOCKS}" \
    --graph-optimization-config '{"use_cudagraph":false}'
|
||||||
@@ -0,0 +1,173 @@
|
|||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import matplotlib.dates as mdates
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
# File-name shape of a per-run profile log; the groups capture model name,
# input length, output length, batch size and number of prompts.
_PROFILE_LOG_NAME = (
    r"benchmarkdata_(.+?)_inputlength_(\d+)_outputlength_(\d+)_batchsize_(\d+)_numprompts_(\d+)_.*_profile\.log$"
)

log_patterns = [re.compile(_PROFILE_LOG_NAME)]
|
||||||
|
|
||||||
|
|
||||||
|
def draw_time_graph(log_dir, log_filename, max_num_seqs, mode):
    """
    Plot per-step timings from one profile log and optionally export a CSV.

    Parses the "Model execution time(ms)" and "PostProcessing execution
    time(ms)" lines of the log, saves a two-panel PNG (timings on top, BT
    values below) into ``log_dir`` and, when ``mode == "all"``, also writes
    the parsed samples to a CSV file there.

    Args:
        log_dir (str): Directory that holds the log; outputs are written here.
        log_filename (str): Log file name. Its last four characters (the
            ".log" suffix) are dropped to build the output file names.
        max_num_seqs (int): BT threshold separating the two filtered modes.
        mode (str): "prefill" keeps samples with BT > max_num_seqs, "decode"
            keeps samples with BT <= max_num_seqs, any other value (e.g.
            "all") keeps everything.

    Returns:
        None
    """
    from itertools import zip_longest

    def is_filtered_out(bt_value):
        """Return True when a sample does not belong to the requested mode."""
        if mode == "prefill":
            return bt_value <= max_num_seqs
        if mode == "decode":
            return bt_value > max_num_seqs
        return False

    # Store extracted time and BT values
    timestamps_model = []
    times_model = []
    bt_values_model = []
    block_list_shapes_model = []
    block_indices_shapes_model = []
    timestamps_pp = []
    times_pp = []
    bt_values_pp = []

    # Use regex to extract Model execution time and BT information
    pattern_model = re.compile(
        r"(\d+-\d+-\d+ \d+:\d+:\d+,\d+) .* Model execution time\(ms\): ([\d\.]+), BT=(\d+), block_list_shape=\[(\d+)\], block_indices_shape=\[(\d+)\]"
    )
    pattern_pp = re.compile(
        r"(\d+-\d+-\d+ \d+:\d+:\d+,\d+) .* PostProcessing execution time\(ms\): ([\d\.]+), BT=(\d+)"
    )
    timestamp_format = "%Y-%m-%d %H:%M:%S,%f"

    # Read log file
    with open(os.path.join(log_dir, log_filename), "r") as file:
        for line in file:
            match_model = pattern_model.search(line)
            if match_model:
                bt_value = int(match_model.group(3))
                timestamps_model.append(datetime.strptime(match_model.group(1), timestamp_format))
                if is_filtered_out(bt_value):
                    # Keep the timestamp but store None so the plotted line
                    # shows a gap; pad every per-sample list to keep them
                    # the same length.
                    times_model.append(None)
                    bt_values_model.append(None)
                    block_list_shapes_model.append(None)
                    block_indices_shapes_model.append(None)
                    continue
                times_model.append(float(match_model.group(2)))
                bt_values_model.append(bt_value)
                block_list_shapes_model.append(int(match_model.group(4)))
                block_indices_shapes_model.append(int(match_model.group(5)))
                continue

            match_pp = pattern_pp.search(line)
            if match_pp:
                bt_value = int(match_pp.group(3))
                timestamps_pp.append(datetime.strptime(match_pp.group(1), timestamp_format))
                if is_filtered_out(bt_value):
                    times_pp.append(None)
                    bt_values_pp.append(None)
                    continue
                times_pp.append(float(match_pp.group(2)))
                bt_values_pp.append(bt_value)

    # Plot graphs
    plt.figure(figsize=(15, 7))

    date_format = mdates.DateFormatter("%m-%d %H:%M:%S")
    # Plot time graph: model time on the left axis, post-processing on the right
    plt.subplot(2, 1, 1)
    ax1 = plt.gca()
    ax2 = ax1.twinx()
    ax1.plot(timestamps_model, times_model, label="Model Execution Time (ms)", color="blue")
    ax2.plot(timestamps_pp, times_pp, label="PostProcessing Time (ms)", color="red")
    ax1.set_ylabel("Model Execution Time (ms)")
    ax2.set_ylabel("PostProcessing Time (ms)")
    ax1.xaxis.set_major_formatter(date_format)
    # Merge legends
    lines_1, labels_1 = ax1.get_legend_handles_labels()
    lines_2, labels_2 = ax2.get_legend_handles_labels()
    ax1.legend(lines_1 + lines_2, labels_1 + labels_2)

    # Plot BT value graph
    plt.subplot(2, 1, 2)
    plt.plot(timestamps_model, bt_values_model, label="BT [" + mode + "]", color="orange")
    plt.ylabel("BT Value")
    plt.xlabel(log_filename, fontsize=8)

    plt.gca().xaxis.set_major_formatter(date_format)
    plt.legend()

    plt.tight_layout()
    output_filename = log_filename[:-4] + "_analysis_" + mode + ".png"
    plt.savefig(os.path.join(log_dir, output_filename), dpi=300)
    plt.close()

    # Write to CSV file
    if mode == "all":
        csv_filename = log_filename[:-4] + "_analysis.csv"
        model_rows = zip(
            timestamps_model,
            times_model,
            bt_values_model,
            block_list_shapes_model,
            block_indices_shapes_model,
        )
        pp_rows = zip(timestamps_pp, times_pp, bt_values_pp)
        with open(os.path.join(log_dir, csv_filename), "w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(
                [
                    "Timestamp",
                    "ModelTime(ms)",
                    "BT",
                    "block_list_shape",
                    "block_indices_shape",
                    "Timestamp",
                    "PostProcessing(ms)",
                    "BT",
                ]
            )
            # The two sample series are not guaranteed to have the same
            # length (e.g. a log cut off mid-step); pad the shorter side with
            # empty cells instead of raising IndexError.
            for model_row, pp_row in zip_longest(model_rows, pp_rows):
                writer.writerow(list(model_row or (None,) * 5) + list(pp_row or (None,) * 3))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Draw graphs for every profile log found in a directory.

    The directory is taken from the first command-line argument and defaults
    to the current directory. Files whose names match one of
    ``log_patterns`` are processed in natural order (``natsort`` when it is
    installed, a built-in fallback otherwise), and each file is drawn in the
    "prefill", "decode" and "all" modes.
    """
    log_dir = sys.argv[1] if len(sys.argv) > 1 else "."

    # A file qualifies as soon as any known pattern matches its name.
    files = [name for name in os.listdir(log_dir) if any(pat.match(name) for pat in log_patterns)]

    try:
        from natsort import natsorted
    except ImportError:
        # Fallback natural sort: compare digit runs numerically and the rest
        # case-insensitively.
        def natural_key(s):
            return [int(text) if text.isdigit() else text.lower() for text in re.split("([0-9]+)", s)]

        files.sort(key=natural_key)
    else:
        files = natsorted(files)

    for file in files:
        # Each file is drawn once per mode, however many patterns match it.
        # 128 is the BT threshold passed as max_num_seqs.
        for mode in ("prefill", "decode", "all"):
            draw_time_graph(log_dir, file, 128, mode)


if __name__ == "__main__":
    print("Starting to draw logs...")
    main()
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
max_model_len: 32768
|
||||||
|
max_num_seqs: 128
|
||||||
|
kv_cache_ratio: 0.75
|
||||||
|
tensor_parallel_size: 8
|
||||||
|
max_num_batched_tokens: 32768
|
||||||
@@ -0,0 +1,70 @@
|
|||||||
|
# Intel HPU serving benchmark
|
||||||
|
These scripts are used to launch FastDeploy Paddle large model inference service for performance and stress testing.
|
||||||
|
|
||||||
|
## Main HPU-Specific Parameters
|
||||||
|
- `HPU_WARMUP_BUCKET`: Whether to enable warmup (1 means enabled)
|
||||||
|
- `HPU_WARMUP_MODEL_LEN`: Model length for warmup (including input and output)
|
||||||
|
- `MAX_PREFILL_NUM`: Maximum batch in prefill stage, default 3
|
||||||
|
- `BATCH_STEP_PREFILL`: Batch step in prefill stage, default 1
|
||||||
|
- `SEQUENCE_STEP_PREFILL`: Sequence step in prefill stage, default 128, same as block size
|
||||||
|
- `CONTEXT_BLOCK_STEP_PREFILL`: Step size for block hit when prefill caching is enabled, default 1
|
||||||
|
- `BATCH_STEP_DECODE`: Batch step in decode stage, default 4
|
||||||
|
- `BLOCK_STEP_DECODE`: Block step in decode stage, default 16
|
||||||
|
- `FLAGS_intel_hpu_recipe_cache_num`: Limit for HPU recipe cache number
|
||||||
|
- `FLAGS_intel_hpu_recipe_cache_config`: HPU recipe cache config, can be used for warmup optimization
|
||||||
|
- `GC_KERNEL_PATH`: The default path of the HPU TPC kernels library
|
||||||
|
- `HABANA_PROFILE`: Whether to enable profiler (1 means enabled)
|
||||||
|
- `PROFILE_START`: Profiler start step.
|
||||||
|
- `PROFILE_END`: Profiler end step.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
### 1. Start server
|
||||||
|
Two setup scripts are provided to start the FastDeploy server: one for the random dataset and one for ShareGPT.
|
||||||
|
|
||||||
|
Before running, please make sure to correctly set the model path and port number in the script.
|
||||||
|
```bash
|
||||||
|
./benchmark_paddle_hpu_server.sh
|
||||||
|
./benchmark_paddle_hpu_server_sharegpt.sh
|
||||||
|
```
|
||||||
|
You can use HPU_VISIBLE_DEVICES in the script to select the HPU card.
|
||||||
|
|
||||||
|
### 2. Run client
|
||||||
|
Correspondingly, there are different client test scripts. `benchmark_paddle_hpu_cli.sh` supports both variable and fixed length tests.
|
||||||
|
|
||||||
|
Before running, please make sure to correctly set the model path, port number, and input/output settings in the script.
|
||||||
|
```bash
|
||||||
|
./benchmark_paddle_hpu_cli.sh
|
||||||
|
./benchmark_paddle_hpu_cli_sharegpt.sh
|
||||||
|
```
|
||||||
|
|
||||||
|
### 3. Parse logs
|
||||||
|
After batch testing, run the following script to automatically parse the logs and generate a CSV file.
|
||||||
|
```bash
|
||||||
|
python parse_benchmark_logs.py benchmark_fastdeploy_logs/[the targeted folder]
|
||||||
|
```
|
||||||
|
The performance data will be saved as a CSV file.
|
||||||
|
|
||||||
|
### 4. Analyse logs
|
||||||
|
During HPU_MODEL_RUNNER execution, performance logs are generated. The following script can parse these logs and produce performance graphs to help identify bottlenecks.
|
||||||
|
```bash
|
||||||
|
python draw_benchmark_data.py benchmark_fastdeploy_logs/[the targeted folder]
|
||||||
|
```
|
||||||
|
The script will save the model execution times and batch tokens as a CSV file and plot them in a graph.
|
||||||
|
|
||||||
|
### 5. Accuracy test
|
||||||
|
Accuracy testing uses GSM8K. Use the following conversion to generate the test file.
|
||||||
|
```python
|
||||||
|
>>> import pandas as pd
|
||||||
|
>>> df = pd.read_parquet('tests/ce/accuracy_cases/gsm8k.parquet', engine='pyarrow')
|
||||||
|
>>> df.to_json('test.jsonl', orient='records', lines=True)
|
||||||
|
```
|
||||||
|
Run the following command to perform the accuracy test.
|
||||||
|
```bash
|
||||||
|
python -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parallel 64
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Offline demo
|
||||||
|
To quickly run an offline demo on HPU, set `model_name_or_path` in `offline_demo.py`, then run the start script directly.
|
||||||
|
```bash
|
||||||
|
./run_offline_demo.sh
|
||||||
|
```
|
||||||
@@ -0,0 +1,53 @@
|
|||||||
|
"""
|
||||||
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from fastdeploy.engine.sampling_params import SamplingParams
|
||||||
|
from fastdeploy.entrypoints.llm import LLM
|
||||||
|
|
||||||
|
# Offline generation demo: build an LLM engine, run the same batch of prompts
# twice and print the outputs of the last run.

model_name_or_path = "ERNIE-4.5-21B-A3B-Paddle"
# model_name_or_path = "ERNIE-4.5-300B-A47B-Paddle"

# Hyperparameter settings
input_bs = 1  # number of identical prompts submitted per generate() call
input_seq = None  # 1000; when set, the prompt becomes "hi " repeated this many times
max_out_tokens = 128
server_max_bs = 128
TP = 1

# num_gpu_blocks_override = ceil((input_seq + max_out_tokens) / 128) * server_max_bs
num_gpu_blocks_override = 2000
sampling_params = SamplingParams(max_tokens=max_out_tokens)
graph_optimization_config = {"use_cudagraph": False}

# Engine options are collected in one place so they are easy to tweak.
engine_options = {
    "model": model_name_or_path,
    "tensor_parallel_size": TP,
    "engine_worker_queue_port": 8602,
    "num_gpu_blocks_override": num_gpu_blocks_override,
    "block_size": 128,
    "max_model_len": 32768,
    "max_num_seqs": server_max_bs,
    "graph_optimization_config": graph_optimization_config,
}
llm = LLM(**engine_options)

# Use a short chat-style prompt unless a synthetic input length was requested.
prompt = "user: who are you?" if input_seq is None else "hi " * input_seq
prompts = [prompt] * input_bs

# The batch is generated twice; presumably the first pass is a warm-up -- TODO confirm.
for _ in range(2):
    output = llm.generate(prompts=prompts, use_tqdm=True, sampling_params=sampling_params)

print(output)
|
||||||
@@ -0,0 +1,195 @@
|
|||||||
|
import csv
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# File-name patterns of benchmark result logs. The negative look-behind keeps
# the companion "*_profile.log" files from being treated as result logs.
# Captured groups:
#   fixed-length: model_name, input_len, output_len, batch_size, num_prompts
#   ShareGPT:     model_name, num_prompts, max_concurrency
_FIXED_LENGTH_LOG_NAME = (
    r"benchmarkdata_(.+?)_inputlength_(\d+)_outputlength_(\d+)_batchsize_(\d+)_numprompts_(\d+)_.*(?<!_profile)\.log$"
)
_SHAREGPT_LOG_NAME = r"benchmarkdata_(.+?)_sharegpt_prompts_(\d+)_concurrency_(\d+)_.*(?<!_profile)\.log$"

log_patterns = [re.compile(name_regex) for name_regex in (_FIXED_LENGTH_LOG_NAME, _SHAREGPT_LOG_NAME)]

# Every metric line has the shape "<label>:   <number>"; this suffix captures the number.
_METRIC_VALUE = r":\s+([\d\.]+)"

# (CSV column name, regex extracting the value from the benchmark log text).
metrics = [
    ("Mean Decode", r"Mean Decode" + _METRIC_VALUE),
    ("Mean TTFT (ms)", r"Mean TTFT \(ms\)" + _METRIC_VALUE),
    ("Mean S_TTFT (ms)", r"Mean S_TTFT \(ms\)" + _METRIC_VALUE),
    ("Mean TPOT (ms)", r"Mean TPOT \(ms\)" + _METRIC_VALUE),
    ("Mean ITL (ms)", r"Mean ITL \(ms\)" + _METRIC_VALUE),
    ("Mean S_ITL (ms)", r"Mean S_ITL \(ms\)" + _METRIC_VALUE),
    ("Mean E2EL (ms)", r"Mean E2EL \(ms\)" + _METRIC_VALUE),
    ("Mean S_E2EL (ms)", r"Mean S_E2EL \(ms\)" + _METRIC_VALUE),
    ("Mean Input Length", r"Mean Input Length" + _METRIC_VALUE),
    ("Mean Output Length", r"Mean Output Length" + _METRIC_VALUE),
    ("Request throughput (req/s)", r"Request throughput \(req/s\)" + _METRIC_VALUE),
    ("Output token throughput (tok/s)", r"Output token throughput \(tok/s\)" + _METRIC_VALUE),
    ("Total Token throughput (tok/s)", r"Total Token throughput \(tok/s\)" + _METRIC_VALUE),
]
|
||||||
|
|
||||||
|
|
||||||
|
def parse_benchmark_log_file(filepath):
    """Extract the summary metrics from one benchmark result log.

    Args:
        filepath: Path of the benchmark log, read as UTF-8 text.

    Returns:
        dict mapping every metric name listed in ``metrics`` to the first
        captured value found in the log (kept as a string), or ``""`` when
        the metric does not appear.
    """
    with open(filepath, "r", encoding="utf-8") as log_file:
        log_text = log_file.read()

    def first_value(value_regex):
        # Only the first occurrence in the whole file is used.
        found = re.search(value_regex, log_text)
        return found.group(1) if found else ""

    return {label: first_value(value_regex) for label, value_regex in metrics}
|
||||||
|
|
||||||
|
|
||||||
|
def parse_profile_log_file(file_path):
    """Collect per-step stage timings from a ``*_profile.log`` file.

    Each line is searched for the four stage markers; a line may contribute
    to more than one stage. Only values written with a decimal point
    (e.g. ``12.5``) are recognised.

    Args:
        file_path: Path of the profile log, read as UTF-8 text (the same
            encoding ``parse_benchmark_log_file`` uses).

    Returns:
        A 4-tuple of float lists, in file order:
        ``(prepare_input_times, model_times, postprocessing_times, steppaddle_times)``.
    """
    # Compiled once up front instead of re-parsing four regexes for every line.
    stage_patterns = (
        re.compile(r"_prepare_inputs time\(ms\): (\d+\.\d+)"),
        re.compile(r"Model execution time\(ms\): (\d+\.\d+)"),
        re.compile(r"PostProcessing execution time\(ms\): (\d+\.\d+)"),
        re.compile(r"StepPaddle execution time\(ms\): (\d+\.\d+)"),
    )
    stage_times = tuple([] for _ in stage_patterns)

    # Explicit encoding: without it the result depends on the platform locale.
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            for pattern, bucket in zip(stage_patterns, stage_times):
                match = pattern.search(line)
                if match:
                    bucket.append(float(match.group(1)))

    prepare_input_times, model_times, postprocessing_times, steppaddle_times = stage_times
    return prepare_input_times, model_times, postprocessing_times, steppaddle_times
|
||||||
|
|
||||||
|
|
||||||
|
def calculate_times(times, separate_first):
    """Summarise a list of timings as a ``(first, average)`` pair.

    Args:
        times: Sequence of numeric timings; may be empty.
        separate_first: If true, the first sample is reported on its own and
            the average covers only the remaining samples. If false, the
            average covers every sample and ``first`` is ``None``.

    Returns:
        ``(first, average)``. ``first`` is ``None`` when ``separate_first`` is
        false or ``times`` is empty; ``average`` is ``None`` when there are no
        samples to average.
    """
    # An empty list (e.g. a profile log missing one stage) used to raise IndexError.
    if not times:
        return None, None

    if separate_first:
        remaining = times[1:]
        average_time = sum(remaining) / len(remaining) if remaining else None
        return times[0], average_time

    # A single sample is still a valid average; it is no longer returned as "first".
    return None, sum(times) / len(times)
|
||||||
|
|
||||||
|
|
||||||
|
# CSV columns derived from the optional "*_profile.log" companion file.
_PROFILE_COLUMNS = (
    "prepare_input_average",
    "model_execute_first",
    "model_execute_average",
    "postprocessing_execute_average",
    "steppaddle_execute_average",
)


def _match_log_name(name):
    """Return ``(pattern_index, match)`` for the first ``log_patterns`` entry matching *name*, else ``None``."""
    for idx, pat in enumerate(log_patterns):
        m = pat.match(name)
        if m:
            return idx, m
    return None


def _natural_sorted(names):
    """Return *names* in natural order, using ``natsort`` when it is installed."""
    try:
        from natsort import natsorted
    except ImportError:

        def natural_key(s):
            return [int(text) if text.isdigit() else text.lower() for text in re.split("([0-9]+)", s)]

        return sorted(names, key=natural_key)
    return natsorted(names)


def _profile_columns(profile_path):
    """Summarise one profile log into a dict keyed by ``_PROFILE_COLUMNS`` (missing values become ``""``)."""
    prepare_input_times, model_times, postprocessing_times, steppaddle_times = parse_profile_log_file(profile_path)
    _, prepare_input_average = calculate_times(prepare_input_times, False)
    # Only the model stage reports its first step separately from the average.
    model_first, model_average = calculate_times(model_times, True)
    _, postprocessing_average = calculate_times(postprocessing_times, False)
    _, steppaddle_average = calculate_times(steppaddle_times, False)
    values = (prepare_input_average, model_first, model_average, postprocessing_average, steppaddle_average)
    return {column: "" if value is None else value for column, value in zip(_PROFILE_COLUMNS, values)}


def main():
    """Summarise every benchmark log in a directory into one CSV file.

    The directory is ``sys.argv[1]`` (default: the current directory). Files
    whose names match ``log_patterns`` are processed in natural order. For
    each one the metrics from the log, the parameters encoded in its file
    name and, when ``<name>_profile.log`` exists next to it, the averaged
    stage timings are written as one CSV row. The CSV is created in the
    current working directory and its name is printed.
    """
    log_dir = sys.argv[1] if len(sys.argv) > 1 else "."

    # List the directory once; the set gives O(1) lookups for profile logs.
    entries = os.listdir(log_dir)
    all_files = set(entries)
    files = _natural_sorted([name for name in entries if _match_log_name(name) is not None])

    rows = []
    for file in files:
        matched = _match_log_name(file)
        if matched is None:
            continue
        matched_idx, m = matched

        if matched_idx == 0:
            model_name, input_len, output_len, batch_size, num_prompts = m.groups()
            dataset = "Fixed-Length"
        else:
            # ShareGPT runs have no fixed lengths; the concurrency is reported as batch_size.
            model_name, num_prompts, batch_size = m.groups()
            input_len = output_len = "-"
            dataset = "ShareGPT"

        profile_file = file[:-4] + "_profile.log" if file.endswith(".log") else ""

        # Reset all five profile columns for every file: previously
        # prepare_input_average was left unset (UnboundLocalError) or carried
        # over from the previous file when no profile log existed.
        profile_data = dict.fromkeys(_PROFILE_COLUMNS, "")
        if profile_file in all_files:
            profile_data = _profile_columns(os.path.join(log_dir, profile_file))

        data = parse_benchmark_log_file(os.path.join(log_dir, file))
        data["dataset"] = dataset
        data["model_name"] = model_name
        data["input_length"] = input_len
        data["output_length"] = output_len
        data["batch_size"] = batch_size
        data["num_prompts"] = num_prompts
        data.update(profile_data)
        rows.append(data)

    import datetime

    # Asia/Shanghai is a fixed UTC+8 offset with no DST, so the standard
    # library is enough for the timestamp and pytz is not required.
    shanghai_tz = datetime.timezone(datetime.timedelta(hours=8))
    ts = datetime.datetime.now(shanghai_tz).strftime("%Y%m%d_%H%M%S")

    log_dir_name = os.path.basename(os.path.abspath(log_dir))
    if log_dir_name == "" or log_dir == "." or log_dir == "/":
        csv_filename = f"benchmark_summary_{ts}.csv"
    else:
        csv_filename = f"benchmark_summary_{log_dir_name}_{ts}.csv"

    fieldnames = (
        [
            "model_name",
            "dataset",
            "input_length",
            "output_length",
            "batch_size",
            "num_prompts",
        ]
        + [name for name, _ in metrics]
        + list(_PROFILE_COLUMNS)
    )
    with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    print(f"CSV saved as: {csv_filename}")


if __name__ == "__main__":
    print("Starting to parse logs...")
    main()
|
||||||
Executable
+19
@@ -0,0 +1,19 @@
|
|||||||
|
#!/bin/bash
# Sets up the Intel HPU environment and runs offline_demo.py with the HPU attention backend.

# TPC kernel libraries: start from the stock Habana library, then prepend the
# Paddle custom-device perf library (the path assumes a Python 3.10 dist-packages install).
export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PADDLE_DISTRI_BACKEND=xccl
export PADDLE_XCCL_BACKEND=intel_hpu
# Only device 0 is exposed through HPU_VISIBLE_DEVICES; the commented line below lists all eight.
# export HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export HPU_VISIBLE_DEVICES=0
# NOTE(review): presumably HABANA_PROFILE=0 turns profiling off and PROFILE_START/PROFILE_END
# select the profiled step range when it is on -- confirm against the HPU runner.
export HABANA_PROFILE=0
export PROFILE_START=1
export PROFILE_END=3
# Optional debug/logging switches, disabled by default.
# export HABANA_LOGS=hpu_logs
# export LOG_LEVEL_ALL=0
# export FLAGS_intel_hpu_runtime_debug=1
# export FLAGS_intel_hpu_reciperunner_debug=1

# Remove the "log" directory left in the working directory before starting.
rm -rf log
FD_ATTENTION_BACKEND=HPU_ATTN python offline_demo.py
|
||||||
@@ -26,8 +26,8 @@ python -m pip uninstall fastdeploy_intel_hpu -y
|
|||||||
#to install paddlepaddle
|
#to install paddlepaddle
|
||||||
pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||||
#to install paddlecustomdevice? (paddle-intel-hpu)
|
#to install paddlecustomdevice? (paddle-intel-hpu)
|
||||||
pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddle_intel_hpu-0.0.1-cp310-cp310-linux_x86_64.whl
|
pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddle_intel_hpu-0.0.2-cp310-cp310-linux_x86_64.whl
|
||||||
pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddlenlp_ops-0.0.0-cp310-cp310-linux_x86_64.whl
|
pip install https://paddle-qa.bj.bcebos.com/suijiaxin/HPU/paddlenlp_ops-0.0.2-cp310-cp310-linux_x86_64.whl
|
||||||
|
|
||||||
#to build and install fastdeploy
|
#to build and install fastdeploy
|
||||||
echo "build whl"
|
echo "build whl"
|
||||||
|
|||||||
Reference in New Issue
Block a user