[LLM] Add output module and polish docs

This commit is contained in:
jiangjiajun
2025-06-09 20:26:53 +08:00
parent 684703fd72
commit fb18f3092d
8 changed files with 548 additions and 364 deletions
-180
View File
@@ -1,180 +0,0 @@
---
Language: Cpp
# BasedOnStyle: LLVM
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignArrayOfStructures: None
AlignConsecutiveMacros: None
AlignConsecutiveAssignments: None
AlignConsecutiveBitFields: None
AlignConsecutiveDeclarations: None
AlignEscapedNewlines: Right
AlignOperands: Align
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortEnumsOnASingleLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: Never
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: MultiLine
AttributeMacros:
- __capability
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: Never
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeConceptDeclarations: true
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 80
# CommentPragmas: '^ IWYU pragma:'
# CommentPragmas: '^[^ ]'
CommentPragmas: '^\\.+'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: true
DerivePointerAlignment: false
DisableFormat: false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: LogicalBlock
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IfMacros:
- KJ_IF_MAYBE
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
Priority: 2
SortPriority: 0
CaseSensitive: false
- Regex: '^(<|"(gtest|gmock|isl|json)/)'
Priority: 3
SortPriority: 0
CaseSensitive: false
- Regex: '.*'
Priority: 1
SortPriority: 0
CaseSensitive: false
IncludeIsMainRegex: '(Test)?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseLabels: false
IndentCaseBlocks: false
IndentGotoLabels: true
IndentPPDirectives: None
IndentExternBlock: AfterExternBlock
IndentRequires: false
IndentWidth: 2
IndentWrappedFunctionNames: false
InsertTrailingCommas: None
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: true
LambdaBodyIndentation: Signature
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 2
ObjCBreakBeforeNestedBlockParam: true
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PenaltyIndentedWhitespace: 0
PointerAlignment: Left
PPIndentWidth: -1
ReferenceAlignment: Pointer
ReflowComments: false
ShortNamespaceLines: 1
SortIncludes: CaseSensitive
SortJavaStaticImport: Before
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceAroundPointerQualifiers: Default
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInLineCommentPrefix:
Minimum: 1
Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
BitFieldColonSpacing: Both
Standard: Latest
StatementAttributeLikeMacros:
- Q_EMIT
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
TabWidth: 8
UseCRLF: false
UseTab: Never
WhitespaceSensitiveMacros:
- STRINGIZE
- PP_STRINGIZE
- BOOST_PP_STRINGIZE
- NS_SWIFT_NAME
- CF_SWIFT_NAME
...
-15
View File
@@ -1,15 +0,0 @@
#!/bin/bash
# Pre-commit hook: run clang-format, but only when the locally installed
# clang-format matches the required version.
set -e

readonly VERSION="3.8"

version=$(clang-format -version)

# Fix: the original test `[[ version=="VERSION"* ]]` compared nothing —
# without `$` expansions it is a single non-empty literal, which is always
# true, so the version check could never fail. Expand both variables and
# match the required version anywhere in the reported version string.
if ! [[ $version == *"$VERSION"* ]]; then
    echo "clang-format version check failed."
    echo "a version containing '$VERSION' is needed, but got '$version'"
    echo "you can install the right version, and make a soft-link to your '\$PATH' env"
    # exit codes are 0-255; `exit -1` is non-portable — use 1.
    exit 1
fi

clang-format -style=google "$@"
-60
View File
@@ -1,60 +0,0 @@
#!/bin/bash
# Pre-commit hook: run cpplint over the C/C++ files touched by this commit
# and reject the commit when any file reports style errors.
# (A large block of garbled, commented-out legacy Travis logic was removed.)

if git rev-parse --verify HEAD >/dev/null 2>&1
then
    against=HEAD
else
    # Initial commit: diff against an empty tree object
    against=4b825dc642cb6eb9a060e54bf8d69288fbee4904
fi

# Redirect output to stderr.
exec 1>&2

cpplint=cpplint
sum=0
filters='-build/include_order,-build/namespaces,-legal/copyright,-runtime/references,-build/include_what_you_use'

# Lint every changed C/C++ source (.c/.h/.cpp/.hpp).
# `grep -v '^D'` skips deleted files: they no longer exist on disk, so
# cpplint would fail on them spuriously (see https://stackoverflow.com/a/2413151).
for file in $(git diff-index --name-status $against -- | grep -v '^D' | grep -E '\.[ch](pp)?$' | awk '{print $2}'); do
    $cpplint --filter=$filters $file
    # Accumulate per-file error status; any nonzero result fails the hook.
    sum=$(expr ${sum} + $?)
done

if [ ${sum} -eq 0 ]; then
    exit 0
else
    exit 1
fi
+158 -67
View File
@@ -1,69 +1,160 @@
build
cmake-build-debug
cmake-build-release
.vscode
FastDeploy.cmake
build-debug.sh
*dist
fastdeploy.egg-info
fastdeploy_python.egg-info
fastdeploy_gpu_python.egg-info
.setuptools-cmake-build
fastdeploy/version.py
fastdeploy/core/config.h
python/fastdeploy/c_lib_wrap.py
python/fastdeploy/LICENSE*
python/build_cpu.sh
python/fastdeploy/ThirdPartyNotices*
*.so*
fpython/astdeploy/libs/third_libs
fastdeploy/core/config.h
fastdeploy/pybind/main.cc
python/fastdeploy/libs/lib*
python/fastdeploy/libs/third_libs
__pycache__
build_fd_android.sh
python/scripts/process_libraries.py
.vs
.idea
.DS_Store
miniprogram_npm
node_modules
.DS_Store
dist
etc
lib
dist-ssr
coverage
*.local
yalc.*
.yalc
examples/vision/collect_quantize_cc.sh
examples/vision/tests_quantize
fastdeploy/LICENSE
fastdeploy/ThirdPartyNotices.txt
FastDeployCSharp.cmake
python/fastdeploy/code_version.py
*.pdmodel
*.pdiparams
*.pdiparams.info
log.txt
serving/build
serving/build.encrypt
serving/build.encrypt.auth
output
res
tmp
# Virtualenv
/.venv/
/venv/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
/bin/
/build/
/develop-eggs/
dist/
/eggs/
/lib/
/lib64/
/output/
/parts/
/sdist/
/var/
*.egg-info/
.installed.cfg
*.egg
.eggs
# AUTHORS and ChangeLog will be generated while packaging
/AUTHORS
/ChangeLog
# BCloud / BuildSubmitter
/build_submitter.*
/logger_client_log
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
.tox/
.coverage
.cache
.pytest_cache
nosetests.xml
coverage.xml
# Translations
*.mo
*.pot
*.doctree
# Sphinx documentation
/docs/_build/
.env
log
nohup.out
llm/server/__pycache__
llm/server/data/__pycache__
llm/server/engine/__pycache__
llm/server/http_server/__pycache__
llm/server/log/
llm/client/build/
llm/client/dist/
llm/client/fastdeploy_client.egg-info/
llm/client/fastdeploy_client/tests/log/
*.pyc
.vscode
.idea
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pycharm
.DS_Store
.idea/
FETCH_HEAD
#log
log/
checkpoints/
checkpoints_origin/
result/
result_lora/
# npu kernel cache
kernel_meta*
# building custom ops cache and auto-generated codes
*.o
fastdeploy_ops.py
version.txt
EGG-INFO/
# fp8 generated codes
autogen/
fp8_fp8_gemm_scale_bias_act.cu
fp8_fp8_dual_gemm_scale_bias_act.cu
visitor_fp8_gemm_fused.cu
# third party
custom_ops/third_party
fastdeploy/model_executor/ops/base
fastdeploy/model_executor/ops/gpu/deep_gemm
gemm_profiles.json
nohup.out
#fp8_deep_gemm
custom_ops/gpu_ops/fp8_deep_gemm/deep_gemm/include/cutlass
custom_ops/gpu_ops/fp8_deep_gemm/deep_gemm/include/cute
# buff
custom_ops/tmp*
+54 -37
View File
@@ -1,38 +1,55 @@
default_install_hook_types:
- pre-commit
- commit-msg
default_stages:
- pre-commit # Run locally
# - manual # Run in CI
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: ed714747d7acbc5790b171702bb012af3b0fe145
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: end-of-file-fixer
- id: trailing-whitespace
- id: detect-private-key
- id: check-symlinks
- id: check-added-large-files
- repo: local
hooks:
- id: copyright_checker
name: copyright_checker
entry: python ./.copyright.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
exclude: (?!.*third_party)^.*$
- repo: local
hooks:
- id: clang-format-with-version-check
name: clang-format
description: Format files with ClangFormat.
entry: bash .clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|hxx|proto)$
- repo: local
hooks:
- id: cpplint-cpp-source
name: cpplint
description: Check C++ code style using cpplint.py.
entry: bash .cpplint_pre_commit.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
# Python formatting
- repo: https://github.com/google/yapf
rev: v0.43.0
hooks:
- id: yapf
args: [--in-place, --verbose]
# Linting
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
- id: ruff
args: [--output-format, github, --fix]
# # Spell checking
# - repo: https://github.com/codespell-project/codespell
# rev: v2.4.1
# hooks:
# - id: codespell
# additional_dependencies: ['tomli']
# args: ['--toml', 'pyproject.toml']
# Import sorting
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
- id: isort
# C/C++ formatting
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v20.1.3
hooks:
- id: clang-format
# exclude: '.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
# markdown
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29
hooks:
- id: pymarkdown
args: [fix]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: end-of-file-fixer
- id: trailing-whitespace
- id: detect-private-key
- id: check-symlinks
- id: check-added-large-files
+15 -5
View File
@@ -1,5 +1,16 @@
# FastDeploy 2.0: 大模型推理部署
<p align="center">
<a href="./LICENSE"><img src="https://img.shields.io/badge/license-Apache%202-dfd.svg"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/releases"><img src="https://img.shields.io/github/v/release/PaddlePaddle/FastDeploy?color=ffa"></a>
<a href=""><img src="https://img.shields.io/badge/python-3.10+-aff.svg"></a>
<a href=""><img src="https://img.shields.io/badge/os-linux-pink.svg"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/graphs/contributors"><img src="https://img.shields.io/github/contributors/PaddlePaddle/FastDeploy?color=9ea"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/commits"><img src="https://img.shields.io/github/commit-activity/m/PaddlePaddle/FastDeploy?color=3af"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/issues"><img src="https://img.shields.io/github/issues/PaddlePaddle/FastDeploy?color=9cc"></a>
<a href="https://github.com/PaddlePaddle/FastDeploy/stargazers"><img src="https://img.shields.io/github/stars/PaddlePaddle/FastDeploy?color=ccf"></a>
</p>
FastDeploy升级2.0版本支持多种大模型推理(当前仅支持Qwen2,更多模型即将更新支持),其推理部署功能涵盖:
- 一行命令即可快速实现模型的服务化部署,并支持流式生成
@@ -9,6 +20,8 @@ FastDeploy升级2.0版本支持多种大模型推理(当前仅支持Qwen2,
- 提供 Weight only int8/int4 无损压缩方案
- 支持 Prometheus Metrics 指标
> 注意: 老版本FastDeploy对于小模型的支持,请checkout [release/1.1.0分支](https://github.com/PaddlePaddle/FastDeploy/tree/release/1.1.0)。
## 环境依赖
- A800/H800/H100
- Python>=3.10
@@ -18,10 +31,9 @@ FastDeploy升级2.0版本支持多种大模型推理(当前仅支持Qwen2,
## 安装
推荐使用Docker环境
推荐使用Docker安装
```
docker pull
iregistry.baidu-int.com/paddlecloud/base-images:paddlecloud-ubuntu24.04-gcc12.3-cuda12.8-cudnn9.7-openmpi4.1.5-bccl2.15.5.4-ofed24.10-hadoop2.2.4.2-afsshell1.9.3.4095-250227
docker pull iregistry.baidu-int.com/paddlepaddle/fastdeploy:2.0.0-alpha
```
### 源码安装
@@ -33,9 +45,7 @@ python -m pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/
2. 安装FastDeploy
```
# git clone FastDeploy仓库
cd FastDeploy
# 一键编译+安装本机可用的sm架构,whl包产物在dist/
bash build.sh
```
+15
View File
@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
+306
View File
@@ -0,0 +1,306 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import threading
import time
import traceback
from collections import Counter
from paddlenlp.utils.env import MAX_BSZ, MAX_DRAFT_TOKENS, SPECULATE_MAX_BSZ
from fastdeploy.engine.request import (CompletionOutput, RequestMetrics,
RequestOutput)
from fastdeploy.metrics.metrics import main_process_metrics
from fastdeploy.utils import llm_logger
class TokenProcessor(object):
    """Pull generated token ids out of the Paddle inference engine and turn
    them into per-request ``RequestOutput`` results.

    A daemon thread (started via :meth:`run`) blocks on the engine's output
    buffer, decodes each batch, updates Prometheus metrics, recycles
    resources for finished requests, and pushes results into
    ``cached_generated_tokens``.
    """

    def __init__(self, cfg, cached_generated_tokens):
        # Local import so the device is pinned to CPU for this process only.
        import paddle
        paddle.device.set_device("cpu")
        self.cfg = cfg
        # Sink for finished batches; postprocess() calls put_results() on it.
        self.cached_generated_tokens = cached_generated_tokens
        # Must be injected via set_resource_manager() before run().
        self.resource_manager = None
        # Per-request count of tokens generated so far, keyed by request id.
        self.tokens_counter = Counter()
        # NOTE(review): hard-coded off, so the speculative-decoding branches
        # below are currently dead code — confirm whether this should be
        # derived from cfg.
        self.is_speculate_decoding = False
        if self.is_speculate_decoding:
            # Speculative layout (per _process_batch_output): slot 0 is the
            # ready flag, slot 1 the batch size, then per-slot accept counts
            # followed by MAX_DRAFT_TOKENS draft ids per slot.
            self.output_tokens = paddle.full(shape=[
                SPECULATE_MAX_BSZ * MAX_DRAFT_TOKENS + SPECULATE_MAX_BSZ + 2, 1
            ],
                                             fill_value=2,
                                             dtype="int64")
        else:
            # Normal layout: ready flag, batch size, one token id per slot.
            self.output_tokens = paddle.full(shape=[MAX_BSZ + 2, 1],
                                             fill_value=2,
                                             dtype="int64")
        self.worker = None  # background thread created by run()
        self.statics_start_time = time.time()
        self.number_of_tasks = 0
        self.number_of_input_tokens = 0
        self.number_of_output_tokens = 0
        self.total_step = 0  # number of engine output steps observed

    def set_resource_manager(self, resource_manager):
        """
        set ResourceManager

        Args:
            resource_manager (ResourceManager): manager owning the batch
                slots, stop flags and block tables this processor updates.
                May only be set once.
        """
        assert self.resource_manager is None, "The resource manager is not None, cannot set again."
        self.resource_manager = resource_manager

    def run(self):
        """
        start thread to get tokens

        Raises:
            Exception: if a worker thread has already been started.
        """
        assert self.resource_manager is not None, "The resource manager is None, cannot run."
        if self.worker is not None:
            raise Exception("Worker is already running!")

        # Daemon thread so it never keeps the process alive at shutdown.
        self.worker = threading.Thread(target=self.process_sampling_results,
                                       args=())
        self.worker.daemon = True
        self.worker.start()

    def process_sampling_results(self):
        """
        read tokens from paddle inference engine and process

        Runs forever; exceptions are logged and the loop continues.
        """
        # Pick the get_output implementation matching the runtime:
        # paddlenlp_ops for architectures without inference-runner support,
        # otherwise the fastdeploy GPU ops (or the pip-installed
        # efficientllm ops when USE_PIP_EFF_LLM is set).
        from fastdeploy.model_executor.models import \
            inference_runner_supported_models
        if self.cfg.model_config.architectures not in inference_runner_supported_models \
                and "ErnieMoEVLForCausalLM" not in self.cfg.model_config.architectures:
            from paddlenlp_ops import get_output, speculate_get_output
        else:
            os.environ["ELLM_LOG_LEVEL"] = "3"
            use_pip_eff_llm = os.getenv('USE_PIP_EFF_LLM')
            if use_pip_eff_llm is None:
                from fastdeploy.model_executor.ops.gpu import (
                    get_output, speculate_get_output)
            else:
                from efficientllm.ops.gpu import (get_output,
                                                  speculate_get_output)

        while True:
            try:
                rank_id = 0
                is_blocking = True
                if self.is_speculate_decoding:
                    speculate_get_output(self.output_tokens, rank_id,
                                         is_blocking)
                else:
                    get_output(self.output_tokens, rank_id, is_blocking)

                # -2 in slot 0 is the "no output ready" sentinel.
                if self.output_tokens[0, 0] == -2:
                    continue
                self._process_batch_output()
            except Exception as e:
                llm_logger.info("while get input_data error: {0} {1}".format(
                    e, str(traceback.format_exc())))

    def postprocess(self, batch_result):
        """
        single post-processing function

        Args:
            batch_result (list): batch results
        """
        self.cached_generated_tokens.put_results(batch_result)

    def _recycle_resources(self, task_id, index, task):
        """
        recycle resources

        Frees the batch slot and its cache blocks, and drops the per-request
        token counter for a finished request.
        """
        self.resource_manager.stop_flags[index] = True
        self.resource_manager.tasks_list[index] = None
        self.resource_manager._recycle_block_tables(task.block_tables)
        if task_id in self.tokens_counter:
            del self.tokens_counter[task_id]

    def _process_batch_output(self):
        """
        batch post-processing function

        Decodes one step's output buffer into RequestOutput objects,
        records metrics, and recycles resources when a request hits EOS.
        """
        tokens = self.output_tokens.numpy()
        # Slot 1 holds the number of active batch slots for this step.
        batch = self.output_tokens[1, 0]
        if not self.is_speculate_decoding:
            # One token per slot, starting at offset 2.
            tokens = tokens[2:batch + 2]
        else:
            # Speculative decoding: offset 2 holds per-slot accept counts.
            accept_num = tokens[2:batch + 2]

        batch_result = list()
        for i in range(batch):
            # Slot already finished/free — nothing to emit for it.
            if self.resource_manager.stop_flags[i]:
                continue

            if not self.is_speculate_decoding:
                token_ids = [int(tokens[i, 0])]
            else:
                # Draft tokens for slot i live in a fixed-size window; only
                # the first accept_num[i] entries are valid this step.
                token_ids = tokens[
                    2 + SPECULATE_MAX_BSZ + i * MAX_DRAFT_TOKENS:2 +
                    SPECULATE_MAX_BSZ + i * MAX_DRAFT_TOKENS +
                    accept_num[i, 0],
                    0,
                ].tolist()

            # Negative ids mean "no valid token for this slot this step".
            if any(token_id < 0 for token_id in token_ids):
                continue

            task = self.resource_manager.tasks_list[i]

            if self.cfg.enable_chunked_prefill:
                # Track how much of the prompt has been prefilled; emit no
                # output until the whole prompt has been consumed.
                if task.get("prefill_token_num", None) is None:
                    task.set("prefill_token_num", task.token_chunk_size)
                else:
                    task.prefill_token_num += task.token_chunk_size
                if task.prompt_token_ids_len > task.prefill_token_num:
                    continue

            task_id = task.request_id
            self.total_step += 1
            current_time = time.time()
            if self.tokens_counter[task_id] == 0:
                # First token of the request: record the latency breakdown
                # (queueing, preprocessing, time-to-first-token).
                metrics = RequestMetrics(
                    arrival_time=task.arrival_time,
                    inference_start_time=task.inference_start_time,
                    first_token_time=time.time() - task.inference_start_time,
                    time_in_queue=task.schedule_start_time -
                    task.preprocess_end_time,
                    preprocess_cost_time=task.preprocess_end_time -
                    task.preprocess_start_time)

                main_process_metrics.time_to_first_token.observe(
                    current_time - task.inference_start_time)
                main_process_metrics.request_queue_time.observe(
                    metrics.time_in_queue)
            else:
                # Subsequent tokens: observe inter-token latency.
                if hasattr(task, 'last_token_time'
                           ) and task.last_token_time is not None:
                    token_gen_time = current_time - task.last_token_time
                    main_process_metrics.time_per_output_token.observe(
                        token_gen_time)
                task.last_token_time = current_time

                metrics = RequestMetrics(
                    arrival_time=time.time(),
                    request_start_time=task.arrival_time,
                )
            self.number_of_output_tokens += len(token_ids)
            result = RequestOutput(request_id=task_id,
                                   outputs=CompletionOutput(index=i,
                                                            token_ids=[]),
                                   finished=False,
                                   metrics=metrics)
            if self.tokens_counter[task_id] == 0:
                # Attach the prompt only on the first output chunk.
                if task.messages is not None:
                    result.prompt = task.messages
                result.prompt_token_ids = task.prompt_token_ids

            for token_id in token_ids:
                self.tokens_counter[task_id] += 1
                result.outputs.token_ids.append(token_id)
                if token_id in task.eos_token_ids:
                    # EOS reached: mark finished, log stats, free the slot.
                    result.finished = True
                    result.prompt = task.prompt
                    result.prompt_token_ids = task.prompt_token_ids
                    llm_logger.info(
                        f"Request: {task_id} finished, number of "
                        f"generated tokens: {self.tokens_counter[task_id]}.")
                    llm_logger.info(
                        f"Request: {task_id} token ratio: {self.tokens_counter[task_id] / (time.time() - task.inference_start_time)}"
                    )
                    llm_logger.info(f"{self.resource_manager.info()}")
                    llm_logger.info(
                        f"Speculate accept ratio: {1 - self.total_step * 1.0 / self.number_of_output_tokens}"
                        f" total step: {self.total_step}. total_output_token_num: {self.number_of_output_tokens}"
                    )
                    self._recycle_resources(task_id, i, task)
                    main_process_metrics.num_requests_running.dec(1)
                    main_process_metrics.request_inference_time.observe(
                        current_time - task.inference_start_time)
                    break
            batch_result.append(result)

        self.postprocess(batch_result)
class WarmUpTokenProcessor(TokenProcessor):
    """Token processor used during engine warm-up.

    Behaves like :class:`TokenProcessor` except that results are discarded
    (``postprocess`` is a no-op) and the polling loop can be stopped
    cooperatively via :meth:`stop`.
    """

    def __init__(self, cfg, cached_generated_tokens=None):
        # Bug fix: TokenProcessor.__init__ requires a second positional
        # argument, so the previous super().__init__(cfg) call raised
        # TypeError at construction. Warm-up discards all results (see
        # postprocess), so the sink may safely default to None; the extra
        # defaulted parameter keeps the old call sites working.
        super().__init__(cfg, cached_generated_tokens)
        self._is_running = True   # cleared by stop() to end the loop
        self._is_blocking = True  # block in get_output until data arrives

    def postprocess(self, batch_result):
        """Discard warm-up results; there is no consumer during warm-up."""
        pass

    def process_sampling_results(self):
        """
        get output from model and process it

        Same op selection and decode loop as
        TokenProcessor.process_sampling_results, but the loop exits once
        stop() clears the running flag.
        """
        from fastdeploy.model_executor.models import \
            inference_runner_supported_models
        if self.cfg.model_config.architectures not in inference_runner_supported_models \
                and "ErnieMoEVLForCausalLM" not in self.cfg.model_config.architectures:
            from paddlenlp_ops import get_output, speculate_get_output
        else:
            os.environ["ELLM_LOG_LEVEL"] = "3"
            use_pip_eff_llm = os.getenv('USE_PIP_EFF_LLM')
            if use_pip_eff_llm is None:
                from fastdeploy.model_executor.ops.gpu import (
                    get_output, speculate_get_output)
            else:
                from efficientllm.ops.gpu import (get_output,
                                                  speculate_get_output)

        while self._is_running:
            try:
                rank_id = 0
                if self.is_speculate_decoding:
                    speculate_get_output(self.output_tokens, rank_id,
                                         self._is_blocking)
                else:
                    get_output(self.output_tokens, rank_id, self._is_blocking)
                # -2 in slot 0 is the "no output ready" sentinel.
                if self.output_tokens[0, 0] == -2:
                    continue
                self._process_batch_output()
            except Exception as e:
                llm_logger.info("while get input_data error: {0} {1}".format(
                    e, str(traceback.format_exc())))

    def stop(self):
        """
        stop warm up thread

        Clears the running flag, joins the worker thread and releases it.
        """
        self._is_running = False
        self.worker.join()
        llm_logger.info("warm up thread stop")
        del self.worker