mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
cef3164c3b
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FD Image Build (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
* delete impl * delete min_length&max_length * support limit thinking content strategy * fix * fix * fix * update * fix set_value_by_flags_and_idx * fix * fix * fix * fix * update * fix * fix * fix typo * fix ci * fix * fix * support mtp * fix * fix * update * update
251 lines
8.2 KiB
Python
251 lines
8.2 KiB
Python
"""
|
|
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import typing
|
|
from pathlib import Path
|
|
|
|
from fastdeploy.config import ModelConfig
|
|
from fastdeploy.entrypoints.cli.types import CLISubcommand
|
|
from fastdeploy.input.preprocess import InputPreprocessor
|
|
|
|
if typing.TYPE_CHECKING:
|
|
from fastdeploy.utils import FlexibleArgumentParser
|
|
|
|
|
|
class TokenizerSubcommand(CLISubcommand):
    """CLI subcommand registered under the name ``tokenizer``.

    It wires the tokenizer-related command line options into a sub-parser and
    hands the parsed arguments over to the module-level ``main`` function.
    """

    name = "tokenizer"

    @staticmethod
    def cmd(args: argparse.Namespace) -> None:
        """Execute the subcommand by delegating to ``main(args)``."""
        main(args)

    def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
        """Create the ``tokenizer`` sub-parser and register its options.

        Args:
            subparsers: The sub-parsers action whose ``add_parser`` is used to
                create the parser for this subcommand.

        Returns:
            The newly created parser with all tokenizer options attached.
        """
        parser = subparsers.add_parser(
            name=self.name,
            help="Start the FastDeploy Tokenizer Server.",
            description="Start the FastDeploy Tokenizer Server.",
            usage="fastdeploy tokenizer [--encode/-e TEXT] [--decode/-d TEXT]",
        )

        # Common option: which model's tokenizer to load.
        parser.add_argument(
            "--model_name_or_path",
            "--model",
            "-m",
            type=str,
            default="baidu/ERNIE-4.5-0.3B-PT",
            help="Path to model or model identifier",
        )

        # Boolean switches, registered in the same order as they appear in --help.
        boolean_switches = (
            ("--enable-mm", "-mm", "Enable multi-modal support"),
            ("--vocab-size", "-vs", "Show vocabulary size"),
            ("--info", "-i", "Show tokenizer information"),
        )
        for long_flag, short_flag, description in boolean_switches:
            parser.add_argument(long_flag, short_flag, action="store_true", help=description)

        parser.add_argument(
            "--vocab-export",
            "-ve",
            type=str,
            metavar="FILE",
            help="Export vocabulary to file",
        )

        # Options that carry a payload to encode or decode.
        payload_options = (
            ("--encode", "-e", "Encode text to tokens"),
            ("--decode", "-d", "Decode tokens to text"),
        )
        for long_flag, short_flag, description in payload_options:
            parser.add_argument(long_flag, short_flag, default=None, help=description)

        return parser
|
|
|
|
|
|
def cmd_init() -> list[CLISubcommand]:
    """Return the subcommands contributed by this module.

    Returns:
        A one-element list holding a freshly constructed
        ``TokenizerSubcommand``.
    """
    subcommands = [TokenizerSubcommand()]
    return subcommands
|
|
|
|
|
|
def get_vocab_size(tokenizer) -> int:
    """Return the vocabulary size of ``tokenizer`` on a best-effort basis.

    Lookup order:
        1. the ``vocab_size`` attribute, if present;
        2. the result of calling ``get_vocab_size()``, if present;
        3. otherwise the constant 100295 (noted by the original author as the
           fixed vocabulary size of Ernie4_5Tokenizer).

    Returns:
        The size found as described above, or 0 if probing the tokenizer
        raised an exception.
    """
    try:
        if hasattr(tokenizer, "vocab_size"):
            return tokenizer.vocab_size
        if hasattr(tokenizer, "get_vocab_size"):
            return tokenizer.get_vocab_size()
    except Exception:
        return 0

    # Neither accessor exists: fall back to the fixed Ernie4_5Tokenizer size.
    return 100295
|
|
|
|
|
|
def get_tokenizer_info(tokenizer) -> dict:
    """Collect descriptive metadata about ``tokenizer``.

    The result may contain the following keys, inserted in this order:

    * ``vocab_size`` - value returned by ``get_vocab_size``.
    * ``model_name`` - ``tokenizer.name_or_path``, only when that attribute exists.
    * ``tokenizer_type`` - the class name of the tokenizer object.
    * ``special_tokens`` - truthy special-token attributes (``bos_token``, ...).
    * ``special_token_ids`` - special-token id attributes that are not ``None``.
    * ``model_max_length`` - only when that attribute exists.

    If anything raises while collecting, the entries gathered so far are kept
    and an ``error`` entry describing the failure is added.

    Returns:
        The metadata dictionary described above.
    """
    special_names = (
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
    )
    info = {}

    try:
        # Basic attributes.
        info["vocab_size"] = get_vocab_size(tokenizer)

        # Model name or path the tokenizer reports for itself.
        if hasattr(tokenizer, "name_or_path"):
            info["model_name"] = tokenizer.name_or_path

        # Concrete tokenizer class.
        info["tokenizer_type"] = type(tokenizer).__name__

        # Special tokens: keep only attributes that exist and are truthy.
        found_tokens = {}
        for attr_name in special_names:
            if not hasattr(tokenizer, attr_name):
                continue
            value = getattr(tokenizer, attr_name)
            if value:
                found_tokens[attr_name] = value
        info["special_tokens"] = found_tokens

        # Special token ids: keep anything that is not None (0 is a valid id).
        found_ids = {}
        for attr_name in special_names:
            id_attr = f"{attr_name}_id"
            if not hasattr(tokenizer, id_attr):
                continue
            value = getattr(tokenizer, id_attr)
            if value is not None:
                found_ids[id_attr] = value
        info["special_token_ids"] = found_ids

        # Maximum model length, when the tokenizer exposes it.
        if hasattr(tokenizer, "model_max_length"):
            info["model_max_length"] = tokenizer.model_max_length

    except Exception as exc:
        info["error"] = f"Failed to get tokenizer info: {exc}"

    return info
|
|
|
|
|
|
def get_vocab_dict(tokenizer) -> dict:
    """Return the vocabulary mapping exposed by ``tokenizer``.

    The first accessor that exists wins, probed in this order:
    ``tokenizer.vocab``, ``tokenizer.get_vocab()``,
    ``tokenizer.tokenizer.vocab`` and finally ``tokenizer.encoder``.

    Returns:
        Whatever the selected accessor yields, or an empty dict when no
        accessor exists or when probing raised an exception.
    """
    try:
        if hasattr(tokenizer, "vocab"):
            return tokenizer.vocab
        if hasattr(tokenizer, "get_vocab"):
            return tokenizer.get_vocab()
        # Wrapper objects may keep the real tokenizer in a ``tokenizer`` attribute.
        if hasattr(tokenizer, "tokenizer") and hasattr(tokenizer.tokenizer, "vocab"):
            return tokenizer.tokenizer.vocab
        if hasattr(tokenizer, "encoder"):
            return tokenizer.encoder
    except Exception:
        return {}

    return {}
|
|
|
|
|
|
def export_vocabulary(tokenizer, file_path: str) -> None:
    """Export the tokenizer's vocabulary to ``file_path``.

    The vocabulary is obtained through ``get_vocab_dict``. Missing parent
    directories of ``file_path`` are created. The output format depends on the
    file extension:

    * ``.json`` (case-insensitive): the whole mapping is dumped as JSON.
    * anything else: one ``<token_id>\\t<repr(token)>`` line per token,
      sorted by token id.

    Progress and failures are reported with ``print``; the function does not
    raise for ordinary errors.

    Args:
        tokenizer: Tokenizer object to read the vocabulary from.
        file_path: Destination file path.
    """
    try:
        vocab = get_vocab_dict(tokenizer)
        if not vocab:
            print("Warning: Could not retrieve vocabulary from tokenizer")
            return

        path = Path(file_path)
        path.parent.mkdir(parents=True, exist_ok=True)

        # Pick the output format from the file extension.
        if path.suffix.lower() == ".json":
            with open(path, "w", encoding="utf-8") as f:
                json.dump(vocab, f, ensure_ascii=False, indent=2)
        else:
            # Default format: one token per line.
            with open(path, "w", encoding="utf-8") as f:
                for token, token_id in sorted(vocab.items(), key=lambda item: item[1]):
                    # Best-effort handling of tokens that cannot be written.
                    try:
                        f.write(f"{token_id}\t{repr(token)}\n")
                    except Exception:
                        # Was a bare ``except:``, which also swallowed
                        # KeyboardInterrupt/SystemExit and made the export
                        # impossible to interrupt cleanly.
                        f.write(f"{token_id}\t<unprintable>\n")

        print(f"Vocabulary exported to: {file_path}")
        print(f"Total tokens: {len(vocab)}")

    except Exception as e:
        print(f"Error exporting vocabulary: {e}")
|
|
|
|
|
|
def main(args: argparse.Namespace) -> None:
    """Run the tokenizer operations selected on the command line.

    Reads ``encode``, ``decode``, ``vocab_size``, ``info``, ``vocab_export``
    and ``model_name_or_path`` from ``args``. If none of the five operation
    options is set, a hint is printed and the function returns before a
    tokenizer is loaded. Otherwise the tokenizer is created once and every
    requested operation is executed in the order encode, decode, vocabulary
    size, info, vocabulary export, followed by a summary line.

    ``--decode`` accepts either a bracketed list literal such as
    ``"[1, 2, 3]"`` or a comma separated list such as ``"1,2,3"``. Decoding
    errors are printed rather than raised.

    NOTE(review): ``args.enable_mm`` is not consulted in this function -
    confirm whether it should influence how the tokenizer is built.

    Args:
        args: Parsed command line arguments for the ``tokenizer`` subcommand.
    """
    # Function-scope import so this block does not depend on a file-level import.
    import ast

    def print_separator(title=""):
        """Print a horizontal rule, optionally followed by a titled header."""
        print(f"\n{'='*50}")
        if title:
            print(f" {title}")
            print(f"{'='*50}")

    # Validate arguments: at least one operation must be requested.
    if not any([args.encode, args.decode, args.vocab_size, args.info, args.vocab_export]):
        # The flag is registered as --vocab-export; the message previously
        # advertised a non-existent --export-vocab option.
        print("请至少指定一个参数:--encode, --decode, --vocab-size, --info, --vocab-export")
        return

    # Initialise the tokenizer.
    preprocessor = InputPreprocessor(model_config=ModelConfig({"model": args.model_name_or_path}))
    tokenizer = preprocessor.create_processor().tokenizer

    # Run the requested operations.
    operations_count = 0

    if args.encode:
        print_separator("ENCODING")
        print(f"Input text: {args.encode}")
        encoded_text = tokenizer.encode(args.encode)
        print(f"Encoded tokens: {encoded_text}")
        operations_count += 1

    if args.decode:
        print_separator("DECODING")
        print(f"Input tokens: {args.decode}")
        try:
            if isinstance(args.decode, str):
                if args.decode.startswith("[") and args.decode.endswith("]"):
                    # SECURITY: this used to be eval(), which executes
                    # arbitrary code taken from the command line.
                    # ast.literal_eval only accepts Python literals, which is
                    # all a token-id list needs.
                    tokens = ast.literal_eval(args.decode)
                else:
                    tokens = [int(x.strip()) for x in args.decode.split(",")]
            else:
                tokens = args.decode

            decoded_text = tokenizer.decode(tokens)
            print(f"Decoded text: {decoded_text}")
        except Exception as e:
            print(f"Error decoding tokens: {e}")
        operations_count += 1

    if args.vocab_size:
        print_separator("VOCABULARY SIZE")
        print(f"Vocabulary size: {get_vocab_size(tokenizer)}")
        operations_count += 1

    if args.info:
        print_separator("TOKENIZER INFO")
        print(json.dumps(get_tokenizer_info(tokenizer), indent=2))
        operations_count += 1

    if args.vocab_export:
        print_separator("EXPORT VOCABULARY")
        export_vocabulary(tokenizer, args.vocab_export)
        operations_count += 1

    print_separator()
    print(f"Completed {operations_count} operation(s)")
|