Sync v2.0 version of code to GitHub repo

This commit is contained in:
Jiang-Jia-Jun
2025-06-29 23:29:37 +00:00
parent d151496038
commit 92c2cfa2e7
597 changed files with 78776 additions and 22905 deletions
+444
@@ -0,0 +1,444 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import numpy as np
from paddleformers.generation import GenerationConfig
from fastdeploy import envs
from fastdeploy.utils import data_processor_logger
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.input.text_processor import BaseDataProcessor
_SAMPLING_EPS = 1e-5
class ErnieProcessor(BaseDataProcessor):
"""
Initialize the model instance.
Args:
    model_name_or_path (str): model name or path.
Attributes:
    model_name_or_path (str): stores the model name or path.
    decode_status (dict): stores per-request decoding status.
    tokenizer (object): stores the tokenizer instance.
    eos_token_ids (list): stores the list of end-of-sequence token IDs.
    eos_token_id_len (int): stores the length of the EOS token ID list.
    pad_token_id (int): stores the padding token ID.
"""
def __init__(self, model_name_or_path, reasoning_parser_obj=None):
self.model_name_or_path = model_name_or_path
data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
self._init_config()
self.decode_status = dict()
self.thinking_parser_dict = dict()
self._load_tokenizer()
data_processor_logger.info(
f"tokenizer information: bos_token is {self.tokenizer.bos_token} \
{self.tokenizer.bos_token_id}, \
eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} "
)
self.eos_token_ids = [self.tokenizer.eos_token_id]
self.eos_token_id_len = len(self.eos_token_ids)
self.pad_token_id = self.get_pad_id()
self.reasoning_parser = None
if reasoning_parser_obj:
self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
def _init_config(self):
self.use_hf_tokenizer = int(envs.FD_USE_HF_TOKENIZER) == 1
# Generation config
try:
self.generation_config = GenerationConfig.from_pretrained(
self.model_name_or_path)
except Exception as e:
data_processor_logger.warning(
f"Can't find generation config, so it will not use "
f"generation_config field in the model config, details={e}")
self.generation_config = None
def process_request(self, request, max_model_len=None, **kwargs):
"""
Preprocess the request
Args:
request (Request): may contain prompt and messages fields
Returns:
    Request: the preprocessed request
"""
request = self._apply_default_parameters(request)
if request.get("eos_token_ids") is None or len(
request.eos_token_ids) == 0:
request.eos_token_ids = self.eos_token_ids
stop_sequences = request.get("stop", [])
if stop_sequences is not None and len(stop_sequences) != 0:
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
request.set("stop_token_ids", stop_seqs)
request.set("stop_seqs_len", stop_seqs_len)
if request.prompt_token_ids is None or len(
request.prompt_token_ids) == 0:
system = request.get("system")
if request.prompt is None and request.messages is None:
raise ValueError(
f"The request should have `input_ids`, `text` or `messages`: {request}.")
if request.prompt is not None or not request.raw_request:
prompt = request.prompt if request.prompt is not None else request.messages[0]
prompt = prompt[0] if isinstance(prompt, list) else prompt
tokens = self.tokenizer.tokenize(prompt)
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
request.prompt_token_ids = token_ids
data_processor_logger.info(f"req_id:{request.request_id}, tokens:{tokens}, token_ids: {token_ids}")
else:
request.prompt_token_ids = self.messages2ids(request.to_dict())
if max_model_len is not None and len(
request.prompt_token_ids) > max_model_len:
request.prompt_token_ids = request.prompt_token_ids[:
max_model_len -
1]
if request.get("max_tokens") is None:
request.set("max_tokens",
max(1, max_model_len - len(request.prompt_token_ids)))
if request.get("temperature") < _SAMPLING_EPS:
# zero temperature is equivalent to greedy sampling
request.set("temperature", 1)
data_processor_logger.info(f"Processed request {request}")
return request
def process_request_dict(self, request, max_model_len=None):
"""
Preprocess the request
Args:
request (Dict): may contain prompt and messages fields
Returns:
    Dict: the preprocessed request
"""
request = self._apply_default_parameters(request)
if not request.get('eos_token_ids'):
request['eos_token_ids'] = self.eos_token_ids
# Handle stop_sequences
stop_sequences = request.get('stop', [])
if stop_sequences:
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
request['stop_token_ids'] = stop_seqs
request['stop_seqs_len'] = stop_seqs_len
system = request.get("system")
# Handle prompt_token_ids
if not request.get('prompt_token_ids'):
if request.get('prompt') is None and request.get(
'messages') is None:
raise ValueError(
f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}"
)
if request.get('prompt'):
prompt = request.get('prompt')
prompt = prompt[0] if isinstance(prompt, list) else prompt
tokens = self.tokenizer.tokenize(prompt)
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
request['prompt_token_ids'] = token_ids
req_id = request.get("request_id", None)
data_processor_logger.info(
f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}"
)
else:
request['prompt_token_ids'] = self.messages2ids(request)
# Truncate prompts that exceed the length limit
if max_model_len is not None and len(
request['prompt_token_ids']) > max_model_len:
request['prompt_token_ids'] = request[
'prompt_token_ids'][:max_model_len - 1]
if request.get("max_tokens") is None:
request["max_tokens"] = max(
1, max_model_len - len(request['prompt_token_ids']))
if request.get("temperature") < _SAMPLING_EPS:
# zero temperature is equivalent to greedy sampling
request["temperature"] = 1
data_processor_logger.info(f"Processed request {request}")
return request
def process_response(self, response_dict, **kwargs):
"""
Postprocess the response
Args:
    response_dict (Dict): response from the engine, containing token id fields
Returns:
    Dict: response containing text fields
"""
req_id = response_dict.request_id
token_ids = response_dict.outputs.token_ids
response_dict.usage = {
"completion_tokens": response_dict.outputs.index + 1
}
if token_ids[-1] == self.tokenizer.eos_token_id:
token_ids = token_ids[:-1]
full_text = self.tokenizer.decode(token_ids)
if self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
full_text, response_dict)
response_dict.outputs.text = text
response_dict.outputs.reasoning_content = reasoning_content
else:
response_dict.outputs.text = full_text
data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
if response_dict.outputs.text == "" and \
response_dict.outputs.reasoning_content == "" and \
response_dict.outputs.tool_call_content == []:
return None
return response_dict
def process_response_dict(self, response_dict, stream, **kwargs):
"""
Postprocess the response
Args:
    response_dict (Dict): response from the engine, containing token id fields
Returns:
    Dict: response containing text fields
"""
if stream:
return self.process_response_dict_streaming(
response_dict, **kwargs)
else:
return self.process_response_dict_normal(response_dict, **kwargs)
def process_response_dict_normal(self, response_dict, **kwargs):
"""
Postprocess the response
Args:
    response_dict (Dict): response from the engine, containing token id fields
Returns:
    Dict: response containing text fields
"""
token_ids = response_dict["outputs"]["token_ids"]
is_end = response_dict["finished"]
req_id = response_dict["request_id"]
if is_end and len(token_ids) > 0:
if token_ids[-1] == self.tokenizer.eos_token_id:
token_ids = token_ids[:-1]
delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
if is_end:
full_text = previous_texts + delta_text
if self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
full_text, response_dict)
response_dict["outputs"]["text"] = text
response_dict["outputs"][
"reasoning_content"] = reasoning_content
else:
response_dict["outputs"]["text"] = full_text
data_processor_logger.info(
f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}"
)
del self.decode_status[req_id]
return response_dict
def process_response_dict_streaming(self, response_dict, **kwargs):
"""
Postprocess the streaming response
Args:
    response_dict (Dict): response from the engine, containing token id fields
Returns:
    Dict: response containing text fields
"""
enable_thinking = kwargs.get("enable_thinking")
is_end = response_dict["finished"]
req_id = response_dict["request_id"]
token_ids = response_dict["outputs"]["token_ids"]
if is_end and len(token_ids) > 0:
if token_ids[-1] == self.tokenizer.eos_token_id:
token_ids = token_ids[:-1]
delta_text, previous_token_ids, previous_texts = self.ids2tokens(
token_ids, req_id)
if enable_thinking and self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming(
previous_texts, previous_texts + delta_text, delta_text,
previous_token_ids, previous_token_ids + token_ids, token_ids)
response_dict["outputs"]["text"] = text
response_dict["outputs"]["reasoning_content"] = reasoning_content
else:
response_dict["outputs"]["text"] = delta_text
if is_end:
data_processor_logger.info(
f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}"
)
del self.decode_status[req_id]
return response_dict
def messages2ids(self, request_or_messages):
"""
Convert multi-turn messages into ID sequences.
Args:
request_or_messages: Either a request dict containing 'messages' field,
or a list of message dicts directly
Returns:
List[int]: the token IDs
"""
if self.tokenizer.chat_template is None:
raise ValueError("This model does not support chat_template.")
spliced_message = self.tokenizer.apply_chat_template(
request_or_messages,
tokenize=False,
split_special_tokens=False,
add_special_tokens=False)
req_id = None
if isinstance(request_or_messages, dict):
req_id = request_or_messages.get("request_id", None)
tokens = self.tokenizer.tokenize(spliced_message)
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
data_processor_logger.info(
f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
return token_ids
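# Hedged illustration (not part of this commit; the message content below is
# invented): messages2ids renders the chat template over the request's
# messages, then tokenizes the rendered text into ids.
#
# token_ids = processor.messages2ids({
#     "request_id": "req-0",
#     "messages": [{"role": "user", "content": "Hi"}],
# })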
def ids2tokens(self, token_id, task_id):
"""
Incrementally decode token ids into strings.
Args:
    token_id (List[int]): newly generated token ids
    task_id (str): task id
Returns:
    Tuple[str, List[int], str]: newly decoded text, previous token ids, previous text
"""
if task_id not in self.decode_status:
# prefix offset & read offset & history token ids & history token strings
self.decode_status[task_id] = [0, 0, [], ""]
prefix_offset = self.decode_status[task_id][0]
read_offset = self.decode_status[task_id][1]
previous_token_ids = self.decode_status[task_id][2]
previous_texts = self.decode_status[task_id][3]
decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(
previous_token_ids + token_id, prefix_offset, read_offset)
self.decode_status[task_id][0] = prefix_offset
self.decode_status[task_id][1] = read_offset
self.decode_status[task_id][2] += token_id
self.decode_status[task_id][3] += decode_str
return decode_str, previous_token_ids, previous_texts
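# Hedged illustration (token values are invented): ids2tokens keeps
# per-request state in self.decode_status so streaming callers only receive
# the newly decoded suffix on each call.
#
#   delta, prev_ids, prev_text = processor.ids2tokens([101, 102], "req-1")
#   # decode_status["req-1"] -> [prefix_offset, read_offset, [101, 102], delta]
#   delta, prev_ids, prev_text = processor.ids2tokens([103], "req-1")
#   # history grows to [101, 102, 103]; delta holds only the new text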
def _load_tokenizer(self):
"""
Load the tokenizer (ErnieBotTokenizer) from the model path
and store it on self.tokenizer.
"""
vocab_file_names = [
"tokenizer.model", "spm.model", "ernie_token_100k.model"
]
for i in range(len(vocab_file_names)):
if os.path.exists(
os.path.join(self.model_name_or_path,
vocab_file_names[i])):
ErnieBotTokenizer.resource_files_names[
"vocab_file"] = vocab_file_names[i]
break
self.tokenizer = ErnieBotTokenizer.from_pretrained(
self.model_name_or_path)
def get_pad_id(self):
"""
Get pad_token_id; if the tokenizer has no pad_token_id, fall back to the eos token.
Returns:
int: pad_token_id
"""
# if isinstance(self.tokenizer, (LlamaTokenizer, Llama3Tokenizer)) and not self.tokenizer.pad_token_id:
# return self.tokenizer.eos_token
return self.tokenizer.pad_token_id
def pad_batch_data(self,
insts,
pad_id=0,
return_seq_len=False,
return_array=True,
pad_style="right"):
"""Pad the instances to the max sequence length in batch."""
if len(insts) == 0:
padded_insts = np.array([[]],
dtype=np.int64) if return_array else [[]]
if return_seq_len:
seq_len = np.array([], dtype=np.int64) if return_array else []
return padded_insts, seq_len
return padded_insts
max_len = max(map(len, insts))
if pad_style == "left":
padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst)
for inst in insts]
else:
padded_insts = [
list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts
]
if return_array:
padded_insts = np.array(padded_insts,
dtype=np.int64).reshape([-1, max_len])
if return_seq_len:
seq_len = [len(inst) for inst in insts]
if return_array:
seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1)
return padded_insts, seq_len
return padded_insts
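# Hedged sketch of pad_batch_data semantics, inferred from the code above
# (not from upstream docs):
#
#   insts = [[1, 2, 3], [4, 5]]
#   pad_batch_data(insts, pad_id=0, return_array=False)
#       -> [[1, 2, 3], [4, 5, 0]]
#   pad_batch_data(insts, pad_id=0, return_array=False, pad_style="left")
#       -> [[1, 2, 3], [0, 4, 5]]
#   pad_batch_data(insts, pad_id=-1, return_seq_len=True, return_array=False)
#       -> ([[1, 2, 3], [4, 5, -1]], [3, 2])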
def update_stop_seq(self, stop_sequences):
"""
Update stop sequences from request.
"""
stop_seqs = []
for seq in stop_sequences:
if seq != self.tokenizer.eos_token_id:
stop_seqs.append(
self.tokenizer.convert_tokens_to_ids(
self.tokenizer.tokenize(seq)))
stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs,
pad_id=-1,
return_seq_len=True,
return_array=False)
data_processor_logger.debug(
f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}")
return stop_seqs, stop_seqs_len
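# Minimal usage sketch (illustration only; the model path, request fields and
# values below are assumptions, not part of this commit). process_request_dict
# fills eos/stop token ids, tokenizes `prompt` or `messages`, truncates to
# max_model_len, and derives a default `max_tokens`:
#
# processor = ErnieProcessor("/path/to/ernie-model")
# request = {"request_id": "req-0", "prompt": "Hello", "temperature": 0.8}
# request = processor.process_request_dict(request, max_model_len=4096)
# print(request["prompt_token_ids"], request["max_tokens"])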
+285 -170
@@ -14,159 +14,156 @@
# limitations under the License.
"""
# cipher_token=WjI1fQOvhN # do not edit this line
import os
import re
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple
from typing import Dict, Optional, Tuple, List
import numpy as np
import sentencepiece as spm
from paddlenlp.transformers import AddedToken, PretrainedTokenizer
from paddlenlp.utils import logger
__all__ = ["ErnieBotTokenizer"]
import paddle
VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {},
"tokenizer_file": {},
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
from paddleformers.utils.log import logger
from paddleformers.transformers import PretrainedTokenizer
from paddleformers.transformers.tokenizer_utils_base import (
PaddingStrategy,
TextInput,
)
class ErnieBotTokenizer(PretrainedTokenizer):
"""
Construct an ErnieBot tokenizer, based on byte-level Byte-Pair-Encoding.
Args:
vocab_file (`str`):
Path to the vocabulary file.
A more convenient `ErnieBotTokenizer`.
It can encode the special tokens used in the current SFT/PPO stages and also supports multimodal input.
"""
vocab_files_names = VOCAB_FILES_NAMES
resource_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
resource_files_names = {
"vocab_file": "tokenizer.model",
}
pretrained_resource_files_map = {"vocab_file": {"ernie-bot-10b": None}}
pretrained_init_configuration = {
"ernie-bot-10b": {},
}
model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
padding_side = "right"
def __init__(
self,
vocab_file,
unk_token="<unk>",
bos_token="<s>",
cls_token="<cls>",
eos_token="</s>",
mask_token="<mask:0>",
pad_token="<pad>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
add_bos_token=True,
add_eos_token=False,
clean_up_tokenization_spaces=False,
sep_token="<sep>",
unk_token="<unk>",
additional_special_tokens=None,
verbose=False,
**kwargs,
):
self.vocab_file = vocab_file
self.add_bos_token = add_bos_token
self.add_eos_token = add_eos_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file)
bos_token = AddedToken(bos_token,
lstrip=False, rstrip=False) if isinstance(
bos_token, str) else bos_token
eos_token = AddedToken(eos_token,
lstrip=False, rstrip=False) if isinstance(
eos_token, str) else eos_token
unk_token = AddedToken(unk_token,
lstrip=False, rstrip=False) if isinstance(
unk_token, str) else unk_token
pad_token = AddedToken(pad_token,
lstrip=False, rstrip=False) if isinstance(
pad_token, str) else pad_token
"""doc"""
if additional_special_tokens is None:
additional_special_tokens = ["<mask:1>", "<mask:7>"]
super().__init__(
bos_token=bos_token,
cls_token=cls_token,
eos_token=eos_token,
unk_token=unk_token,
mask_token=mask_token,
pad_token=pad_token,
add_bos_token=add_bos_token,
add_eos_token=add_eos_token,
sep_token=sep_token,
unk_token=unk_token,
additional_special_tokens=additional_special_tokens,
verbose=False,
sp_model_kwargs=self.sp_model_kwargs,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
# for eb35 reader
self.bos_id = self.bos_token_id
self.eos_id = self.eos_token_id
self.sep_id = self.sep_token_id
self.pad_id = self.pad_token_id
self.unk_id = self.unk_token_id
self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor()
self.sp_model.Load(vocab_file)
def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state
@property
def space_token(self):
"""doc"""
return "<mask:1>"
def __setstate__(self, d):
self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file)
@property
def space_token_id(self):
"""doc"""
return self.sp_model.piece_to_id("<mask:1>")
@property
def gend_token(self):
"""doc"""
return "<mask:7>"
@property
def gend_token_id(self):
"""doc"""
return self.sp_model.piece_to_id("<mask:7>")
@property
def im_start_id(self):
"""doc"""
return self.sp_model.piece_to_id("<|im_start|>")
@property
def im_end_id(self):
"""doc"""
return self.sp_model.piece_to_id("<|im_end|>")
@property
def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()
"""doc"""
return self.sp_model.vocab_size()
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
"""doc"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def tokenize(self, text):
"""Returns a tokenized string."""
return self._tokenize(text)
def _tokenize(self, text):
"""Returns a tokenized string."""
return self.sp_model.encode(text, out_type=str)
def decode(self,
tokens,
skip_special_tokens=False,
clean_up_tokenization_spaces=False):
"""Returns a tokenized string."""
return self.sp_model.decode(tokens)
"""doc"""
return self.sp_model.encode_as_pieces(text)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
"""doc"""
return self.sp_model.piece_to_id(token)
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
token = self.sp_model.IdToPiece(index)
return token
def _convert_id_to_token(self, id):
"""doc"""
return self.sp_model.id_to_piece(id)
def convert_tokens_to_string(self, tokens):
"""Converts a sequence of tokens (string) in a single string."""
current_sub_tokens = []
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
# prev_is_special = False
for token in tokens:
# make sure that special tokens are not decoded using sentencepiece model
if token in self.all_special_tokens:
if not prev_is_special and i != 0:
out_string += " "
# if not prev_is_special:
# out_string += " "
out_string += self.sp_model.decode(current_sub_tokens) + token
prev_is_special = True
# prev_is_special = True
current_sub_tokens = []
else:
current_sub_tokens.append(token)
prev_is_special = False
# prev_is_special = False
out_string += self.sp_model.decode(current_sub_tokens)
return out_string
return out_string # .strip()
def save_vocabulary(self,
save_directory,
filename_prefix: Optional[str] = None) -> Tuple[str]:
def prepare_for_model(self, *args, **kwargs):
"""doc"""
if "add_special_tokens" in kwargs:
kwargs.pop("add_special_tokens")
# logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
return super().prepare_for_model(*args, **kwargs)
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
@@ -176,94 +173,212 @@ class ErnieBotTokenizer(PretrainedTokenizer):
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(
f"Vocabulary path ({save_directory}) should be a directory")
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") +
VOCAB_FILES_NAMES["vocab_file"])
if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file) and os.path.isfile(self.vocab_file):
(filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
return (out_vocab_file, )
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
""" build_inputs_with_special_tokens """
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
output = bos_token_id + token_ids_0 + eos_token_id
if token_ids_1 is not None:
output = output + bos_token_id + token_ids_1 + eos_token_id
return output
def get_special_tokens_mask(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None,
already_has_special_tokens: bool = False) -> List[int]:
def tokenize(self, text: TextInput, **kwargs) -> List[str]:
"""
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
Converts a string in a sequence of tokens, using the tokenizer.
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
(BPE/SentencePieces/WordPieces). Takes care of added tokens.
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not the token list is already formatted with special tokens for the model.
text (`str`):
The sequence to be encoded.
**kwargs (additional keyword arguments):
Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
Returns:
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
`List[str]`: The list of tokens.
"""
if already_has_special_tokens:
return super().get_special_tokens_mask(
token_ids_0=token_ids_0,
token_ids_1=token_ids_1,
already_has_special_tokens=True)
# Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
# all_special_tokens_extended = dict(
# (str(t), t)
# for t in self.all_special_tokens_extended
# if isinstance(t, AddedToken)
# )
bos_token_id = [1] if self.add_bos_token else []
eos_token_id = [1] if self.add_eos_token else []
text, kwargs = self.prepare_for_tokenization(text, **kwargs)
if token_ids_1 is None:
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
# TODO: should this be in the base class?
if hasattr(self, "do_lower_case") and self.do_lower_case:
# convert non-special tokens to lowercase
escaped_special_toks = [
re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
def create_token_type_ids_from_sequences(
self,
token_ids_0: List[int],
token_ids_1: Optional[List[int]] = None) -> List[int]:
"""
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
if token_ids_1 is None, only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of ids.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
no_split_token = set(self.unique_no_split_tokens)
tokens = self.tokens_trie.split(text)
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
# ["This is something", "<special_token_1>", " else"]
# for i, token in enumerate(tokens):
# if token in no_split_token:
# tok_extended = all_special_tokens_extended.get(token, None)
# print(f'>>>{token}|{tok_extended}|{all_special_tokens_extended}<<<')
# left = tokens[i - 1] if i > 0 else None
# right = tokens[i + 1] if i < len(tokens) - 1 else None
# if isinstance(tok_extended, AddedToken):
# if tok_extended.rstrip and right:
# # A bit counter-intuitive but we strip the left of the string
# # since tok_extended.rstrip means the special token is eating all white spaces on its right
# tokens[i + 1] = right.lstrip()
# # Strip white spaces on the left
# if tok_extended.lstrip and left:
# tokens[i - 1] = left.rstrip() # Opposite here
# else:
# We strip left and right by default
# if right:
# tokens[i + 1] = right.lstrip()
# if left:
# tokens[i - 1] = left.rstrip()
# ["This is something", "<special_token_1>", "else"]
tokenized_text = []
for token in tokens:
# Need to skip eventual empty (fully stripped) tokens
if not token:
continue
if token in no_split_token:
tokenized_text.append(token)
else:
tokenized_text.extend(self._tokenize(token))
# ["This", " is", " something", "<special_token_1>", "else"]
return tokenized_text
if token_ids_1 is not None:
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
return output
def _decode(self, *args, **kwargs):
"""doc"""
kwargs.pop("clean_up_tokenization_spaces", None)
kwargs.pop("spaces_between_special_tokens", None)
return super()._decode(
*args,
**kwargs,
clean_up_tokenization_spaces=False,
spaces_between_special_tokens=False,
)
def _pad(
self,
encoded_inputs: Dict,
max_length: Optional[int] = None,
padding_strategy=PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""doc"""
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
if return_attention_mask:
required_input = encoded_inputs[self.model_input_names[0]]
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
if "attention_mask" in encoded_inputs and encoded_inputs["attention_mask"] is not None:
attention_mask = encoded_inputs.pop("attention_mask")
if isinstance(attention_mask, paddle.Tensor):
attention_mask = attention_mask.numpy()
elif isinstance(attention_mask, list):
attention_mask = np.array(attention_mask)
elif not isinstance(attention_mask, np.ndarray):
raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ")
else:
attention_mask = np.tril(np.ones((len(required_input), len(required_input)), dtype=np.int64))
attention_mask = np.expand_dims(attention_mask, axis=0)
if needs_to_be_padded:
difference = max_length - len(required_input)
if self.padding_side == "right":
if attention_mask.ndim == 1:
pad_width = [(0, difference)]
else:
pad_width = [(0, 0), (0, difference), (0, difference)]
elif self.padding_side == "left":
if attention_mask.ndim == 1:
pad_width = [(difference, 0)]
else:
pad_width = [(0, 0), (difference, 0), (difference, 0)]
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
attention_mask = np.pad(
attention_mask,
pad_width=pad_width,
mode="constant",
constant_values=0,
)
encoded_inputs = super()._pad(
encoded_inputs,
max_length,
padding_strategy=padding_strategy,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=False,
)
if return_attention_mask:
encoded_inputs["attention_mask"] = attention_mask.tolist()
return encoded_inputs
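# Hedged note (shapes inferred from the code above): when no attention_mask is
# supplied, _pad builds a causal (lower-triangular) mask of shape
# (1, seq_len, seq_len) and pads it on the configured padding_side up to
# max_length, so a right-padded input of length 3 padded to 5 yields a
# (1, 5, 5) mask whose last two rows and columns are zero.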
def add_special_tokens(
tokenizer,
special_tokens_info,
use_ocr_specialtoken=False,
use_crop_specialtoken=False,
special_token_ids_start=254208,
special_token_ids_end=256256,
):
"""
Add special tokens.
Placeholders [<|IMAGE_PLACEHOLDER|>, <|AUDIO_PLACEHOLDER|>, <|VIDEO_PLACEHOLDER|>]: 3 in total.
Modality start/end special tokens [<|BOI|> <|EOI|> <|BOA|> <|EOA|> <|BOV|> <|EOV|>].
OCR special tokens [<|LOC_0|> <|LOC_1|> ... <|LOC_1000|>]: 1001 in total.
Crop special tokens [<|CROP_COL_SEP|>, <|CROP_ROW_SEP|>, <|CROP_IMAGE_SEP|>]: 3 in total.
    <|CROP_COL_SEP|> splits along the column dimension (image width, replaces the plain-text comma)
    <|CROP_ROW_SEP|> splits along the row dimension (image height, replaces the plain-text newline)
    <|CROP_IMAGE_SEP|> separates the original image from its crops (replaces two plain-text newlines)
2048 unused tokens in total.
Args:
    tokenizer (ErnieTokenizer): tokenizer
    special_token_ids_start (int, optional): starting id for special tokens. Defaults to 254208.
    special_token_ids_end (int, optional): maximum supported vocabulary size. Defaults to 256256.
"""
special_tokens = [
special_tokens_info["image_placeholder"],
special_tokens_info["audio_placeholder"],
]
if use_ocr_specialtoken:
special_tokens.extend(special_tokens_info["ocr_coor"])
special_tokens.extend(special_tokens_info["ocr_begin_end"])
if use_crop_specialtoken:
special_tokens.extend(special_tokens_info["crop"])
# add special_tokens
additional_special_tokens = {"additional_special_tokens": special_tokens}
tokenizer.add_special_tokens(additional_special_tokens)
# check
first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"]
assert first_special_tokens[0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
assert (
len(tokenizer.get_vocab()) < special_token_ids_end
), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end} 增加过多special token了!"
+260
@@ -0,0 +1,260 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import os
import numpy as np
import re
from fastdeploy.input.mm_processor import DataProcessor, IDS_TYPE_FLAG
from fastdeploy.input.ernie_processor import ErnieProcessor
from fastdeploy.engine.request import Request
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
from fastdeploy.utils import data_processor_logger
class ErnieMoEVLProcessor(ErnieProcessor):
"""The processor class for ERNIE MoE VL models."""
def __init__(self, model_name_or_path, limit_mm_per_prompt=None, mm_processor_kwargs=None,
reasoning_parser_obj=None):
self.use_hf_tokenizer = False
if "merge_llm_model" in model_name_or_path:
model_name_or_path = os.path.dirname(model_name_or_path)
data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
tokenizer_path = model_name_or_path
preprocessor_path = model_name_or_path
processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
self.ernie_processor = DataProcessor(
tokenizer_name=tokenizer_path,
image_preprocessor_name=preprocessor_path,
**processor_kwargs
)
self.ernie_processor.eval()
self.image_patch_id = self.ernie_processor.image_patch_id
self.spatial_conv_size = self.ernie_processor.spatial_conv_size
self.decode_status = dict()
self._load_tokenizer()
self.eos_token_ids = [self.tokenizer.eos_token_id]
self.eos_token_id_len = len(self.eos_token_ids)
self.pad_token_id = self.get_pad_id()
self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
self.reasoning_parser = None
if reasoning_parser_obj:
self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
def get_pad_id(self):
"""get pad id"""
return self.tokenizer.pad_token_id
def _load_tokenizer(self):
"""
Load the tokenizer from the underlying multimodal DataProcessor
and store it on self.tokenizer.
"""
self.tokenizer = self.ernie_processor.tokenizer
def process_request(self, request, max_model_len=None, **kwargs):
"""process the input data"""
task = request.to_dict()
task['enable_thinking'] = kwargs.get("enable_thinking", True)
self.process_request_dict(task, max_model_len)
request = Request.from_dict(task)
return request
def _parse_processor_kwargs(self, kwargs):
"""解析多模态处理器参数配置"""
if not kwargs:
return {}
try:
if not isinstance(kwargs, dict):
raise ValueError("mm-processor-kwargs must be a dictionary")
# Validate parameter types
data_processor_logger.info(f"kwargs:{kwargs}")
expected_types = {
"spatial_conv_size": int,
"temporal_conv_size": int,
"image_min_pixels": int,
"image_max_pixels": int,
"video_min_pixels": int,
"video_max_pixels": int,
"video_target_frames": int,
"video_frames_sample": str,
"video_max_frames": int,
"video_min_frames": int,
"video_fps": int
}
for key, value in kwargs.items():
if key in expected_types and not isinstance(value, expected_types[key]):
raise ValueError(
f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}")
return kwargs
except Exception as e:
data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
return {}
def _parse_limits(self, limits):
"""解析多模态限制配置"""
DEFAULT_LIMITS = {
"image": 1,
"video": 1,
"audio": 1
}
if not limits:
return DEFAULT_LIMITS
try:
if not isinstance(limits, dict):
raise ValueError("limit-mm-per-prompt must be a dictionary")
data_processor_logger.info(f"_parse_limits:{limits}")
return {**DEFAULT_LIMITS, **limits}
except Exception as e:
data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
return DEFAULT_LIMITS
def _check_mm_limits(self, item):
if isinstance(item, dict):
# Request contains prompt and multi_modal_data
mm_data = item
else:
# Request contains messages
mm_data = {
"image": [],
"video": []
}
for message in item:
if isinstance(message.get("content"), list):
for part in message["content"]:
if part.get("type") == "image":
mm_data["image"].append(part)
elif part.get("type") == "video":
mm_data["video"].append(part)
for modality, data in mm_data.items():
if modality in self.limit_mm_per_prompt:
limit = self.limit_mm_per_prompt[modality]
if len(data) > limit:
raise ValueError(
f"Too many {modality} items in prompt, "
f"got {len(data)} but limit is {limit}"
)
def process_request_dict(self, request, max_model_len=None):
"""process the input data"""
if not request.get("eos_token_ids"):
request["eos_token_ids"] = self.eos_token_ids
stop_sequences = request.get("stop", [])
if stop_sequences:
stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
request["stop_token_ids"] = stop_seqs
request["stop_seqs_len"] = stop_seqs_len
if request.get("prompt"):
multimodal_data = request.get("multimodal_data")
if multimodal_data is None:
multimodal_data = {}
self._check_mm_limits(multimodal_data)
images = multimodal_data.get("image", None)
videos = multimodal_data.get("video", None)
outputs = self.ernie_processor.text2ids(request["prompt"], images, videos)
elif request.get("messages"):
messages = request["messages"]
self._check_mm_limits(messages)
outputs = self.ernie_processor.request2ids(request)
else:
raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")
metadata = request.get("metadata")
# If metadata carries previously generated tokens, append them to the end of input_ids
if metadata and metadata.get("generated_token_ids"):
self.append_generated_tokens(outputs, metadata["generated_token_ids"])
outputs = self.pack_outputs(outputs)
request["prompt_token_ids"] = outputs["input_ids"]
request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
request["multimodal_inputs"] = outputs
# Truncate prompts that exceed the length limit
if max_model_len is not None and len(
request['prompt_token_ids']) > max_model_len:
request['prompt_token_ids'] = request[
'prompt_token_ids'][:max_model_len - 1]
if request.get("max_tokens") is None:
request["max_tokens"] = max(
1, max_model_len - len(request['prompt_token_ids']))
data_processor_logger.info(f"Processed request {request}")
return request
def append_generated_tokens(self, multimodal_inputs, generated_token_ids):
"append already generated tokens"
num_tokens = len(generated_token_ids)
multimodal_inputs["input_ids"].extend(generated_token_ids)
multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)
start = multimodal_inputs["cur_position"]
for i in range(num_tokens):
multimodal_inputs["position_ids"].append([start + i] * 3)
multimodal_inputs["cur_position"] += num_tokens
def pack_outputs(self, outs):
# Stack or nullify image-related fields
if not outs["images"]:
outs["images"] = None
outs["grid_thw"] = None
outs["image_type_ids"] = None
else:
outs["images"] = np.vstack(outs["images"])
outs["grid_thw"] = np.vstack(outs["grid_thw"])
outs["image_type_ids"] = np.array(outs["image_type_ids"])
# Convert lists to arrays
outs["input_ids"] = np.array(outs["input_ids"], dtype=np.int64)
outs["token_type_ids"] = np.array(outs["token_type_ids"], dtype=np.int64)
outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64)
return outs
def process_response_dict(self, response_dict, stream, **kwargs):
"""
Postprocess the response
Args:
    response_dict (Dict): response from the engine, containing token id fields
Returns:
    Dict: response containing text fields
"""
enable_thinking = kwargs.pop("enable_thinking", True)
if enable_thinking is None:
enable_thinking = True
if stream:
return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs)
else:
return self.process_response_dict_normal(response_dict, enable_thinking=enable_thinking, **kwargs)
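# Hedged usage sketch (model path and message payload are illustrative
# assumptions). A `prompt` plus `multimodal_data` goes through text2ids, while
# `messages` goes through request2ids; the packed outputs land in
# request["multimodal_inputs"]:
#
# processor = ErnieMoEVLProcessor("/path/to/ernie-vl-model",
#                                 limit_mm_per_prompt={"image": 4})
# request = {
#     "request_id": "req-1",
#     "messages": [{
#         "role": "user",
#         "content": [
#             {"type": "text", "text": "Describe the picture."},
#             {"type": "image", "image": some_pil_image},
#         ],
#     }],
# }
# request = processor.process_request_dict(request, max_model_len=8192)
# request["multimodal_inputs"]  # input_ids / images / grid_thw / position_ids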
@@ -22,16 +22,16 @@ from typing import List, Optional, Union
import numpy as np
import paddle
import PIL
from paddlenlp.transformers.feature_extraction_utils import BatchFeature
from paddlenlp.transformers.image_processing_utils import BaseImageProcessor
from paddlenlp.transformers.image_transforms import (
from paddleformers.transformers.feature_extraction_utils import BatchFeature
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
from paddleformers.transformers.image_transforms import (
convert_to_rgb,
normalize,
rescale,
resize,
to_channel_dimension_format,
)
from paddlenlp.transformers.image_utils import (
from paddleformers.transformers.image_utils import (
ChannelDimension,
ImageInput,
PILImageResampling,
@@ -42,7 +42,7 @@ from paddlenlp.transformers.image_utils import (
to_numpy_array,
valid_images,
)
from paddlenlp.transformers.tokenizer_utils_base import (
from paddleformers.transformers.tokenizer_utils_base import (
TensorType,
)
from PIL import Image
@@ -326,7 +326,7 @@ class AdaptiveImageProcessor(BaseImageProcessor):
max_pixels=self.max_pixels,
)
image = image.astype("uint8") # TODO : 需要手动加上,否则多除255 导致结果会出错
# 直接fromarray,不要靠paddlenlp里面的
# 直接fromarray,不要靠paddleformers里面的
image = Image.fromarray(image)
image = resize(
image,
+132 -65
@@ -18,11 +18,12 @@
""" process.py """
import copy
import io
import os
from collections import defaultdict
from typing import Any, Dict, List, Union
import numpy as np
from paddlenlp.transformers.image_utils import ChannelDimension
from paddleformers.transformers.image_utils import ChannelDimension
from PIL import Image
@@ -31,6 +32,8 @@ from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcess
from .process_video import read_frames_decord, read_video_decord
from .utils.io_utils import RAW_IMAGE_DIR, get_downloadable
from .utils.render_timestamp import render_frame_timestamp
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
@@ -94,9 +97,11 @@ class DataProcessor:
video_max_frames: int = 180,
video_min_frames: int = 16,
video_fps: int = 2,
**kwargs
) -> None:
# Tokenizer and image preprocessor
self.tokenizer = ErnieVLTokenizer.from_pretrained(tokenizer_name, verbose=False)
self.model_name_or_path = tokenizer_name
self._load_tokenizer()
self.tokenizer.ignored_index = -100
self.image_preprocessor = AdaptiveImageProcessor.from_pretrained(image_preprocessor_name)
@@ -125,6 +130,8 @@ class DataProcessor:
self.video_start = self.VID_START
self.video_end = self.VID_END
self.image_patch_id = self.tokenizer.convert_tokens_to_ids("<|IMAGE_PLACEHOLDER|>")
self.image_start_id = self.tokenizer.convert_tokens_to_ids(self.image_start)
self.video_start_id = self.tokenizer.convert_tokens_to_ids(self.video_start)
self.token_type_mapping = self._build_token_type_mapping()
self.is_training = True
@@ -145,11 +152,12 @@ class DataProcessor:
"""Enable evaluation mode (doesn't produce labels)."""
self.is_training = False
def process(self, messages: List[Dict[str, Any]]) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
def text2ids(self, text, images=None, videos=None):
"""
Convert chat messages into model inputs.
Convert chat text into model inputs.
Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
"""
outputs = {
"input_ids": [],
"token_type_ids": [],
@@ -162,37 +170,94 @@ class DataProcessor:
"pic_cnt": 0,
"video_cnt": 0,
}
self._add_special_token(self.cls_token, outputs)
IMAGE_PLACEHOLDER = "<|image@placeholder|>"
VIDEO_PLACEHOLDER = "<|video@placeholder|>"
IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
st, image_idx, video_idx = 0, 0, 0
while st < len(text):
image_pos = text.find(IMAGE_PLACEHOLDER, st)
image_pos = len(text) if image_pos == -1 else image_pos
video_pos = text.find(VIDEO_PLACEHOLDER, st)
video_pos = len(text) if video_pos == -1 else video_pos
ed = min(image_pos, video_pos)
self._add_text(text[st:ed], outputs)
if ed == len(text):
break
if ed == image_pos:
self._add_image(images[image_idx], outputs)
image_idx += 1
st = ed + IMAGE_PLACEHOLDER_LEN
else:
item = videos[video_idx]
if isinstance(item, dict):
frames = self._load_and_process_video(item["video"], item)
else:
frames = self._load_and_process_video(item, {})
self._add_video(frames, outputs)
video_idx += 1
st = ed + VIDEO_PLACEHOLDER_LEN
return outputs
def request2ids(self, request: Dict[str, Any]) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
"""
Convert chat messages into model inputs.
Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
"""
outputs = {
"input_ids": [],
"token_type_ids": [],
"position_ids": [],
"images": [],
"grid_thw": [],
"image_type_ids": [],
"labels": [],
"cur_position": 0,
"pic_cnt": 0,
"video_cnt": 0,
}
messages = parse_chat_messages(request.get("messages"))
image_message_list = []
for msg in messages:
role = msg.get("role")
assert role in self.role_prefixes, f"Unsupported role: {role}"
prefix = self.role_prefixes[role]
if prefix:
self._add_text(prefix, outputs)
content_items = msg.get("content")
if not isinstance(content_items, list):
content_items = [content_items]
for item in content_items:
if isinstance(item, str) or item.get("type") == "text":
text = item if isinstance(item, str) else item.get("text", "")
self._add_text(text, outputs)
elif item.get("type") == "image_url" or item.get("type") == "image":
self._add_image(item, outputs)
elif item.get("type") == "video_url" or item.get("type") == "video":
self._add_video(item, outputs)
if role in ("user", "system"):
self._add_text("\n", outputs)
else:
self._add_special_token(self.sep_token, outputs)
if not self.is_training:
# Append assistant prefix in eval
self._add_text(self.role_prefixes["bot"], outputs)
if isinstance(item, dict) and item.get("type") in ["image", "video"]:
image_message_list.append(item)
prompt_token_ids = self.apply_chat_template(request)
image_start_index = 0
image_message_index = 0
for i in range(len(prompt_token_ids)):
if prompt_token_ids[i] in [self.image_start_id, self.video_start_id]:
self._add_text(prompt_token_ids[image_start_index:i + 1], outputs)
image_start_index = i + 1
image_message = image_message_list[image_message_index]
if image_message["type"] == "image":
img = image_message.get("image")
if img is None:
continue
outputs["pic_cnt"] += 1
self._add_image(img, outputs)
elif image_message["type"] == "video":
video_bytes = image_message.get("video")
if video_bytes is None:
continue
frames = self._load_and_process_video(video_bytes, image_message)
outputs["video_cnt"] += 1
self._add_video(frames, outputs)
image_message_index += 1
self._add_text(prompt_token_ids[image_start_index:], outputs)
return outputs
def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None:
@@ -203,8 +268,9 @@ class DataProcessor:
outputs["position_ids"].append([pos] * 3)
outputs["cur_position"] += 1
def _add_text(self, text: str, outputs: Dict) -> None:
tokens = self.tokenizer.encode(text, add_special_tokens=False)["input_ids"]
def _add_text(self, tokens, outputs: Dict) -> None:
if isinstance(tokens, str):
tokens = self.tokenizer.encode(tokens, add_special_tokens=False)["input_ids"]
outputs["input_ids"].extend(tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * len(tokens))
@@ -213,25 +279,7 @@ class DataProcessor:
outputs["position_ids"].append([start + i] * 3)
outputs["cur_position"] += len(tokens)
def _add_image(self, item: Dict, outputs: Dict) -> None:
url_info = item.get("image_url", {})
w = url_info.get("image_width", None)
h = url_info.get("image_height", None)
if "image" in item:
img = item["image"]
else:
url = url_info.get("url")
data = get_downloadable(url, download_dir=RAW_IMAGE_DIR, save_to_disk=False)
img = Image.open(io.BytesIO(data) if isinstance(data, bytes) else data)
if w and h:
img = img.resize((w, h))
outputs["pic_cnt"] += 1
self._add_text(f"Picture {outputs['pic_cnt']}:", outputs)
self._add_special_token(self.IMG_START, outputs)
def _add_image(self, img, outputs: Dict) -> None:
patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
img.height,
img.width,
@@ -260,21 +308,7 @@ class DataProcessor:
outputs["grid_thw"].append(ret["image_grid_thw"])
outputs["image_type_ids"].append(0)
self._add_special_token(self.IMG_END, outputs)
def _add_video(self, item: Dict, outputs: Dict) -> None:
url_info = item.get("video_url", {})
url = url_info.get("url")
outputs["video_cnt"] += 1
self._add_text(f"Video {outputs['video_cnt']}:", outputs)
self._add_special_token(self.VID_START, outputs)
if "video" in item:
video_path = item["video"]
frames = self._load_and_process_video(video_path, item)
else:
video_path = get_downloadable(url, save_to_disk=False)
frames = self._load_and_process_video(video_path, item)
def _add_video(self, frames, outputs: Dict) -> None:
patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
frames[0].height,
frames[0].width,
@@ -305,8 +339,6 @@ class DataProcessor:
outputs["position_ids"].extend(pos_ids)
outputs["cur_position"] = np.max(pos_ids) + 1
self._add_special_token(self.VID_END, outputs)
def _load_and_process_video(self, url: str, item: Dict) -> List[Image.Image]:
reader, meta, path = read_video_decord(url, save_to_disk=False)
@@ -386,3 +418,38 @@ class DataProcessor:
coords = list(zip(time_idx, h_idx, w_idx))
return [[start_idx + ti, start_idx + hi, start_idx + wi] for ti, hi, wi in coords]
def _load_tokenizer(self):
"""
Load the tokenizer (ErnieBotTokenizer) from the model path
and store it on self.tokenizer.
"""
vocab_file_names = ["tokenizer.model", "spm.model", "ernie_token_100k.model"]
for i in range(len(vocab_file_names)):
if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])):
ErnieBotTokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
break
self.tokenizer = ErnieBotTokenizer.from_pretrained(self.model_name_or_path)
def apply_chat_template(self, request):
"""
Convert multi-turn messages into ID sequences.
Args:
request: a request dict containing a 'messages' field
Returns:
    List[int]: the token IDs
"""
if self.tokenizer.chat_template is None:
raise ValueError("This model does not support chat_template.")
prompt_token_str = self.tokenizer.apply_chat_template(
request, tokenize=False, add_generation_prompt=request.get("add_generation_prompt", True)
).replace("<|image@placeholder|>", "").replace("<|video@placeholder|>", "")
tokens = self.tokenizer.tokenize(prompt_token_str)
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
return token_ids
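# Hedged sketch of the placeholder protocol (the prompt below is invented for
# illustration): in text2ids every "<|image@placeholder|>" or
# "<|video@placeholder|>" occurrence consumes the next entry of `images` /
# `videos` in order, while the surrounding text is tokenized as-is.
#
# outputs = processor.text2ids(
#     "Picture 1: <|image@placeholder|> What is shown here?",
#     images=[pil_image])
# outputs["input_ids"]       # text token ids + image patch token ids
# outputs["token_type_ids"]  # IDS_TYPE_FLAG["text"] / IDS_TYPE_FLAG["image"]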
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""
ErnieVLTokenizer
"""
@@ -25,12 +24,11 @@ from typing import Dict, List, Optional, Tuple
import numpy as np
import paddle
import sentencepiece as spm
from paddlenlp.transformers import PretrainedTokenizer
from paddlenlp.transformers.tokenizer_utils_base import (
PaddingStrategy,
TextInput,
)
from paddlenlp.utils.log import logger
from paddleformers.transformers import PretrainedTokenizer
from paddleformers.transformers.tokenizer_utils_base import (PaddingStrategy,
TextInput)
from fastdeploy.utils import console_logger as logger
class ErnieVLTokenizer(PretrainedTokenizer):
@@ -43,7 +41,9 @@ class ErnieVLTokenizer(PretrainedTokenizer):
pretrained_init_configuration = {
"ernie-bot-10b": {},
}
model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
model_input_names = [
"input_ids", "position_ids", "attention_mask", "labels"
]
padding_side = "right"
def __init__(
@@ -114,7 +114,10 @@ class ErnieVLTokenizer(PretrainedTokenizer):
def get_vocab(self):
"""doc"""
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder)
return vocab
@@ -157,7 +160,9 @@ class ErnieVLTokenizer(PretrainedTokenizer):
# logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
return super().prepare_for_model(*args, **kwargs)
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
def save_vocabulary(self,
save_directory,
filename_prefix: Optional[str] = None) -> Tuple[str]:
"""
Save the vocabulary and special tokens file to a directory.
Args:
@@ -167,19 +172,22 @@ class ErnieVLTokenizer(PretrainedTokenizer):
`Tuple(str)`: Paths to the files saved.
"""
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
logger.error(
f"Vocabulary path ({save_directory}) should be a directory")
return
out_vocab_file = os.path.join(
save_directory,
(filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
(filename_prefix + "-" if filename_prefix else "") +
self.resource_files_names["vocab_file"],
)
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
if os.path.abspath(self.vocab_file) != os.path.abspath(
out_vocab_file) and os.path.isfile(self.vocab_file):
copyfile(self.vocab_file, out_vocab_file)
elif not os.path.isfile(self.vocab_file):
with open(out_vocab_file, "wb") as fi:
content_spiece_model = self.sp_model.serialized_model_proto()
fi.write(content_spiece_model)
return (out_vocab_file,)
return (out_vocab_file, )
def tokenize(self, text: TextInput, **kwargs) -> List[str]:
"""
@@ -203,10 +211,13 @@ class ErnieVLTokenizer(PretrainedTokenizer):
if hasattr(self, "do_lower_case") and self.do_lower_case:
# convert non-special tokens to lowercase
escaped_special_toks = [
re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
re.escape(s_tok) for s_tok in (self.unique_no_split_tokens +
self.all_special_tokens)
]
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
text = re.sub(pattern,
lambda m: m.groups()[0] or m.groups()[1].lower(),
text)
no_split_token = set(self.unique_no_split_tokens)
tokens = self.tokens_trie.split(text)
@@ -248,19 +259,27 @@ class ErnieVLTokenizer(PretrainedTokenizer):
required_input = encoded_inputs[self.model_input_names[0]]
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
if "attention_mask" in encoded_inputs and encoded_inputs["attention_mask"] is not None:
if max_length is not None and pad_to_multiple_of is not None and (
max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) +
1) * pad_to_multiple_of
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(
required_input) != max_length
if "attention_mask" in encoded_inputs and encoded_inputs[
"attention_mask"] is not None:
attention_mask = encoded_inputs.pop("attention_mask")
if isinstance(attention_mask, paddle.Tensor):
attention_mask = attention_mask.numpy()
elif isinstance(attention_mask, list):
attention_mask = np.array(attention_mask)
elif not isinstance(attention_mask, np.ndarray):
raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ")
raise ValueError(
f"Unexpected type {type(attention_mask)} of attention_mask, "
)
else:
attention_mask = np.tril(np.ones((len(required_input), len(required_input)), dtype=np.int64))
attention_mask = np.tril(
np.ones((len(required_input), len(required_input)),
dtype=np.int64))
attention_mask = np.expand_dims(attention_mask, axis=0)
if needs_to_be_padded:
difference = max_length - len(required_input)
@@ -275,7 +294,8 @@ class ErnieVLTokenizer(PretrainedTokenizer):
else:
pad_width = [(0, 0), (difference, 0), (difference, 0)]
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
raise ValueError("Invalid padding strategy:" +
str(self.padding_side))
attention_mask = np.pad(
attention_mask,
pad_width=pad_width,
@@ -342,7 +362,8 @@ def add_special_tokens(
# check
first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"]
assert first_special_tokens[0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
assert first_special_tokens[
0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
assert (
len(tokenizer.get_vocab()) < special_token_ids_end
), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end} 增加过多special token了!"
+2 -2
@@ -78,13 +78,13 @@ class ImageMediaIO(MediaIO[Image.Image]):
"""
return self.load_bytes(base64.b64decode(data))
def load_file(self, filepath: Path) -> Image.Image:
def load_file(self, filepath: str) -> Image.Image:
"""
Load a file and convert it to the target mode.
Raises FileNotFoundError if the file does not exist or cannot be opened.
Args:
    filepath (Path): file path (a pathlib.Path object)
    filepath (str): file path.
Returns:
    Image.Image: an Image.Image object holding the loaded and converted image.
+37 -107
@@ -100,142 +100,72 @@ def sample_frames_from_video(frames: npt.NDArray,
return sampled_frames
class VideoMediaIO(MediaIO[npt.NDArray]):
class VideoMediaIO(MediaIO[bytes]):
def __init__(
self,
image_io: ImageMediaIO,
*,
num_frames: int = 32,
) -> None:
def __init__(self) -> None:
"""
Initialize a VideoMediaIO object.
Args:
    image_io (ImageMediaIO): ImageMediaIO object used to read and write images.
    num_frames (int, optional): number of video frames, defaults to 32.
        The ImageMediaIO object must support the given number of frames.
Raises:
    TypeError: if image_io is not an ImageMediaIO.
    ValueError: if num_frames is less than or equal to 0.
Returns:
    None: no return value; attributes are initialized and set directly.
"""
super().__init__()
self.image_io = image_io
self.num_frames = num_frames
def load_bytes(self, data: bytes) -> npt.NDArray:
def load_bytes(self, data: bytes) -> bytes:
"""
Load video frames from byte data and return a numpy ndarray.
If the byte data contains more frames than `num_frames`, distribute the sampling evenly across them; otherwise return all frames.
The ERNIE-45-VL preprocessing includes its own frame sampling; loading the video as an npt.NDArray would lose the FPS information, so for now the byte data is not modified at all.
Args:
    data (bytes): byte object containing the video frame data.
Returns:
    npt.NDArray, shape=(num_frames, height, width, channels): a numpy ndarray containing the video frame data.
    If `num_frames` is smaller than the total frame count, the first `num_frames` frames are returned; otherwise all frames.
Raises:
None.
"""
import decord
vr = decord.VideoReader(BytesIO(data), num_threads=1)
total_frame_num = len(vr)
num_frames = self.num_frames
if total_frame_num > num_frames:
uniform_sampled_frames = np.linspace(0,
total_frame_num - 1,
num_frames,
dtype=int)
frame_idx = uniform_sampled_frames.tolist()
else:
frame_idx = list(range(0, total_frame_num))
return vr.get_batch(frame_idx).asnumpy()
def load_base64(self, media_type: str, data: str) -> npt.NDArray:
"""
加载 base64 编码的数据,并返回 numpy ndarray。
Args:
media_type (str): 媒体类型,目前仅支持 "video/jpeg"
当为 "video/jpeg" 时,将解析每一帧的 base64 编码数据,并转换成 numpy ndarray。
data (str): base64 编码的字符串数据。
Returns:
npt.NDArray, optional: 如果 media_type 为 "video/jpeg",则返回 numpy ndarray 格式的视频数据;否则返回 None。
Raises:
None.
"""
if media_type.lower() == "video/jpeg":
load_frame = partial(
self.image_io.load_base64,
"image/jpeg",
)
return np.stack([
np.array(load_frame(frame_data))
for frame_data in data.split(",")
])
return self.load_bytes(base64.b64decode(data))
def load_file(self, filepath: Path) -> npt.NDArray:
"""
读取文件内容,并将其转换为numpy数组。
Args:
filepath (Path): 文件路径对象,表示要读取的文件。
Returns:
npt.NDArray, optional: 返回一个numpy数组,包含了文件内容。如果无法解析文件内容,则返回None。
bytes,字节数据原样返回
Raises:
无。
"""
with filepath.open("rb") as f:
data = f.read()
return data
return self.load_bytes(data)
def encode_base64(
self,
media: npt.NDArray,
*,
video_format: str = "JPEG",
) -> str:
def load_base64(self, media_type: str, data: str) -> bytes:
"""
将视频编码为Base64字符串,每一帧都是一个Base64字符串
如果视频格式为"JPEG",则每一帧都会被转换成JPEG图片并进行编码。
加载 base64 编码的数据,并返回bytes
Args:
media (npt.NDArray): 要编码的视频,形状为(H,W,C)或者(T,H,W,C),其中T为时间步长,H和W分别为高度和宽度,C为通道数
当前仅支持JPEG格式
video_format (str, optional, default="JPEG"): 视频格式,只支持"JPEG"。 Default to "JPEG".
Raises:
NotImplementedError: 当前仅支持JPEG格式。
media_type (str): 媒体类型,目前不支持 "video/jpeg"
data (str): base64 编码的字符串数据
Returns:
str: Base64字符串,每一帧都是一个Base64字符串,用","连接起来
bytes, optional: 如果 media_type 不为 "video/jpeg",则返回字节数据
Raises:
ValueError: 如果media_type是"video/jpeg"
"""
video = media
if media_type.lower() == "video/jpeg":
raise ValueError("Video in JPEG format is not supported")
if video_format == "JPEG":
encode_frame = partial(
self.image_io.encode_base64,
image_format=video_format,
)
return base64.b64decode(data)
return ",".join(
encode_frame(Image.fromarray(frame)) for frame in video)
def load_file(self, filepath: str) -> bytes:
"""
读取文件内容,并返回bytes。
Args:
filepath (str): 文件路径,表示要读取的文件。
Returns:
bytes, optional: 返回字节数据,包含了文件内容。
Raises:
无。
"""
with open(filepath, "rb") as f:
data = f.read()
msg = "Only JPEG format is supported for now."
raise NotImplementedError(msg)
return data
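A hypothetical usage sketch of the bytes pass-through class above (assuming the MediaIO base class is importable; the sample payload is made up, and only base64 round-tripping is exercised):

import base64

video_bytes = b"\x00\x00\x00\x18ftypmp42"  # stand-in for real MP4 bytes
b64 = base64.b64encode(video_bytes).decode()

io = VideoMediaIO()
assert io.load_bytes(video_bytes) == video_bytes        # returned unchanged
assert io.load_base64("video/mp4", b64) == video_bytes  # plain base64 decode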
+41 -3
@@ -13,8 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Any, Dict, Optional
from fastdeploy.engine.config import ModelConfig
from fastdeploy.reasoning import ReasoningParserManager
class InputPreprocessor:
"""
@@ -24,6 +27,9 @@ class InputPreprocessor:
key in the Hugging Face Transformers' model registry (https://huggingface.co/models).
The model will be downloaded from the Hugging Face model hub if necessary.
If a path is provided, the model will be loaded from that path.
reasoning_parser (str, optional):
    Reasoning parser type; specifies the parser used to extract
    reasoning content from the model output. Defaults to None.
enable_mm (bool, optional):
Whether to use the multi-modal model processor. Defaults to False.
@@ -32,15 +38,21 @@ class InputPreprocessor:
If the model name is not found in the Hugging Face Transformers' model registry and the path does not
exist.
"""
def __init__(
self,
model_name_or_path: str,
reasoning_parser: str = None,
limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
enable_mm: bool = False,
) -> None:
self.model_name_or_path = model_name_or_path
self.reasoning_parser = reasoning_parser
self.enable_mm = enable_mm
self.limit_mm_per_prompt = limit_mm_per_prompt
self.mm_processor_kwargs = mm_processor_kwargs
def create_processor(self):
"""
@@ -53,7 +65,33 @@ class InputPreprocessor:
Returns:
DataProcessor or MultiModalRegistry.Processor (Union[DataProcessor, MultiModalRegistry.Processor]): The data processor.
"""
reasoning_parser_obj = None
if self.reasoning_parser:
reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser(
self.reasoning_parser)
architectures = ModelConfig(self.model_name_or_path).architectures
if not self.enable_mm:
if "Ernie4_5_MoeForCausalLM" not in architectures \
and "Ernie4_5_ForCausalLM" not in architectures:
from fastdeploy.input.text_processor import DataProcessor
self.processor = DataProcessor(
model_name_or_path=self.model_name_or_path, reasoning_parser_obj=reasoning_parser_obj)
else:
from fastdeploy.input.ernie_processor import ErnieProcessor
self.processor = ErnieProcessor(
model_name_or_path=self.model_name_or_path, reasoning_parser_obj=reasoning_parser_obj)
else:
if not architectures.startswith(
"Ernie4_5_VLMoeForConditionalGeneration"):
raise ValueError(
f"Model {self.model_name_or_path} is not a valid Ernie4_5_VLMoe model."
)
else:
from fastdeploy.input.ernie_vl_processor import \
ErnieMoEVLProcessor
self.processor = ErnieMoEVLProcessor(
model_name_or_path=self.model_name_or_path,
limit_mm_per_prompt=self.limit_mm_per_prompt,
mm_processor_kwargs=self.mm_processor_kwargs,
reasoning_parser_obj=reasoning_parser_obj)
return self.processor
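The branching in create_processor() condenses to a small dispatch rule. A sketch with the architecture string as input (the function name is invented for illustration; the real code inspects ModelConfig and instantiates the processor classes directly):

def pick_processor(architectures: str, enable_mm: bool) -> str:
    # Text-only Ernie4_5 models get ErnieProcessor, other text models get
    # DataProcessor; multi-modal mode requires an Ernie4_5_VLMoe architecture.
    if not enable_mm:
        if ("Ernie4_5_MoeForCausalLM" in architectures
                or "Ernie4_5_ForCausalLM" in architectures):
            return "ErnieProcessor"
        return "DataProcessor"
    if not architectures.startswith("Ernie4_5_VLMoeForConditionalGeneration"):
        raise ValueError("not a valid Ernie4_5_VLMoe model")
    return "ErnieMoEVLProcessor"

assert pick_processor("Ernie4_5_MoeForCausalLM", enable_mm=False) == "ErnieProcessor"
assert pick_processor("LlamaForCausalLM", enable_mm=False) == "DataProcessor"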
+168 -67
@@ -14,15 +14,16 @@
# limitations under the License.
"""
import os
from abc import ABC, abstractmethod
import numpy as np
from paddleformers.generation import GenerationConfig
from paddleformers.transformers import Llama3Tokenizer, LlamaTokenizer
from fastdeploy import envs
from fastdeploy.utils import data_processor_logger
_SAMPLING_EPS = 1e-5
class BaseDataProcessor(ABC):
"""base class for data processor"""
@@ -51,6 +52,27 @@ class BaseDataProcessor(ABC):
f"mask_token is {self.tokenizer.mask_token}, {self.tokenizer.mask_token_id}"
))
def _apply_default_parameters(self, request):
"""
Apply default value for parameters in request
"""
def set_value(req, key, value):
value = getattr(self.generation_config, key, value)
if isinstance(req, dict):
if key not in req:
req[key] = value
else:
if req.get(key) is None:
req.set(key, value)
set_value(request, "top_p", 0.7)
set_value(request, "temperature", 1.0)
set_value(request, "repetition_penalty", 1.0)
set_value(request, "frequency_penalty", 0.0)
set_value(request, "presence_penalty", 0.0)
return request
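The precedence in _apply_default_parameters, sketched standalone: generation_config values override the hard-coded fallbacks, but keys already present on the request always win (plain dicts stand in for the real request and config objects):

fallbacks = {"top_p": 0.7, "temperature": 1.0, "repetition_penalty": 1.0}
generation_config = {"temperature": 0.8}  # hypothetical model config

request = {"top_p": 0.9}
for key, fallback in fallbacks.items():
    value = generation_config.get(key, fallback)  # config beats the fallback
    if key not in request:                        # request beats both
        request[key] = value

assert request == {"top_p": 0.9, "temperature": 0.8, "repetition_penalty": 1.0}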
@abstractmethod
def process_request(self, request, **kwargs):
"""
@@ -129,7 +151,7 @@ class BaseDataProcessor(ABC):
class DataProcessor(BaseDataProcessor):
def __init__(self, model_name_or_path, reasoning_parser_obj=None):
"""
Initializes the DataProcessor object.
@@ -145,6 +167,7 @@ class DataProcessor(BaseDataProcessor):
"""
self.model_name_or_path = model_name_or_path
self._init_config()
self.decode_status = dict()
@@ -154,12 +177,15 @@ class DataProcessor(BaseDataProcessor):
eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} "
)
from paddleformers.trl.llm_utils import get_eos_token_id
self.eos_token_ids = get_eos_token_id(self.tokenizer,
self.generation_config)
self.eos_token_id_len = len(self.eos_token_ids)
self.pad_token_id = self.get_pad_id()
self.reasoning_parser = None
if reasoning_parser_obj:
self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
self.tokenizer.pad_token_id = self.pad_token_id
def _init_config(self):
@@ -175,7 +201,7 @@ class DataProcessor(BaseDataProcessor):
Raises:
No exceptions are raised.
"""
self.use_hf_tokenizer = int(os.getenv("USE_HF_TOKENIZER", "0")) == 1
self.use_hf_tokenizer = int(envs.FD_USE_HF_TOKENIZER) == 1
# Generation config
try:
@@ -187,7 +213,7 @@ class DataProcessor(BaseDataProcessor):
)
self.generation_config = None
def process_request(self, request, max_model_len=None, **kwargs):
"""
Preprocess the request
@@ -198,6 +224,7 @@ class DataProcessor(BaseDataProcessor):
bool: Whether preprocessing is successful
str: error message
"""
request = self._apply_default_parameters(request)
if request.get("eos_token_ids") is None or len(
request.eos_token_ids) == 0:
request.eos_token_ids = self.eos_token_ids
@@ -217,20 +244,23 @@ class DataProcessor(BaseDataProcessor):
if self.tokenizer.chat_template is None:
raise ValueError(
"This model does not support chat_template.")
task = request.to_dict()
task['enable_thinking'] = kwargs.get("enable_thinking", True)
request.prompt_token_ids = self.messages2ids(task)
else:
raise ValueError(
f"The request should have `input_ids`, `text` or `messages`: {request}."
)
if max_model_len is not None and len(
request.prompt_token_ids) > max_model_len:
request.prompt_token_ids = request.prompt_token_ids[:max_model_len - 1]
if request.get("max_tokens") is None:
request.set("max_tokens",
max(1, max_model_len - len(request.prompt_token_ids)))
if request.get("temperature") < _SAMPLING_EPS:
# zero temperature is equivalent to greedy sampling
request.set("temperature", 1)
data_processor_logger.info(f"Processed request {request}")
return request
def process_request_dict(self, request, max_model_len=None, **kwargs):
"""
Preprocess the request
@@ -241,6 +271,7 @@ class DataProcessor(BaseDataProcessor):
bool: Whether preprocessing is successful
str: error message
"""
request = self._apply_default_parameters(request)
if not request.get('eos_token_ids'):
request['eos_token_ids'] = self.eos_token_ids
@@ -251,6 +282,7 @@ class DataProcessor(BaseDataProcessor):
request['stop_token_ids'] = stop_seqs
request['stop_seqs_len'] = stop_seqs_len
data_processor_logger.info(f"Processing request {request}")
# Build prompt_token_ids
if not request.get('prompt_token_ids'):
if 'prompt' in request:
@@ -261,19 +293,19 @@ class DataProcessor(BaseDataProcessor):
if self.tokenizer.chat_template is None:
raise ValueError(
"This model does not support chat_template.")
request['prompt_token_ids'] = self.messages2ids(request)
else:
raise ValueError(
f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}"
)
# Truncate prompts that exceed the length limit
if max_model_len is not None and len(
request['prompt_token_ids']) > max_model_len:
request['prompt_token_ids'] = request[
'prompt_token_ids'][:max_model_len - 1]
if request.get("max_tokens") is None:
request["max_tokens"] = max(
1, max_model_len - len(request['prompt_token_ids']))
if request.get("temperature") < _SAMPLING_EPS:
# zero temperature is equivalent to greedy sampling
request["temperature"] = 1
data_processor_logger.info(f"Processed request {request}")
return request
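Taken together, the guards in process_request_dict behave like this (a sketch over a plain dict, with a hypothetical eos id and model length):

_SAMPLING_EPS = 1e-5
eos_token_ids, max_model_len = [2], 8  # hypothetical values

request = {"prompt_token_ids": list(range(10)), "temperature": 0.0}
if not request.get("eos_token_ids"):
    request["eos_token_ids"] = eos_token_ids
if len(request["prompt_token_ids"]) > max_model_len:
    # Truncate to max_model_len - 1 to leave room for at least one new token.
    request["prompt_token_ids"] = request["prompt_token_ids"][:max_model_len - 1]
if request.get("max_tokens") is None:
    request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
if request.get("temperature") < _SAMPLING_EPS:
    request["temperature"] = 1  # zero temperature is treated as greedy sampling

assert request["max_tokens"] == 1 and request["temperature"] == 1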
def process_response(self, response_dict, **kwargs):
@@ -286,24 +318,26 @@ class DataProcessor(BaseDataProcessor):
Returns:
Dict: response contain text fields
"""
is_end = response_dict.finished
req_id = response_dict.request_id
token_ids = response_dict.outputs.token_ids
if token_ids[-1] == self.tokenizer.eos_token_id:
token_ids = token_ids[:-1]
full_text = self.tokenizer.decode(token_ids)
# The model supports reasoning and reasoning is enabled
if self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
full_text, response_dict)
response_dict.outputs.text = text
response_dict.outputs.reasoning_content = reasoning_content
else:
# The model does not support reasoning, and enable_thinking was not separately set to false
response_dict.outputs.text = full_text
data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
if is_end:
self.clear_request_status(req_id)
data_processor_logger.debug(
"Request id: {} has been completed.".format(token_ids))
response_dict.outputs.text = self.ids2tokens(token_ids, req_id)
self.clear_request_status(req_id)
return response_dict
def process_response_dict_normal(self, response_dict, **kwargs):
"""
Preprocess the response
@@ -313,24 +347,86 @@ class DataProcessor(BaseDataProcessor):
Returns:
Dict: response contain text fields
"""
token_ids = response_dict["outputs"]["token_ids"]
is_end = response_dict["finished"]
req_id = response_dict["request_id"]
if is_end and len(token_ids) > 0:
if token_ids[-1] == self.tokenizer.eos_token_id:
token_ids = token_ids[:-1]
delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
if is_end:
full_text = previous_texts + delta_text
if self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
full_text, response_dict)
response_dict["outputs"]["text"] = text
response_dict["outputs"][
"reasoning_content"] = reasoning_content
else:
response_dict["outputs"]["text"] = full_text
data_processor_logger.info(
f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}"
)
del self.decode_status[req_id]
return response_dict
def process_response_dict_streaming(self, response_dict, **kwargs):
"""
Preprocess the response
Args:
response_dict (Dict): response for engine, contain ids fields
Returns:
Dict: response contain text fields
"""
enable_thinking = kwargs.get("enable_thinking")
is_end = response_dict["finished"]
req_id = response_dict["request_id"]
token_ids = response_dict["outputs"]["token_ids"]
if is_end and len(token_ids) > 0:
if token_ids[-1] == self.tokenizer.eos_token_id:
token_ids = token_ids[:-1]
delta_text, previous_token_ids, previous_texts = self.ids2tokens(
token_ids, req_id)
if enable_thinking and self.reasoning_parser:
reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming(
previous_texts, previous_texts + delta_text, delta_text,
previous_token_ids, previous_token_ids + token_ids, token_ids)
response_dict["outputs"]["text"] = text
response_dict["outputs"]["reasoning_content"] = reasoning_content
else:
response_dict["outputs"]["text"] = self.ids2tokens(
token_ids, req_id)
response_dict["outputs"]["text"] = delta_text
if is_end:
data_processor_logger.info(
f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}"
)
del self.decode_status[req_id]
return response_dict
def process_response_dict(self, response_dict, **kwargs):
"""
Preprocess the response
Args:
response_dict (Dict): response for engine, contain ids fields
Returns:
Dict: response contain text fields
"""
enable_thinking = kwargs.pop("enable_thinking", True)
if enable_thinking is None:
enable_thinking = True
stream = kwargs.get("stream", True)
if stream:
return self.process_response_dict_streaming(
response_dict, enable_thinking=enable_thinking, **kwargs)
else:
return self.process_response_dict_normal(
response_dict=response_dict, enable_thinking=enable_thinking)
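Call-site sketch: a streaming consumer feeds each partial engine response through process_response_dict and reads only the delta text (the response dict below is hypothetical, and the call is shown commented because it needs a constructed processor):

step = {
    "request_id": "req-0",
    "finished": False,
    "outputs": {"token_ids": [42]},
}
# out = processor.process_response_dict(step, stream=True, enable_thinking=False)
# out["outputs"]["text"] then holds only the newly decoded delta text;
# stream=False would instead decode the accumulated full text at the end.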
def text2ids(self, text, max_model_len, raw_request=True):
"""
text to token ids
@@ -349,28 +445,20 @@ class DataProcessor(BaseDataProcessor):
truncation=True,
)
else:
text = [text] if isinstance(text, str) else text
tokens = self.tokenizer(
text,
return_tensors="np",
padding=True,
truncation=True,
max_length=max_model_len,
add_special_tokens=False,
)
return tokens["input_ids"][0]
def messages2ids(self, request):
"""
Convert multi-turn messages into ID sequences.
@@ -380,9 +468,21 @@ class DataProcessor(BaseDataProcessor):
Returns:
List[int]: ID sequences
"""
spliced_message = self.tokenizer.apply_chat_template(
request,
tokenize=False,
split_special_tokens=False,
add_special_tokens=False,
return_tensors="pd")
req_id = None
tokens = self.tokenizer.tokenize(spliced_message)
if isinstance(request, dict):
req_id = request.get("request_id", None)
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
data_processor_logger.info(
f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
return token_ids
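The two-step encode above (tokenize, then convert_tokens_to_ids) in a toy standalone form; real tokenizers expose the same pair of calls, here mimicked with a made-up vocabulary:

vocab = {"Hello": 0, "world": 1}  # toy vocabulary

def tokenize(text):
    return text.split()

def convert_tokens_to_ids(tokens):
    return [vocab[t] for t in tokens]

assert convert_tokens_to_ids(tokenize("Hello world")) == [0, 1]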
def ids2tokens(self, token_id, task_id):
"""
@@ -417,18 +517,20 @@ class DataProcessor(BaseDataProcessor):
else:
if task_id not in self.decode_status:
# prefix offset & read offset & history token ids & history token strings
self.decode_status[task_id] = [0, 0, [], ""]
prefix_offset = self.decode_status[task_id][0]
read_offset = self.decode_status[task_id][1]
previous_token_ids = self.decode_status[task_id][2]
previous_texts = self.decode_status[task_id][3]
decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(
previous_token_ids + token_id, prefix_offset, read_offset)
self.decode_status[task_id][0] = prefix_offset
self.decode_status[task_id][1] = read_offset
self.decode_status[task_id][2] += token_id
self.decode_status[task_id][3] += decode_str
return decode_str, previous_token_ids, previous_texts
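The per-task decode state is a four-slot list: [prefix_offset, read_offset, previous_token_ids, previous_texts]. A sketch of one update step, with stand-in values for the decode_token() output:

decode_status = {}
task_id = "req-0"
if task_id not in decode_status:
    decode_status[task_id] = [0, 0, [], ""]  # history text is now a str, not a list

new_token_ids, decode_str = [5, 7], "foo"  # stand-ins for one decode step
decode_status[task_id][2] += new_token_ids  # history token ids grow
decode_status[task_id][3] += decode_str     # history text is concatenated

assert decode_status[task_id][2:] == [[5, 7], "foo"]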
def _load_tokenizer(self):
"""
@@ -437,13 +539,12 @@ class DataProcessor(BaseDataProcessor):
Returns:
tokenizer (AutoTokenizer)
"""
if self.use_hf_tokenizer:
from transformers import AutoTokenizer
return AutoTokenizer.from_pretrained(self.model_name_or_path,
use_fast=False)
else:
from paddleformers.transformers import AutoTokenizer
return AutoTokenizer.from_pretrained(self.model_name_or_path,
padding_side="left",
use_fast=True)