mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-24 01:29:57 +08:00
Sync v2.0 version of code to github repo
This commit is contained in:
@@ -0,0 +1,444 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
from paddleformers.generation import GenerationConfig
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
|
||||
|
||||
from fastdeploy.input.text_processor import BaseDataProcessor
|
||||
|
||||
_SAMPLING_EPS = 1e-5
|
||||
|
||||
class ErnieProcessor(BaseDataProcessor):
    """
    Data processor for ERNIE models: tokenizes incoming requests and decodes
    engine responses (optionally splitting out reasoning content).

    Args:
        model_name_or_path (str): Model name or local path.
        reasoning_parser_obj: Optional reasoning-parser class; when given it is
            instantiated with the tokenizer.

    Attributes:
        model_name_or_path (str): Stored model name or path.
        decode_status (dict): Per-request incremental decoding state.
        tokenizer (object): Tokenizer instance.
        eos_token_ids (list): Token IDs that terminate generation.
        eos_token_id_len (int): Length of ``eos_token_ids``.
        pad_token_id (int): Padding token ID.
    """

    def __init__(self, model_name_or_path, reasoning_parser_obj=None):
        self.model_name_or_path = model_name_or_path
        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
        self._init_config()

        self.decode_status = dict()
        self.thinking_parser_dict = dict()
        self._load_tokenizer()
        data_processor_logger.info(
            f"tokenizer information: bos_token is {self.tokenizer.bos_token} \
            {self.tokenizer.bos_token_id}, \
            eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} "
        )
        self.eos_token_ids = [self.tokenizer.eos_token_id]
        self.eos_token_id_len = len(self.eos_token_ids)
        self.pad_token_id = self.get_pad_id()
        self.reasoning_parser = None
        if reasoning_parser_obj:
            self.reasoning_parser = reasoning_parser_obj(self.tokenizer)

    def _init_config(self):
        """Read environment switches and load the optional generation config."""
        self.use_hf_tokenizer = int(envs.FD_USE_HF_TOKENIZER) == 1

        # Generation config is optional; fall back to None when it is missing
        # or malformed.
        try:
            self.generation_config = GenerationConfig.from_pretrained(
                self.model_name_or_path)
        except Exception as e:
            data_processor_logger.warning(
                f"Can't find generation config, so it will not use "
                f"generation_config field in the model config, details={e}")
            self.generation_config = None

    def process_request(self, request, max_model_len=None, **kwargs):
        """
        Preprocess a request object (attribute-style access).

        Args:
            request: request object; may carry prompt/messages fields.
            max_model_len (int, optional): model context length; prompts longer
                than this are truncated and used to derive ``max_tokens``.

        Returns:
            The processed request object.

        Raises:
            ValueError: when neither token ids, prompt nor messages is given.
        """
        request = self._apply_default_parameters(request)
        if request.get("eos_token_ids") is None or len(
                request.eos_token_ids) == 0:
            request.eos_token_ids = self.eos_token_ids
        stop_sequences = request.get("stop", [])
        if stop_sequences is not None and len(stop_sequences) != 0:
            stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
            request.set("stop_token_ids", stop_seqs)
            request.set("stop_seqs_len", stop_seqs_len)

        if request.prompt_token_ids is None or len(
                request.prompt_token_ids) == 0:
            system = request.get("system")
            if request.prompt is None and request.messages is None:
                raise ValueError(
                    f"The request should have `input_ids`, `text` or `messages`: {request}.")
            if request.prompt is not None or not request.raw_request:
                prompt = request.prompt if request.prompt is not None else request.messages[0]
                prompt = prompt[0] if isinstance(prompt, list) else prompt
                tokens = self.tokenizer.tokenize(prompt)
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                request.prompt_token_ids = token_ids
                data_processor_logger.info(f"req_id:{request.request_id}, tokens:{tokens}, token_ids: {token_ids}")
            else:
                request.prompt_token_ids = self.messages2ids(request.to_dict())

        # Truncate over-long prompts and derive a default max_tokens budget.
        if max_model_len is not None and len(
                request.prompt_token_ids) > max_model_len:
            request.prompt_token_ids = request.prompt_token_ids[:max_model_len - 1]
        if request.get("max_tokens") is None:
            request.set("max_tokens",
                        max(1, max_model_len - len(request.prompt_token_ids)))
        if request.get("temperature") < _SAMPLING_EPS:
            # zero temperature is equivalent to greedy sampling
            request.set("temperature", 1)
        data_processor_logger.info(f"Processed request {request}")
        return request

    def process_request_dict(self, request, max_model_len=None):
        """
        Preprocess a plain-dict request (same pipeline as ``process_request``).

        Args:
            request (Dict): may contain text and messages fields.
            max_model_len (int, optional): model context length used for
                truncation and the default ``max_tokens``.

        Returns:
            Dict: the processed request.

        Raises:
            ValueError: when neither token ids, prompt nor messages is given.
        """
        request = self._apply_default_parameters(request)
        if not request.get('eos_token_ids'):
            request['eos_token_ids'] = self.eos_token_ids
        # Handle stop sequences.
        stop_sequences = request.get('stop', [])
        if stop_sequences:
            stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
            request['stop_token_ids'] = stop_seqs
            request['stop_seqs_len'] = stop_seqs_len

        system = request.get("system")
        # Build prompt_token_ids from either the raw prompt or chat messages.
        if not request.get('prompt_token_ids'):
            if request.get('prompt') is None and request.get(
                    'messages') is None:
                raise ValueError(
                    f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}"
                )
            if request.get('prompt'):
                prompt = request.get('prompt')
                prompt = prompt[0] if isinstance(prompt, list) else prompt
                tokens = self.tokenizer.tokenize(prompt)
                token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                request['prompt_token_ids'] = token_ids
                req_id = request.get("request_id", None)
                data_processor_logger.info(
                    f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}"
                )
            else:
                request['prompt_token_ids'] = self.messages2ids(request)

        # Truncate prompts that exceed the model context length.
        if max_model_len is not None and len(
                request['prompt_token_ids']) > max_model_len:
            request['prompt_token_ids'] = request[
                'prompt_token_ids'][:max_model_len - 1]
        if request.get("max_tokens") is None:
            request["max_tokens"] = max(
                1, max_model_len - len(request['prompt_token_ids']))
        if request.get("temperature") < _SAMPLING_EPS:
            # zero temperature is equivalent to greedy sampling
            request["temperature"] = 1
        data_processor_logger.info(f"Processed request {request}")

        return request

    def process_response(self, response_dict, **kwargs):
        """
        Postprocess a finished engine response: strip a trailing EOS token,
        decode the full text and optionally split out reasoning content.

        Args:
            response_dict: engine response object with ``outputs.token_ids``.

        Returns:
            The response with text fields filled in, or None when the output
            is entirely empty.
        """
        req_id = response_dict.request_id
        token_ids = response_dict.outputs.token_ids

        response_dict.usage = {
            "completion_tokens": response_dict.outputs.index + 1
        }
        if token_ids[-1] == self.tokenizer.eos_token_id:
            token_ids = token_ids[:-1]
        full_text = self.tokenizer.decode(token_ids)
        if self.reasoning_parser:
            reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
                full_text, response_dict)
            response_dict.outputs.text = text
            response_dict.outputs.reasoning_content = reasoning_content
        else:
            response_dict.outputs.text = full_text
        # Log-message typo fixed: "token)ids" -> "token_ids".
        data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
        if response_dict.outputs.text == "" and \
                response_dict.outputs.reasoning_content == "" and \
                response_dict.outputs.tool_call_content == []:
            return None
        return response_dict

    def process_response_dict(self, response_dict, stream, **kwargs):
        """
        Dispatch response postprocessing to the streaming or normal path.

        Args:
            response_dict (Dict): response for engine, contains ids fields.
            stream (bool): whether the caller consumes a token stream.

        Returns:
            Dict: response containing text fields.
        """
        if stream:
            return self.process_response_dict_streaming(
                response_dict, **kwargs)
        else:
            return self.process_response_dict_normal(response_dict, **kwargs)

    def process_response_dict_normal(self, response_dict, **kwargs):
        """
        Postprocess a non-streaming dict response: incrementally decode, and on
        the final chunk assemble the full text (with optional reasoning split).

        Args:
            response_dict (Dict): response for engine, contains ids fields.

        Returns:
            Dict: response containing text fields.
        """
        token_ids = response_dict["outputs"]["token_ids"]
        is_end = response_dict["finished"]
        req_id = response_dict["request_id"]
        if is_end and len(token_ids) > 0:
            if token_ids[-1] == self.tokenizer.eos_token_id:
                token_ids = token_ids[:-1]
        delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
        if is_end:
            full_text = previous_texts + delta_text
            if self.reasoning_parser:
                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
                    full_text, response_dict)
                response_dict["outputs"]["text"] = text
                response_dict["outputs"][
                    "reasoning_content"] = reasoning_content
            else:
                response_dict["outputs"]["text"] = full_text
            data_processor_logger.info(
                f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}"
            )
            # Release per-request decode state once the request finishes.
            del self.decode_status[req_id]
        return response_dict

    def process_response_dict_streaming(self, response_dict, **kwargs):
        """
        Postprocess one streaming chunk: decode the delta text and, when
        thinking mode is enabled, split reasoning content incrementally.

        Args:
            response_dict (Dict): response for engine, contains ids fields.

        Returns:
            Dict: response containing text fields.
        """
        enable_thinking = kwargs.get("enable_thinking")
        is_end = response_dict["finished"]
        req_id = response_dict["request_id"]
        token_ids = response_dict["outputs"]["token_ids"]

        if is_end and len(token_ids) > 0:
            if token_ids[-1] == self.tokenizer.eos_token_id:
                token_ids = token_ids[:-1]
        delta_text, previous_token_ids, previous_texts = self.ids2tokens(
            token_ids, req_id)
        if enable_thinking and self.reasoning_parser:
            reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming(
                previous_texts, previous_texts + delta_text, delta_text,
                previous_token_ids, previous_token_ids + token_ids, token_ids)
            response_dict["outputs"]["text"] = text
            response_dict["outputs"]["reasoning_content"] = reasoning_content
        else:
            response_dict["outputs"]["text"] = delta_text
        if is_end:
            data_processor_logger.info(
                f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}"
            )
            # Release per-request decode state once the request finishes.
            del self.decode_status[req_id]
        return response_dict

    def messages2ids(self, request_or_messages):
        """
        Convert multi-turn messages into ID sequences.

        Args:
            request_or_messages: Either a request dict containing 'messages'
                field, or a list of message dicts directly.

        Returns:
            List of token IDs.

        Raises:
            ValueError: when the tokenizer has no chat template.
        """
        if self.tokenizer.chat_template is None:
            raise ValueError("This model does not support chat_template.")
        spliced_message = self.tokenizer.apply_chat_template(
            request_or_messages,
            tokenize=False,
            split_special_tokens=False,
            add_special_tokens=False)

        req_id = None
        if isinstance(request_or_messages, dict):
            req_id = request_or_messages.get("request_id", None)
        tokens = self.tokenizer.tokenize(spliced_message)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        data_processor_logger.info(
            f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
        return token_ids

    def ids2tokens(self, token_id, task_id):
        """
        Incrementally decode token ids to strings, keeping per-task state.

        Args:
            token_id (List[int]): the new token ids for this step.
            task_id (str): task id used to key the decode state.

        Returns:
            Tuple of (newly decoded text, previous token ids, previous texts).
        """
        if task_id not in self.decode_status:
            # prefix offset & read offset & history token ids & history token strings
            self.decode_status[task_id] = [0, 0, [], ""]

        prefix_offset = self.decode_status[task_id][0]
        read_offset = self.decode_status[task_id][1]
        previous_token_ids = self.decode_status[task_id][2]
        previous_texts = self.decode_status[task_id][3]
        decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(
            previous_token_ids + token_id, prefix_offset, read_offset)
        self.decode_status[task_id][0] = prefix_offset
        self.decode_status[task_id][1] = read_offset
        self.decode_status[task_id][2] += token_id
        self.decode_status[task_id][3] += decode_str

        return decode_str, previous_token_ids, previous_texts

    def _load_tokenizer(self):
        """
        Load the ErnieBotTokenizer, pointing it at whichever known vocab file
        exists under ``model_name_or_path``.

        Returns:
            tokenizer (ErnieBotTokenizer)
        """
        vocab_file_names = [
            "tokenizer.model", "spm.model", "ernie_token_100k.model"
        ]
        # Pick the first vocab file that actually exists on disk.
        for vocab_file_name in vocab_file_names:
            if os.path.exists(
                    os.path.join(self.model_name_or_path, vocab_file_name)):
                ErnieBotTokenizer.resource_files_names[
                    "vocab_file"] = vocab_file_name
                break
        self.tokenizer = ErnieBotTokenizer.from_pretrained(
            self.model_name_or_path)

    def get_pad_id(self):
        """
        Get pad_token_id; if there is no pad_token_id, use eos_token.

        Returns:
            int: pad_token_id
        """
        # if isinstance(self.tokenizer, (LlamaTokenizer, Llama3Tokenizer)) and not self.tokenizer.pad_token_id:
        #     return self.tokenizer.eos_token
        return self.tokenizer.pad_token_id

    def pad_batch_data(self,
                       insts,
                       pad_id=0,
                       return_seq_len=False,
                       return_array=True,
                       pad_style="right"):
        """Pad the instances to the max sequence length in batch."""
        if len(insts) == 0:
            padded_insts = np.array([[]],
                                    dtype=np.int64) if return_array else [[]]
            if return_seq_len:
                seq_len = np.array([], dtype=np.int64) if return_array else []
                return padded_insts, seq_len
            return padded_insts

        max_len = max(map(len, insts))
        if pad_style == "left":
            padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst)
                            for inst in insts]
        else:
            padded_insts = [
                list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts
            ]
        if return_array:
            padded_insts = np.array(padded_insts,
                                    dtype=np.int64).reshape([-1, max_len])

        if return_seq_len:
            seq_len = [len(inst) for inst in insts]
            if return_array:
                seq_len = np.array(seq_len, dtype=np.int64).reshape(-1, 1)
            return padded_insts, seq_len
        return padded_insts

    def update_stop_seq(self, stop_sequences):
        """
        Update stop sequences from request: tokenize each sequence and pad the
        batch with -1 so the engine can consume a rectangular array.
        """
        stop_seqs = []
        for seq in stop_sequences:
            if seq != self.tokenizer.eos_token_id:
                stop_seqs.append(
                    self.tokenizer.convert_tokens_to_ids(
                        self.tokenizer.tokenize(seq)))
        stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs,
                                                       pad_id=-1,
                                                       return_seq_len=True,
                                                       return_array=False)
        data_processor_logger.debug(
            f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}")
        return stop_seqs, stop_seqs_len
|
||||
+285
-170
@@ -14,159 +14,156 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
# cipher_token=WjI1fQOvhN # do not edit this line
|
||||
|
||||
import os
|
||||
import re
|
||||
from shutil import copyfile
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from typing import Dict, Optional, Tuple, List
|
||||
import numpy as np
|
||||
import sentencepiece as spm
|
||||
from paddlenlp.transformers import AddedToken, PretrainedTokenizer
|
||||
from paddlenlp.utils import logger
|
||||
|
||||
__all__ = ["ErnieBotTokenizer"]
|
||||
import paddle
|
||||
|
||||
VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
|
||||
|
||||
PRETRAINED_VOCAB_FILES_MAP = {
|
||||
"vocab_file": {},
|
||||
"tokenizer_file": {},
|
||||
}
|
||||
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
|
||||
from paddleformers.utils.log import logger
|
||||
from paddleformers.transformers import PretrainedTokenizer
|
||||
from paddleformers.transformers.tokenizer_utils_base import (
|
||||
PaddingStrategy,
|
||||
TextInput,
|
||||
)
|
||||
|
||||
|
||||
class ErnieBotTokenizer(PretrainedTokenizer):
|
||||
"""
|
||||
Construct a ErnieBot tokenizer. Based on byte-level Byte-Pair-Encoding.
|
||||
Args:
|
||||
vocab_file (`str`):
|
||||
Path to the vocabulary file.
|
||||
一个更好用的 `ErnieBotToknizer`,
|
||||
能 encode 目前 sft/ppo 阶段的特殊token,也支持多模态。
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
resource_files_names = VOCAB_FILES_NAMES
|
||||
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
|
||||
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
|
||||
model_input_names = ["input_ids", "attention_mask"]
|
||||
resource_files_names = {
|
||||
"vocab_file": "tokenizer.model",
|
||||
}
|
||||
pretrained_resource_files_map = {"vocab_file": {"ernie-bot-10b": None}}
|
||||
pretrained_init_configuration = {
|
||||
"ernie-bot-10b": {},
|
||||
}
|
||||
model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
|
||||
padding_side = "right"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_file,
|
||||
unk_token="<unk>",
|
||||
bos_token="<s>",
|
||||
cls_token="<cls>",
|
||||
eos_token="</s>",
|
||||
mask_token="<mask:0>",
|
||||
pad_token="<pad>",
|
||||
sp_model_kwargs: Optional[Dict[str, Any]] = None,
|
||||
add_bos_token=True,
|
||||
add_eos_token=False,
|
||||
clean_up_tokenization_spaces=False,
|
||||
sep_token="<sep>",
|
||||
unk_token="<unk>",
|
||||
additional_special_tokens=None,
|
||||
verbose=False,
|
||||
**kwargs,
|
||||
):
|
||||
self.vocab_file = vocab_file
|
||||
self.add_bos_token = add_bos_token
|
||||
self.add_eos_token = add_eos_token
|
||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
||||
self.sp_model.Load(vocab_file)
|
||||
bos_token = AddedToken(bos_token,
|
||||
lstrip=False, rstrip=False) if isinstance(
|
||||
bos_token, str) else bos_token
|
||||
eos_token = AddedToken(eos_token,
|
||||
lstrip=False, rstrip=False) if isinstance(
|
||||
eos_token, str) else eos_token
|
||||
unk_token = AddedToken(unk_token,
|
||||
lstrip=False, rstrip=False) if isinstance(
|
||||
unk_token, str) else unk_token
|
||||
pad_token = AddedToken(pad_token,
|
||||
lstrip=False, rstrip=False) if isinstance(
|
||||
pad_token, str) else pad_token
|
||||
"""doc"""
|
||||
if additional_special_tokens is None:
|
||||
additional_special_tokens = ["<mask:1>", "<mask:7>"]
|
||||
super().__init__(
|
||||
bos_token=bos_token,
|
||||
cls_token=cls_token,
|
||||
eos_token=eos_token,
|
||||
unk_token=unk_token,
|
||||
mask_token=mask_token,
|
||||
pad_token=pad_token,
|
||||
add_bos_token=add_bos_token,
|
||||
add_eos_token=add_eos_token,
|
||||
sep_token=sep_token,
|
||||
unk_token=unk_token,
|
||||
additional_special_tokens=additional_special_tokens,
|
||||
verbose=False,
|
||||
sp_model_kwargs=self.sp_model_kwargs,
|
||||
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||
**kwargs,
|
||||
)
|
||||
# for eb35 reader
|
||||
self.bos_id = self.bos_token_id
|
||||
self.eos_id = self.eos_token_id
|
||||
self.sep_id = self.sep_token_id
|
||||
self.pad_id = self.pad_token_id
|
||||
self.unk_id = self.unk_token_id
|
||||
self.vocab_file = vocab_file
|
||||
self.sp_model = spm.SentencePieceProcessor()
|
||||
self.sp_model.Load(vocab_file)
|
||||
|
||||
def __getstate__(self):
|
||||
state = self.__dict__.copy()
|
||||
state["sp_model"] = None
|
||||
return state
|
||||
@property
def space_token(self):
    """Literal placeholder token used to represent a space."""
    return "<mask:1>"
|
||||
|
||||
def __setstate__(self, d):
    """Pickle support: restore attributes and re-load the SentencePiece model."""
    self.__dict__ = d
    # sp_model was nulled by __getstate__; rebuild it from the vocab file.
    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(self.vocab_file)
|
||||
@property
def space_token_id(self):
    """Vocabulary id of the space placeholder token."""
    return self.sp_model.piece_to_id("<mask:1>")

@property
def gend_token(self):
    """Literal gender-mask placeholder token."""
    return "<mask:7>"

@property
def gend_token_id(self):
    """Vocabulary id of the gender-mask placeholder token."""
    return self.sp_model.piece_to_id("<mask:7>")

@property
def im_start_id(self):
    """Vocabulary id of the chat-turn start marker."""
    return self.sp_model.piece_to_id("<|im_start|>")

@property
def im_end_id(self):
    """Vocabulary id of the chat-turn end marker."""
    return self.sp_model.piece_to_id("<|im_end|>")
|
||||
|
||||
@property
def vocab_size(self):
    """Return the size of the SentencePiece vocabulary."""
    # NOTE(review): removed unreachable merge residue that followed this
    # return (a stray doc literal and `return self.sp_model.vocab_size()`).
    return self.sp_model.get_piece_size()
|
||||
|
||||
def get_vocab(self):
    """Return the full vocabulary (including added tokens) as token -> id.

    NOTE(review): the source built the identical id->token mapping twice
    (diff residue); only one construction is kept.
    """
    vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
    vocab.update(self.added_tokens_encoder)
    return vocab
|
||||
|
||||
def tokenize(self, text):
|
||||
"""Returns a tokenized string."""
|
||||
return self._tokenize(text)
|
||||
|
||||
def _tokenize(self, text):
|
||||
"""Returns a tokenized string."""
|
||||
return self.sp_model.encode(text, out_type=str)
|
||||
|
||||
def decode(self,
           tokens,
           skip_special_tokens=False,
           clean_up_tokenization_spaces=False):
    """Decode SentencePiece ``tokens`` back into a string.

    ``skip_special_tokens`` and ``clean_up_tokenization_spaces`` are accepted
    for interface compatibility but are not forwarded to SentencePiece.

    NOTE(review): removed unreachable merge residue after the return, which
    referenced an undefined name ``text``.
    """
    return self.sp_model.decode(tokens)
|
||||
|
||||
def _convert_token_to_id(self, token):
|
||||
"""Converts a token (str) in an id using the vocab."""
|
||||
"""doc"""
|
||||
return self.sp_model.piece_to_id(token)
|
||||
|
||||
def _convert_id_to_token(self, index):
|
||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||
token = self.sp_model.IdToPiece(index)
|
||||
return token
|
||||
def _convert_id_to_token(self, id):
|
||||
"""doc"""
|
||||
return self.sp_model.id_to_piece(id)
|
||||
|
||||
def convert_tokens_to_string(self, tokens):
    """Converts a sequence of tokens (strings) into a single string.

    Special tokens are emitted verbatim (never run through SentencePiece);
    accumulated ordinary pieces are decoded in one batch at each special-token
    boundary.

    NOTE(review): the source interleaved two versions of this loop (diff
    residue); this reconstruction keeps the fully-specified variant that
    tracks ``prev_is_special`` and inserts a separating space — confirm
    against upstream.
    """
    current_sub_tokens = []
    out_string = ""
    prev_is_special = False
    for i, token in enumerate(tokens):
        # make sure that special tokens are not decoded using sentencepiece model
        if token in self.all_special_tokens:
            if not prev_is_special and i != 0:
                out_string += " "
            out_string += self.sp_model.decode(current_sub_tokens) + token
            prev_is_special = True
            current_sub_tokens = []
        else:
            current_sub_tokens.append(token)
            prev_is_special = False
    out_string += self.sp_model.decode(current_sub_tokens)
    return out_string
|
||||
|
||||
def save_vocabulary(self,
|
||||
save_directory,
|
||||
filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||
def prepare_for_model(self, *args, **kwargs):
    """Strip the unsupported ``add_special_tokens`` kwarg, then defer to the base class."""
    kwargs.pop("add_special_tokens", None)
    return super().prepare_for_model(*args, **kwargs)
|
||||
|
||||
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Save the vocabulary and special tokens file to a directory.

    Args:
        save_directory (str): target directory (must already exist).
        filename_prefix (str, optional): prefix prepended to the file name.

    Returns:
        `Tuple(str)`: Paths to the files saved.

    NOTE(review): the source interleaved old and new versions of this method
    (duplicate error-log and path-construction lines); the variant using
    ``self.resource_files_names`` is kept.
    """
    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
        return
    out_vocab_file = os.path.join(
        save_directory,
        (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
    )
    if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
        copyfile(self.vocab_file, out_vocab_file)
    elif not os.path.isfile(self.vocab_file):
        # No file on disk: serialize the in-memory SentencePiece model instead.
        with open(out_vocab_file, "wb") as fi:
            content_spiece_model = self.sp_model.serialized_model_proto()
            fi.write(content_spiece_model)
    return (out_vocab_file,)
|
||||
|
||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """Wrap one or two id sequences with the configured BOS/EOS tokens."""
    prefix = [self.bos_token_id] if self.add_bos_token else []
    suffix = [self.eos_token_id] if self.add_eos_token else []

    combined = prefix + token_ids_0 + suffix
    if token_ids_1 is not None:
        combined = combined + prefix + token_ids_1 + suffix
    return combined
|
||||
|
||||
def get_special_tokens_mask(
|
||||
self,
|
||||
token_ids_0: List[int],
|
||||
token_ids_1: Optional[List[int]] = None,
|
||||
already_has_special_tokens: bool = False) -> List[int]:
|
||||
def tokenize(self, text: TextInput, **kwargs) -> List[str]:
|
||||
"""
|
||||
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
|
||||
special tokens using the tokenizer `prepare_for_model` method.
|
||||
Converts a string in a sequence of tokens, using the tokenizer.
|
||||
|
||||
Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies
|
||||
(BPE/SentencePieces/WordPieces). Takes care of added tokens.
|
||||
|
||||
Args:
|
||||
token_ids_0 (`List[int]`):
|
||||
List of IDs.
|
||||
token_ids_1 (`List[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
already_has_special_tokens (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not the token list is already formatted with special tokens for the model.
|
||||
text (`str`):
|
||||
The sequence to be encoded.
|
||||
**kwargs (additional keyword arguments):
|
||||
Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
|
||||
|
||||
Returns:
|
||||
`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
||||
`List[str]`: The list of tokens.
|
||||
"""
|
||||
if already_has_special_tokens:
|
||||
return super().get_special_tokens_mask(
|
||||
token_ids_0=token_ids_0,
|
||||
token_ids_1=token_ids_1,
|
||||
already_has_special_tokens=True)
|
||||
# Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
|
||||
# all_special_tokens_extended = dict(
|
||||
# (str(t), t)
|
||||
# for t in self.all_special_tokens_extended
|
||||
# if isinstance(t, AddedToken)
|
||||
# )
|
||||
|
||||
bos_token_id = [1] if self.add_bos_token else []
|
||||
eos_token_id = [1] if self.add_eos_token else []
|
||||
text, kwargs = self.prepare_for_tokenization(text, **kwargs)
|
||||
|
||||
if token_ids_1 is None:
|
||||
return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
|
||||
return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
|
||||
bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
|
||||
# TODO: should this be in the base class?
|
||||
if hasattr(self, "do_lower_case") and self.do_lower_case:
|
||||
# convert non-special tokens to lowercase
|
||||
escaped_special_toks = [
|
||||
re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
|
||||
]
|
||||
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
|
||||
text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
|
||||
|
||||
def create_token_type_ids_from_sequences(
|
||||
self,
|
||||
token_ids_0: List[int],
|
||||
token_ids_1: Optional[List[int]] = None) -> List[int]:
|
||||
"""
|
||||
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
|
||||
sequence pair mask has the following format:
|
||||
```
|
||||
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
||||
| first sequence | second sequence |
|
||||
```
|
||||
if token_ids_1 is None, only returns the first portion of the mask (0s).
|
||||
Args:
|
||||
token_ids_0 (`List[int]`):
|
||||
List of ids.
|
||||
token_ids_1 (`List[int]`, *optional*):
|
||||
Optional second list of IDs for sequence pairs.
|
||||
Returns:
|
||||
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
|
||||
"""
|
||||
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
|
||||
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
|
||||
no_split_token = set(self.unique_no_split_tokens)
|
||||
tokens = self.tokens_trie.split(text)
|
||||
|
||||
output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
|
||||
# ["This is something", "<special_token_1>", " else"]
|
||||
# for i, token in enumerate(tokens):
|
||||
# if token in no_split_token:
|
||||
# tok_extended = all_special_tokens_extended.get(token, None)
|
||||
# print(f'>>>{token}|{tok_extended}|{all_special_tokens_extended}<<<')
|
||||
# left = tokens[i - 1] if i > 0 else None
|
||||
# right = tokens[i + 1] if i < len(tokens) - 1 else None
|
||||
# if isinstance(tok_extended, AddedToken):
|
||||
# if tok_extended.rstrip and right:
|
||||
# # A bit counter-intuitive but we strip the left of the string
|
||||
# # since tok_extended.rstrip means the special token is eating all white spaces on its right
|
||||
# tokens[i + 1] = right.lstrip()
|
||||
# # Strip white spaces on the left
|
||||
# if tok_extended.lstrip and left:
|
||||
# tokens[i - 1] = left.rstrip() # Opposite here
|
||||
# else:
|
||||
# We strip left and right by default
|
||||
# if right:
|
||||
# tokens[i + 1] = right.lstrip()
|
||||
# if left:
|
||||
# tokens[i - 1] = left.rstrip()
|
||||
# ["This is something", "<special_token_1>", "else"]
|
||||
tokenized_text = []
|
||||
for token in tokens:
|
||||
# Need to skip eventual empty (fully stripped) tokens
|
||||
if not token:
|
||||
continue
|
||||
if token in no_split_token:
|
||||
tokenized_text.append(token)
|
||||
else:
|
||||
tokenized_text.extend(self._tokenize(token))
|
||||
# ["This", " is", " something", "<special_token_1>", "else"]
|
||||
return tokenized_text
|
||||
|
||||
if token_ids_1 is not None:
|
||||
output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
|
||||
|
||||
return output
|
||||
def _decode(self, *args, **kwargs):
|
||||
"""doc"""
|
||||
kwargs.pop("clean_up_tokenization_spaces", None)
|
||||
kwargs.pop("spaces_between_special_tokens", None)
|
||||
return super()._decode(
|
||||
*args,
|
||||
**kwargs,
|
||||
clean_up_tokenization_spaces=False,
|
||||
spaces_between_special_tokens=False,
|
||||
)
|
||||
|
||||
    def _pad(
        self,
        encoded_inputs: Dict,
        max_length: Optional[int] = None,
        padding_strategy=PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """Pad ``encoded_inputs`` and build/pad a matching attention mask.

        Unlike the base implementation, the attention mask handled here may be
        a 3-D ``(1, seq, seq)`` causal matrix rather than a 1-D mask, so it is
        normalized and padded locally and the base class is told not to touch
        it (``return_attention_mask=False`` in the ``super()._pad`` call).

        Args:
            encoded_inputs: dict of model inputs; the entry named by
                ``self.model_input_names[0]`` is the reference sequence.
            max_length: target length to pad to.
            padding_strategy: ``PaddingStrategy`` member; ``LONGEST`` snaps
                ``max_length`` to the current sequence length.
            pad_to_multiple_of: if set, round ``max_length`` up to a multiple.
            return_attention_mask: whether to include "attention_mask" in the
                output; defaults to whether the model declares that name.

        Returns:
            dict: padded ``encoded_inputs``; attention mask as nested lists.
        """
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names
        if return_attention_mask:
            required_input = encoded_inputs[self.model_input_names[0]]
            if padding_strategy == PaddingStrategy.LONGEST:
                max_length = len(required_input)
            # Round max_length up to the next multiple when requested.
            if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
                max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
            needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
            if "attention_mask" in encoded_inputs and encoded_inputs["attention_mask"] is not None:
                # Normalize a caller-provided mask to a numpy array.
                attention_mask = encoded_inputs.pop("attention_mask")
                if isinstance(attention_mask, paddle.Tensor):
                    attention_mask = attention_mask.numpy()
                elif isinstance(attention_mask, list):
                    attention_mask = np.array(attention_mask)
                elif not isinstance(attention_mask, np.ndarray):
                    raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ")
            else:
                # No mask supplied: build a causal (lower-triangular) mask of
                # shape (1, seq, seq).
                attention_mask = np.tril(np.ones((len(required_input), len(required_input)), dtype=np.int64))
                attention_mask = np.expand_dims(attention_mask, axis=0)
            if needs_to_be_padded:
                difference = max_length - len(required_input)
                # 1-D masks pad along the single axis; 3-D masks pad the last
                # two (query/key) axes, never the leading batch axis.
                if self.padding_side == "right":
                    if attention_mask.ndim == 1:
                        pad_width = [(0, difference)]
                    else:
                        pad_width = [(0, 0), (0, difference), (0, difference)]
                elif self.padding_side == "left":
                    if attention_mask.ndim == 1:
                        pad_width = [(difference, 0)]
                    else:
                        pad_width = [(0, 0), (difference, 0), (difference, 0)]
                else:
                    # NOTE(review): message says "padding strategy" but this
                    # actually validates self.padding_side.
                    raise ValueError("Invalid padding strategy:" + str(self.padding_side))
                attention_mask = np.pad(
                    attention_mask,
                    pad_width=pad_width,
                    mode="constant",
                    constant_values=0,
                )
        # Let the base class pad everything except the attention mask.
        encoded_inputs = super()._pad(
            encoded_inputs,
            max_length,
            padding_strategy=padding_strategy,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=False,
        )
        if return_attention_mask:
            encoded_inputs["attention_mask"] = attention_mask.tolist()
        return encoded_inputs
|
||||
|
||||
|
||||
def add_special_tokens(
    tokenizer,
    special_tokens_info,
    use_ocr_specialtoken=False,
    use_crop_specialtoken=False,
    special_token_ids_start=254208,
    special_token_ids_end=256256,
):
    """Register multimodal special tokens on *tokenizer*, then sanity-check ids.

    Always registers the image/audio placeholder tokens. Optionally also
    registers OCR tokens (coordinate tokens ``<|LOC_0|>``..``<|LOC_1000|>``
    plus begin/end markers) and crop separator tokens
    (``<|CROP_COL_SEP|>`` for column splits, ``<|CROP_ROW_SEP|>`` for row
    splits, ``<|CROP_IMAGE_SEP|>`` separating the original image from crops).

    Args:
        tokenizer: ERNIE tokenizer, mutated in place.
        special_tokens_info (dict): token strings keyed by
            "image_placeholder", "audio_placeholder", "ocr_coor",
            "ocr_begin_end" and "crop".
        use_ocr_specialtoken (bool, optional): also add OCR tokens.
        use_crop_specialtoken (bool, optional): also add crop tokens.
        special_token_ids_start (int, optional): expected id of the first
            added special token. Defaults to 254208.
        special_token_ids_end (int, optional): exclusive upper bound for the
            resulting vocabulary size. Defaults to 256256.
    """
    tokens_to_add = [
        special_tokens_info["image_placeholder"],
        special_tokens_info["audio_placeholder"],
    ]
    if use_ocr_specialtoken:
        tokens_to_add += special_tokens_info["ocr_coor"]
        tokens_to_add += special_tokens_info["ocr_begin_end"]
    if use_crop_specialtoken:
        tokens_to_add += special_tokens_info["crop"]

    # Register everything in one call.
    tokenizer.add_special_tokens({"additional_special_tokens": tokens_to_add})

    # Sanity checks: the first added token must land exactly at the start of
    # the reserved id range, and the grown vocab must still fit under the cap.
    first_special_tokens = tokenizer.encode(tokens_to_add[0])["input_ids"]
    assert first_special_tokens[0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
    assert (
        len(tokenizer.get_vocab()) < special_token_ids_end
    ), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end} 增加过多special token了!"
|
||||
|
||||
@@ -0,0 +1,260 @@
|
||||
"""
|
||||
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import re
|
||||
from fastdeploy.input.mm_processor import DataProcessor, IDS_TYPE_FLAG
|
||||
from fastdeploy.input.ernie_processor import ErnieProcessor
|
||||
from fastdeploy.engine.request import Request
|
||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
|
||||
class ErnieMoEVLProcessor(ErnieProcessor):
    """The processor class for ERNIE MoE VL models.

    Wraps a multimodal ``DataProcessor`` (tokenizer + image preprocessor) and
    converts incoming requests — a plain prompt with placeholder markers, or
    chat messages, optionally carrying images/videos — into packed model
    inputs (input_ids, token_type_ids, 3-D position_ids, image tensors).
    """
    def __init__(self, model_name_or_path, limit_mm_per_prompt=None, mm_processor_kwargs=None,
                 reasoning_parser_obj=None):
        # The ERNIE tokenizer is used; the HF tokenizer path is disabled.
        self.use_hf_tokenizer = False

        # A "merge_llm_model" leaf directory means the tokenizer/preprocessor
        # files live one directory up — presumably a merged-checkpoint layout;
        # confirm against the exporter.
        if "merge_llm_model" in model_name_or_path:
            model_name_or_path = os.path.dirname(model_name_or_path)
        data_processor_logger.info(f"model_name_or_path: {model_name_or_path}")
        tokenizer_path = model_name_or_path
        preprocessor_path = model_name_or_path
        processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)

        self.ernie_processor = DataProcessor(
            tokenizer_name=tokenizer_path,
            image_preprocessor_name=preprocessor_path,
            **processor_kwargs
        )
        # Evaluation mode: the processor does not produce training labels.
        self.ernie_processor.eval()
        self.image_patch_id = self.ernie_processor.image_patch_id
        self.spatial_conv_size = self.ernie_processor.spatial_conv_size

        # Per-request streaming decode state.
        self.decode_status = dict()
        self._load_tokenizer()
        self.eos_token_ids = [self.tokenizer.eos_token_id]
        self.eos_token_id_len = len(self.eos_token_ids)
        self.pad_token_id = self.get_pad_id()
        self.limit_mm_per_prompt = self._parse_limits(limit_mm_per_prompt)
        self.reasoning_parser = None
        if reasoning_parser_obj:
            self.reasoning_parser = reasoning_parser_obj(self.tokenizer)

    def get_pad_id(self):
        """Return the tokenizer's pad token id."""
        return self.tokenizer.pad_token_id

    def _load_tokenizer(self):
        """Expose the DataProcessor's tokenizer as ``self.tokenizer``.

        Overrides the parent's loader: the tokenizer was already constructed
        by ``DataProcessor``, so it is simply reused here.
        """
        self.tokenizer = self.ernie_processor.tokenizer

    def process_request(self, request, max_model_len=None, **kwargs):
        """Process a Request object into packed model inputs.

        Round-trips through a plain dict so ``process_request_dict`` can do
        the heavy lifting, then rebuilds the Request from it.
        """
        task = request.to_dict()
        task['enable_thinking'] = kwargs.get("enable_thinking", True)
        self.process_request_dict(task, max_model_len)
        request = Request.from_dict(task)

        return request

    def _parse_processor_kwargs(self, kwargs):
        """Validate multimodal processor kwargs; return {} on any problem."""
        if not kwargs:
            return {}

        try:
            if not isinstance(kwargs, dict):
                raise ValueError("mm-processor-kwargs must be a dictionary")

            # Type-check every recognized parameter; unknown keys pass through.
            data_processor_logger.info(f"kwargs:{kwargs}")
            expected_types = {
                "spatial_conv_size": int,
                "temporal_conv_size": int,
                "image_min_pixels": int,
                "image_max_pixels": int,
                "video_min_pixels": int,
                "video_max_pixels": int,
                "video_target_frames": int,
                "video_frames_sample": str,
                "video_max_frames": int,
                "video_min_frames": int,
                "video_fps": int
            }

            for key, value in kwargs.items():
                if key in expected_types and not isinstance(value, expected_types[key]):
                    raise ValueError(
                        f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}")

            return kwargs

        except Exception as e:
            # Invalid kwargs are dropped (best effort), not fatal.
            data_processor_logger.warning(f"Invalid mm-processor-kwargs format: {e}")
            return {}

    def _parse_limits(self, limits):
        """Parse per-prompt multimodal item limits, merged over defaults."""
        DEFAULT_LIMITS = {
            "image": 1,
            "video": 1,
            "audio": 1
        }

        if not limits:
            return DEFAULT_LIMITS

        try:
            if not isinstance(limits, dict):
                raise ValueError("limit-mm-per-prompt must be a dictionary")
            data_processor_logger.info(f"_parse_limits:{limits}")
            # User-supplied values override the defaults per modality.
            return {**DEFAULT_LIMITS, **limits}
        except Exception as e:
            data_processor_logger.warning(f"Invalid limit-mm-per-prompt format: {e}, using default limits")
            return DEFAULT_LIMITS

    def _check_mm_limits(self, item):
        """Raise ValueError when a prompt exceeds the per-modality item limits.

        ``item`` is either a multi_modal_data dict (prompt-style requests) or
        a list of chat messages whose content parts are scanned.
        """
        if isinstance(item, dict):
            # Prompt-style request: item is already the multi_modal_data dict.
            mm_data = item
        else:
            # Chat-style request: collect image/video parts from messages.
            mm_data = {
                "image": [],
                "video": []
            }

            for message in item:
                if isinstance(message.get("content"), list):
                    for part in message["content"]:
                        if part.get("type") == "image":
                            mm_data["image"].append(part)
                        elif part.get("type") == "video":
                            mm_data["video"].append(part)

        for modality, data in mm_data.items():
            if modality in self.limit_mm_per_prompt:
                limit = self.limit_mm_per_prompt[modality]
                if len(data) > limit:
                    raise ValueError(
                        f"Too many {modality} items in prompt, "
                        f"got {len(data)} but limit is {limit}"
                    )

    def process_request_dict(self, request, max_model_len=None):
        """Tokenize a request dict in place and attach multimodal inputs.

        Fills ``prompt_token_ids``, ``prompt_token_ids_len``,
        ``multimodal_inputs``, stop-sequence fields, ``eos_token_ids`` and
        ``max_tokens``; truncates over-long prompts.

        Raises:
            ValueError: if the request has neither 'prompt' nor 'messages'.
        """
        if not request.get("eos_token_ids"):
            request["eos_token_ids"] = self.eos_token_ids

        # Compile user-supplied stop strings into token-id sequences.
        stop_sequences = request.get("stop", [])
        if stop_sequences:
            stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences)
            request["stop_token_ids"] = stop_seqs
            request["stop_seqs_len"] = stop_seqs_len

        if request.get("prompt"):
            multimodal_data = request.get("multimodal_data")
            if multimodal_data is None:
                multimodal_data = {}
            self._check_mm_limits(multimodal_data)
            images = multimodal_data.get("image", None)
            videos = multimodal_data.get("video", None)
            outputs = self.ernie_processor.text2ids(request["prompt"], images, videos)
        elif request.get("messages"):
            messages = request["messages"]
            self._check_mm_limits(messages)
            outputs = self.ernie_processor.request2ids(request)
        else:
            raise ValueError(f"Request must contain 'prompt', or 'messages': {request}")

        metadata = request.get("metadata")
        # If metadata carries previously generated tokens, append them to the
        # end of input_ids — presumably to resume generation; confirm with caller.
        if metadata and metadata.get("generated_token_ids"):
            self.append_generated_tokens(outputs, metadata["generated_token_ids"])
        outputs = self.pack_outputs(outputs)
        request["prompt_token_ids"] = outputs["input_ids"]
        request["prompt_token_ids_len"] = len(request["prompt_token_ids"])
        request["multimodal_inputs"] = outputs

        # Truncate prompts that exceed the context window (one slot is left free).
        if max_model_len is not None and len(
                request['prompt_token_ids']) > max_model_len:
            request['prompt_token_ids'] = request[
                'prompt_token_ids'][:max_model_len - 1]
        # NOTE(review): this arithmetic assumes max_model_len is not None;
        # a None value here would raise a TypeError.
        if request.get("max_tokens") is None:
            request["max_tokens"] = max(
                1, max_model_len - len(request['prompt_token_ids']))
        data_processor_logger.info(f"Processed request {request}")

        return request

    def append_generated_tokens(self, multimodal_inputs, generated_token_ids):
        """Append already-generated tokens as text tokens, extending positions."""
        num_tokens = len(generated_token_ids)
        multimodal_inputs["input_ids"].extend(generated_token_ids)
        multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens)

        # Text tokens share the same index on all three position axes.
        start = multimodal_inputs["cur_position"]
        for i in range(num_tokens):
            multimodal_inputs["position_ids"].append([start + i] * 3)
        multimodal_inputs["cur_position"] += num_tokens

    def pack_outputs(self, outs):
        """Convert list-accumulated processor outputs into numpy arrays."""
        # Stack or nullify image-related fields
        if not outs["images"]:
            outs["images"] = None
            outs["grid_thw"] = None
            outs["image_type_ids"] = None
        else:
            outs["images"] = np.vstack(outs["images"])
            outs["grid_thw"] = np.vstack(outs["grid_thw"])
            outs["image_type_ids"] = np.array(outs["image_type_ids"])

        # Convert lists to arrays
        outs["input_ids"] = np.array(outs["input_ids"], dtype=np.int64)
        outs["token_type_ids"] = np.array(outs["token_type_ids"], dtype=np.int64)
        outs["position_ids"] = np.array(outs["position_ids"], dtype=np.int64)

        return outs

    def process_response_dict(self, response_dict, stream, **kwargs):
        """
        Post-process an engine response (ids fields) into text fields.

        Dispatches to the streaming or one-shot decoder depending on *stream*;
        ``enable_thinking`` defaults to True when absent or None.

        Args:
            response_dict (Dict): response from the engine, contains ids fields

        Returns:
            Dict: response containing text fields
        """
        enable_thinking = kwargs.pop("enable_thinking", True)
        if enable_thinking is None:
            enable_thinking = True
        if stream:
            return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs)
        else:
            return self.process_response_dict_normal(response_dict, enable_thinking=enable_thinking, **kwargs)
|
||||
@@ -22,16 +22,16 @@ from typing import List, Optional, Union
|
||||
import numpy as np
|
||||
import paddle
|
||||
import PIL
|
||||
from paddlenlp.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddlenlp.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddlenlp.transformers.image_transforms import (
|
||||
from paddleformers.transformers.feature_extraction_utils import BatchFeature
|
||||
from paddleformers.transformers.image_processing_utils import BaseImageProcessor
|
||||
from paddleformers.transformers.image_transforms import (
|
||||
convert_to_rgb,
|
||||
normalize,
|
||||
rescale,
|
||||
resize,
|
||||
to_channel_dimension_format,
|
||||
)
|
||||
from paddlenlp.transformers.image_utils import (
|
||||
from paddleformers.transformers.image_utils import (
|
||||
ChannelDimension,
|
||||
ImageInput,
|
||||
PILImageResampling,
|
||||
@@ -42,7 +42,7 @@ from paddlenlp.transformers.image_utils import (
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
)
|
||||
from paddlenlp.transformers.tokenizer_utils_base import (
|
||||
from paddleformers.transformers.tokenizer_utils_base import (
|
||||
TensorType,
|
||||
)
|
||||
from PIL import Image
|
||||
@@ -326,7 +326,7 @@ class AdaptiveImageProcessor(BaseImageProcessor):
|
||||
max_pixels=self.max_pixels,
|
||||
)
|
||||
image = image.astype("uint8") # TODO : 需要手动加上,否则多除255 导致结果会出错
|
||||
# 直接fromarray,不要靠paddlenlp里面的
|
||||
# 直接fromarray,不要靠paddleformers里面的
|
||||
image = Image.fromarray(image)
|
||||
image = resize(
|
||||
image,
|
||||
|
||||
@@ -18,11 +18,12 @@
|
||||
""" process.py """
|
||||
import copy
|
||||
import io
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from typing import Any, Dict, List, Union
|
||||
|
||||
import numpy as np
|
||||
from paddlenlp.transformers.image_utils import ChannelDimension
|
||||
from paddleformers.transformers.image_utils import ChannelDimension
|
||||
from PIL import Image
|
||||
|
||||
|
||||
@@ -31,6 +32,8 @@ from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcess
|
||||
from .process_video import read_frames_decord, read_video_decord
|
||||
from .utils.io_utils import RAW_IMAGE_DIR, get_downloadable
|
||||
from .utils.render_timestamp import render_frame_timestamp
|
||||
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
|
||||
from fastdeploy.entrypoints.chat_utils import parse_chat_messages
|
||||
|
||||
IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3}
|
||||
|
||||
@@ -94,9 +97,11 @@ class DataProcessor:
|
||||
video_max_frames: int = 180,
|
||||
video_min_frames: int = 16,
|
||||
video_fps: int = 2,
|
||||
**kwargs
|
||||
) -> None:
|
||||
# Tokenizer and image preprocessor
|
||||
self.tokenizer = ErnieVLTokenizer.from_pretrained(tokenizer_name, verbose=False)
|
||||
self.model_name_or_path = tokenizer_name
|
||||
self._load_tokenizer()
|
||||
self.tokenizer.ignored_index = -100
|
||||
self.image_preprocessor = AdaptiveImageProcessor.from_pretrained(image_preprocessor_name)
|
||||
|
||||
@@ -125,6 +130,8 @@ class DataProcessor:
|
||||
self.video_start = self.VID_START
|
||||
self.video_end = self.VID_END
|
||||
self.image_patch_id = self.tokenizer.convert_tokens_to_ids("<|IMAGE_PLACEHOLDER|>")
|
||||
self.image_start_id = self.tokenizer.convert_tokens_to_ids(self.image_start)
|
||||
self.video_start_id = self.tokenizer.convert_tokens_to_ids(self.video_start)
|
||||
|
||||
self.token_type_mapping = self._build_token_type_mapping()
|
||||
self.is_training = True
|
||||
@@ -145,11 +152,12 @@ class DataProcessor:
|
||||
"""Enable evaluation mode (doesn't produce labels)."""
|
||||
self.is_training = False
|
||||
|
||||
def process(self, messages: List[Dict[str, Any]]) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
|
||||
def text2ids(self, text, images=None, videos=None):
|
||||
"""
|
||||
Convert chat messages into model inputs.
|
||||
Convert chat text into model inputs.
|
||||
Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
|
||||
"""
|
||||
|
||||
outputs = {
|
||||
"input_ids": [],
|
||||
"token_type_ids": [],
|
||||
@@ -162,37 +170,94 @@ class DataProcessor:
|
||||
"pic_cnt": 0,
|
||||
"video_cnt": 0,
|
||||
}
|
||||
self._add_special_token(self.cls_token, outputs)
|
||||
|
||||
IMAGE_PLACEHOLDER = "<|image@placeholder|>"
|
||||
VIDEO_PLACEHOLDER = "<|video@placeholder|>"
|
||||
IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER)
|
||||
VIDEO_PLACEHOLDER_LEN = len(VIDEO_PLACEHOLDER)
|
||||
st, image_idx, video_idx = 0, 0, 0
|
||||
while st < len(text):
|
||||
image_pos = text.find(IMAGE_PLACEHOLDER, st)
|
||||
image_pos = len(text) if image_pos == -1 else image_pos
|
||||
video_pos = text.find(VIDEO_PLACEHOLDER, st)
|
||||
video_pos = len(text) if video_pos == -1 else video_pos
|
||||
ed = min(image_pos, video_pos)
|
||||
|
||||
self._add_text(text[st:ed], outputs)
|
||||
if ed == len(text):
|
||||
break
|
||||
|
||||
if ed == image_pos:
|
||||
self._add_image(images[image_idx], outputs)
|
||||
image_idx += 1
|
||||
st = ed + IMAGE_PLACEHOLDER_LEN
|
||||
else:
|
||||
item = videos[video_idx]
|
||||
if isinstance(item, dict):
|
||||
frames = self._load_and_process_video(item["video"], item)
|
||||
else:
|
||||
frames = self._load_and_process_video(item, {})
|
||||
|
||||
self._add_video(frames, outputs)
|
||||
video_idx += 1
|
||||
st = ed + VIDEO_PLACEHOLDER_LEN
|
||||
|
||||
return outputs
|
||||
|
||||
def request2ids(self, request: Dict[str, Any]) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]:
|
||||
"""
|
||||
Convert chat messages into model inputs.
|
||||
Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels.
|
||||
"""
|
||||
|
||||
outputs = {
|
||||
"input_ids": [],
|
||||
"token_type_ids": [],
|
||||
"position_ids": [],
|
||||
"images": [],
|
||||
"grid_thw": [],
|
||||
"image_type_ids": [],
|
||||
"labels": [],
|
||||
"cur_position": 0,
|
||||
"pic_cnt": 0,
|
||||
"video_cnt": 0,
|
||||
}
|
||||
|
||||
messages = parse_chat_messages(request.get("messages"))
|
||||
image_message_list = []
|
||||
for msg in messages:
|
||||
role = msg.get("role")
|
||||
assert role in self.role_prefixes, f"Unsupported role: {role}"
|
||||
prefix = self.role_prefixes[role]
|
||||
if prefix:
|
||||
self._add_text(prefix, outputs)
|
||||
|
||||
content_items = msg.get("content")
|
||||
if not isinstance(content_items, list):
|
||||
content_items = [content_items]
|
||||
|
||||
for item in content_items:
|
||||
if isinstance(item, str) or item.get("type") == "text":
|
||||
text = item if isinstance(item, str) else item.get("text", "")
|
||||
self._add_text(text, outputs)
|
||||
elif item.get("type") == "image_url" or item.get("type") == "image":
|
||||
self._add_image(item, outputs)
|
||||
elif item.get("type") == "video_url" or item.get("type") == "video":
|
||||
self._add_video(item, outputs)
|
||||
|
||||
if role in ("user", "system"):
|
||||
self._add_text("\n", outputs)
|
||||
else:
|
||||
self._add_special_token(self.sep_token, outputs)
|
||||
|
||||
if not self.is_training:
|
||||
# Append assistant prefix in eval
|
||||
self._add_text(self.role_prefixes["bot"], outputs)
|
||||
|
||||
if isinstance(item, dict) and item.get("type") in ["image", "video"]:
|
||||
image_message_list.append(item)
|
||||
|
||||
prompt_token_ids = self.apply_chat_template(request)
|
||||
image_start_index = 0
|
||||
image_message_index = 0
|
||||
for i in range(len(prompt_token_ids)):
|
||||
if prompt_token_ids[i] in [self.image_start_id, self.video_start_id]:
|
||||
self._add_text(prompt_token_ids[image_start_index:i + 1], outputs)
|
||||
image_start_index = i + 1
|
||||
image_message = image_message_list[image_message_index]
|
||||
if image_message["type"] == "image":
|
||||
img = image_message.get("image")
|
||||
if img is None:
|
||||
continue
|
||||
outputs["pic_cnt"] += 1
|
||||
self._add_image(img, outputs)
|
||||
elif image_message["type"] == "video":
|
||||
video_bytes = image_message.get("video")
|
||||
if video_bytes is None:
|
||||
continue
|
||||
frames = self._load_and_process_video(video_bytes, image_message)
|
||||
outputs["video_cnt"] += 1
|
||||
self._add_video(frames, outputs)
|
||||
image_message_index += 1
|
||||
self._add_text(prompt_token_ids[image_start_index:], outputs)
|
||||
return outputs
|
||||
|
||||
def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None:
|
||||
@@ -203,8 +268,9 @@ class DataProcessor:
|
||||
outputs["position_ids"].append([pos] * 3)
|
||||
outputs["cur_position"] += 1
|
||||
|
||||
def _add_text(self, text: str, outputs: Dict) -> None:
|
||||
tokens = self.tokenizer.encode(text, add_special_tokens=False)["input_ids"]
|
||||
def _add_text(self, tokens, outputs: Dict) -> None:
|
||||
if isinstance(tokens, str):
|
||||
tokens = self.tokenizer.encode(tokens, add_special_tokens=False)["input_ids"]
|
||||
outputs["input_ids"].extend(tokens)
|
||||
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * len(tokens))
|
||||
|
||||
@@ -213,25 +279,7 @@ class DataProcessor:
|
||||
outputs["position_ids"].append([start + i] * 3)
|
||||
outputs["cur_position"] += len(tokens)
|
||||
|
||||
def _add_image(self, item: Dict, outputs: Dict) -> None:
|
||||
url_info = item.get("image_url", {})
|
||||
w = url_info.get("image_width", None)
|
||||
h = url_info.get("image_height", None)
|
||||
|
||||
if "image" in item:
|
||||
img = item["image"]
|
||||
else:
|
||||
url = url_info.get("url")
|
||||
data = get_downloadable(url, download_dir=RAW_IMAGE_DIR, save_to_disk=False)
|
||||
img = Image.open(io.BytesIO(data) if isinstance(data, bytes) else data)
|
||||
|
||||
if w and h:
|
||||
img = img.resize((w, h))
|
||||
|
||||
outputs["pic_cnt"] += 1
|
||||
self._add_text(f"Picture {outputs['pic_cnt']}:", outputs)
|
||||
self._add_special_token(self.IMG_START, outputs)
|
||||
|
||||
def _add_image(self, img, outputs: Dict) -> None:
|
||||
patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
|
||||
img.height,
|
||||
img.width,
|
||||
@@ -260,21 +308,7 @@ class DataProcessor:
|
||||
outputs["grid_thw"].append(ret["image_grid_thw"])
|
||||
outputs["image_type_ids"].append(0)
|
||||
|
||||
self._add_special_token(self.IMG_END, outputs)
|
||||
|
||||
def _add_video(self, item: Dict, outputs: Dict) -> None:
|
||||
url_info = item.get("video_url", {})
|
||||
url = url_info.get("url")
|
||||
outputs["video_cnt"] += 1
|
||||
self._add_text(f"Video {outputs['video_cnt']}:", outputs)
|
||||
self._add_special_token(self.VID_START, outputs)
|
||||
|
||||
if "video" in item:
|
||||
video_path = item["video"]
|
||||
frames = self._load_and_process_video(video_path, item)
|
||||
else:
|
||||
video_path = get_downloadable(url, save_to_disk=False)
|
||||
frames = self._load_and_process_video(video_path, item)
|
||||
def _add_video(self, frames, outputs: Dict) -> None:
|
||||
patches_h, patches_w = self.image_preprocessor.get_smarted_resize(
|
||||
frames[0].height,
|
||||
frames[0].width,
|
||||
@@ -305,8 +339,6 @@ class DataProcessor:
|
||||
outputs["position_ids"].extend(pos_ids)
|
||||
outputs["cur_position"] = np.max(pos_ids) + 1
|
||||
|
||||
self._add_special_token(self.VID_END, outputs)
|
||||
|
||||
def _load_and_process_video(self, url: str, item: Dict) -> List[Image.Image]:
|
||||
reader, meta, path = read_video_decord(url, save_to_disk=False)
|
||||
|
||||
@@ -386,3 +418,38 @@ class DataProcessor:
|
||||
|
||||
coords = list(zip(time_idx, h_idx, w_idx))
|
||||
return [[start_idx + ti, start_idx + hi, start_idx + wi] for ti, hi, wi in coords]
|
||||
|
||||
def _load_tokenizer(self):
|
||||
"""
|
||||
load tokenizer
|
||||
|
||||
Returns:
|
||||
tokenizer (AutoTokenizer)
|
||||
"""
|
||||
vocab_file_names = ["tokenizer.model", "spm.model", "ernie_token_100k.model"]
|
||||
for i in range(len(vocab_file_names)):
|
||||
if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])):
|
||||
ErnieBotTokenizer.resource_files_names["vocab_file"] = vocab_file_names[i]
|
||||
break
|
||||
self.tokenizer = ErnieBotTokenizer.from_pretrained(self.model_name_or_path)
|
||||
|
||||
def apply_chat_template(self, request):
|
||||
"""
|
||||
Convert multi-turn messages into ID sequences.
|
||||
|
||||
Args:
|
||||
messages: Either a request dict containing 'messages' field,
|
||||
or a list of message dicts directly
|
||||
|
||||
Returns:
|
||||
List of token IDs as strings (converted from token objects)
|
||||
"""
|
||||
if self.tokenizer.chat_template is None:
|
||||
raise ValueError("This model does not support chat_template.")
|
||||
|
||||
prompt_token_str = self.tokenizer.apply_chat_template(
|
||||
request, tokenize=False, add_generation_prompt=request.get("add_generation_prompt", True)
|
||||
).replace("<|image@placeholder|>", "").replace("<|video@placeholder|>", "")
|
||||
tokens = self.tokenizer.tokenize(prompt_token_str)
|
||||
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
|
||||
return token_ids
|
||||
@@ -13,7 +13,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
"""
|
||||
ErnieVLTokenizer
|
||||
"""
|
||||
@@ -25,12 +24,11 @@ from typing import Dict, List, Optional, Tuple
|
||||
import numpy as np
|
||||
import paddle
|
||||
import sentencepiece as spm
|
||||
from paddlenlp.transformers import PretrainedTokenizer
|
||||
from paddlenlp.transformers.tokenizer_utils_base import (
|
||||
PaddingStrategy,
|
||||
TextInput,
|
||||
)
|
||||
from paddlenlp.utils.log import logger
|
||||
from paddleformers.transformers import PretrainedTokenizer
|
||||
from paddleformers.transformers.tokenizer_utils_base import (PaddingStrategy,
|
||||
TextInput)
|
||||
|
||||
from fastdeploy.utils import console_logger as logger
|
||||
|
||||
|
||||
class ErnieVLTokenizer(PretrainedTokenizer):
|
||||
@@ -43,7 +41,9 @@ class ErnieVLTokenizer(PretrainedTokenizer):
|
||||
pretrained_init_configuration = {
|
||||
"ernie-bot-10b": {},
|
||||
}
|
||||
model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
|
||||
model_input_names = [
|
||||
"input_ids", "position_ids", "attention_mask", "labels"
|
||||
]
|
||||
padding_side = "right"
|
||||
|
||||
def __init__(
|
||||
@@ -114,7 +114,10 @@ class ErnieVLTokenizer(PretrainedTokenizer):
|
||||
|
||||
def get_vocab(self):
|
||||
"""doc"""
|
||||
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
||||
vocab = {
|
||||
self.convert_ids_to_tokens(i): i
|
||||
for i in range(self.vocab_size)
|
||||
}
|
||||
vocab.update(self.added_tokens_encoder)
|
||||
return vocab
|
||||
|
||||
@@ -157,7 +160,9 @@ class ErnieVLTokenizer(PretrainedTokenizer):
|
||||
# logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`')
|
||||
return super().prepare_for_model(*args, **kwargs)
|
||||
|
||||
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||
def save_vocabulary(self,
|
||||
save_directory,
|
||||
filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||
"""
|
||||
Save the vocabulary and special tokens file to a directory.
|
||||
Args:
|
||||
@@ -167,19 +172,22 @@ class ErnieVLTokenizer(PretrainedTokenizer):
|
||||
`Tuple(str)`: Paths to the files saved.
|
||||
"""
|
||||
if not os.path.isdir(save_directory):
|
||||
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
|
||||
logger.error(
|
||||
f"Vocabulary path ({save_directory}) should be a directory")
|
||||
return
|
||||
out_vocab_file = os.path.join(
|
||||
save_directory,
|
||||
(filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"],
|
||||
(filename_prefix + "-" if filename_prefix else "") +
|
||||
self.resource_files_names["vocab_file"],
|
||||
)
|
||||
if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
|
||||
if os.path.abspath(self.vocab_file) != os.path.abspath(
|
||||
out_vocab_file) and os.path.isfile(self.vocab_file):
|
||||
copyfile(self.vocab_file, out_vocab_file)
|
||||
elif not os.path.isfile(self.vocab_file):
|
||||
with open(out_vocab_file, "wb") as fi:
|
||||
content_spiece_model = self.sp_model.serialized_model_proto()
|
||||
fi.write(content_spiece_model)
|
||||
return (out_vocab_file,)
|
||||
return (out_vocab_file, )
|
||||
|
||||
def tokenize(self, text: TextInput, **kwargs) -> List[str]:
|
||||
"""
|
||||
@@ -203,10 +211,13 @@ class ErnieVLTokenizer(PretrainedTokenizer):
|
||||
if hasattr(self, "do_lower_case") and self.do_lower_case:
|
||||
# convert non-special tokens to lowercase
|
||||
escaped_special_toks = [
|
||||
re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens)
|
||||
re.escape(s_tok) for s_tok in (self.unique_no_split_tokens +
|
||||
self.all_special_tokens)
|
||||
]
|
||||
pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
|
||||
text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
|
||||
text = re.sub(pattern,
|
||||
lambda m: m.groups()[0] or m.groups()[1].lower(),
|
||||
text)
|
||||
|
||||
no_split_token = set(self.unique_no_split_tokens)
|
||||
tokens = self.tokens_trie.split(text)
|
||||
@@ -248,19 +259,27 @@ class ErnieVLTokenizer(PretrainedTokenizer):
|
||||
required_input = encoded_inputs[self.model_input_names[0]]
|
||||
if padding_strategy == PaddingStrategy.LONGEST:
|
||||
max_length = len(required_input)
|
||||
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
|
||||
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
|
||||
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
|
||||
if "attention_mask" in encoded_inputs and encoded_inputs["attention_mask"] is not None:
|
||||
if max_length is not None and pad_to_multiple_of is not None and (
|
||||
max_length % pad_to_multiple_of != 0):
|
||||
max_length = ((max_length // pad_to_multiple_of) +
|
||||
1) * pad_to_multiple_of
|
||||
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(
|
||||
required_input) != max_length
|
||||
if "attention_mask" in encoded_inputs and encoded_inputs[
|
||||
"attention_mask"] is not None:
|
||||
attention_mask = encoded_inputs.pop("attention_mask")
|
||||
if isinstance(attention_mask, paddle.Tensor):
|
||||
attention_mask = attention_mask.numpy()
|
||||
elif isinstance(attention_mask, list):
|
||||
attention_mask = np.array(attention_mask)
|
||||
elif not isinstance(attention_mask, np.ndarray):
|
||||
raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ")
|
||||
raise ValueError(
|
||||
f"Unexpected type {type(attention_mask)} of attention_mask, "
|
||||
)
|
||||
else:
|
||||
attention_mask = np.tril(np.ones((len(required_input), len(required_input)), dtype=np.int64))
|
||||
attention_mask = np.tril(
|
||||
np.ones((len(required_input), len(required_input)),
|
||||
dtype=np.int64))
|
||||
attention_mask = np.expand_dims(attention_mask, axis=0)
|
||||
if needs_to_be_padded:
|
||||
difference = max_length - len(required_input)
|
||||
@@ -275,7 +294,8 @@ class ErnieVLTokenizer(PretrainedTokenizer):
|
||||
else:
|
||||
pad_width = [(0, 0), (difference, 0), (difference, 0)]
|
||||
else:
|
||||
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
|
||||
raise ValueError("Invalid padding strategy:" +
|
||||
str(self.padding_side))
|
||||
attention_mask = np.pad(
|
||||
attention_mask,
|
||||
pad_width=pad_width,
|
||||
@@ -342,7 +362,8 @@ def add_special_tokens(
|
||||
# check
|
||||
first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"]
|
||||
|
||||
assert first_special_tokens[0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
|
||||
assert first_special_tokens[
|
||||
0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}"
|
||||
assert (
|
||||
len(tokenizer.get_vocab()) < special_token_ids_end
|
||||
), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end} 增加过多special token了!"
|
||||
|
||||
@@ -78,13 +78,13 @@ class ImageMediaIO(MediaIO[Image.Image]):
|
||||
"""
|
||||
return self.load_bytes(base64.b64decode(data))
|
||||
|
||||
def load_file(self, filepath: Path) -> Image.Image:
|
||||
def load_file(self, filepath: str) -> Image.Image:
|
||||
"""
|
||||
加载文件,并转换为指定模式。
|
||||
如果文件不存在或无法打开,将抛出FileNotFoundError异常。
|
||||
|
||||
Args:
|
||||
filepath (Path): 文件路径(Pathlib.Path对象)。
|
||||
filepath (str): 文件路径。
|
||||
|
||||
Returns:
|
||||
Image.Image: 返回一个Image.Image对象,表示已经加载和转换的图像。
|
||||
|
||||
@@ -100,142 +100,72 @@ def sample_frames_from_video(frames: npt.NDArray,
|
||||
return sampled_frames
|
||||
|
||||
|
||||
class VideoMediaIO(MediaIO[npt.NDArray]):
|
||||
class VideoMediaIO(MediaIO[bytes]):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
image_io: ImageMediaIO,
|
||||
*,
|
||||
num_frames: int = 32,
|
||||
) -> None:
|
||||
def __init__(self) -> None:
|
||||
"""
|
||||
初始化一个 VideoMediaIO 对象。
|
||||
|
||||
Args:
|
||||
image_io (ImageMediaIO): 用于读取和写入图像的 ImageMediaIO 对象。
|
||||
num_frames (int, optional): 视频中帧数,默认为 32。
|
||||
ImageMediaIO 对象必须支持指定帧数。
|
||||
无。
|
||||
|
||||
Raises:
|
||||
TypeError: 如果 image_io 不是 ImageMediaIO 类型。
|
||||
ValueError: 如果 num_frames 小于等于 0。
|
||||
无。
|
||||
|
||||
Returns:
|
||||
None: 无返回值,直接初始化并设置属性。
|
||||
无。
|
||||
"""
|
||||
super().__init__()
|
||||
|
||||
self.image_io = image_io
|
||||
self.num_frames = num_frames
|
||||
|
||||
def load_bytes(self, data: bytes) -> npt.NDArray:
|
||||
def load_bytes(self, data: bytes) -> bytes:
|
||||
"""
|
||||
从字节数据加载视频帧,并返回一个 numpy ndarray。
|
||||
如果字节数据中的视频帧数量大于指定的 `num_frames`,则将其平均分布到这些帧上;否则,返回所有帧。
|
||||
ERNIE-45-VL模型的前处理中包含抽帧操作,如果将视频帧加载为npt.NDArray格式会丢失FPS信息,因此目前
|
||||
不对字节数据做任何操作。
|
||||
|
||||
Args:
|
||||
data (bytes): 包含视频帧数据的字节对象。
|
||||
|
||||
Returns:
|
||||
npt.NDArray, shape=(num_frames, height, width, channels): 返回一个 numpy ndarray,其中包含了视频帧数据。
|
||||
如果 `num_frames` 小于视频帧数量,则返回前 `num_frames` 帧;否则,返回所有帧。
|
||||
|
||||
Raises:
|
||||
None.
|
||||
"""
|
||||
import decord
|
||||
vr = decord.VideoReader(BytesIO(data), num_threads=1)
|
||||
total_frame_num = len(vr)
|
||||
|
||||
num_frames = self.num_frames
|
||||
if total_frame_num > num_frames:
|
||||
uniform_sampled_frames = np.linspace(0,
|
||||
total_frame_num - 1,
|
||||
num_frames,
|
||||
dtype=int)
|
||||
frame_idx = uniform_sampled_frames.tolist()
|
||||
else:
|
||||
frame_idx = list(range(0, total_frame_num))
|
||||
|
||||
return vr.get_batch(frame_idx).asnumpy()
|
||||
|
||||
def load_base64(self, media_type: str, data: str) -> npt.NDArray:
|
||||
"""
|
||||
加载 base64 编码的数据,并返回 numpy ndarray。
|
||||
|
||||
Args:
|
||||
media_type (str): 媒体类型,目前仅支持 "video/jpeg"。
|
||||
当为 "video/jpeg" 时,将解析每一帧的 base64 编码数据,并转换成 numpy ndarray。
|
||||
data (str): base64 编码的字符串数据。
|
||||
|
||||
Returns:
|
||||
npt.NDArray, optional: 如果 media_type 为 "video/jpeg",则返回 numpy ndarray 格式的视频数据;否则返回 None。
|
||||
|
||||
Raises:
|
||||
None.
|
||||
"""
|
||||
if media_type.lower() == "video/jpeg":
|
||||
load_frame = partial(
|
||||
self.image_io.load_base64,
|
||||
"image/jpeg",
|
||||
)
|
||||
|
||||
return np.stack([
|
||||
np.array(load_frame(frame_data))
|
||||
for frame_data in data.split(",")
|
||||
])
|
||||
|
||||
return self.load_bytes(base64.b64decode(data))
|
||||
|
||||
def load_file(self, filepath: Path) -> npt.NDArray:
|
||||
"""
|
||||
读取文件内容,并将其转换为numpy数组。
|
||||
|
||||
Args:
|
||||
filepath (Path): 文件路径对象,表示要读取的文件。
|
||||
|
||||
Returns:
|
||||
npt.NDArray, optional: 返回一个numpy数组,包含了文件内容。如果无法解析文件内容,则返回None。
|
||||
bytes,字节数据原样返回。
|
||||
|
||||
Raises:
|
||||
无。
|
||||
"""
|
||||
with filepath.open("rb") as f:
|
||||
data = f.read()
|
||||
return data
|
||||
|
||||
return self.load_bytes(data)
|
||||
|
||||
def encode_base64(
|
||||
self,
|
||||
media: npt.NDArray,
|
||||
*,
|
||||
video_format: str = "JPEG",
|
||||
) -> str:
|
||||
def load_base64(self, media_type: str, data: str) -> bytes:
|
||||
"""
|
||||
将视频编码为Base64字符串,每一帧都是一个Base64字符串。
|
||||
如果视频格式为"JPEG",则每一帧都会被转换成JPEG图片并进行编码。
|
||||
加载 base64 编码的数据,并返回bytes。
|
||||
|
||||
Args:
|
||||
media (npt.NDArray): 要编码的视频,形状为(H,W,C)或者(T,H,W,C),其中T为时间步长,H和W分别为高度和宽度,C为通道数。
|
||||
当前仅支持JPEG格式。
|
||||
video_format (str, optional, default="JPEG"): 视频格式,只支持"JPEG"。 Default to "JPEG".
|
||||
|
||||
Raises:
|
||||
NotImplementedError: 当前仅支持JPEG格式。
|
||||
media_type (str): 媒体类型,目前不支持 "video/jpeg"。
|
||||
data (str): base64 编码的字符串数据。
|
||||
|
||||
Returns:
|
||||
str: Base64字符串,每一帧都是一个Base64字符串,用","连接起来。
|
||||
bytes, optional: 如果 media_type 不为 "video/jpeg",则返回字节数据。
|
||||
|
||||
Raises:
|
||||
ValueError: 如果media_type是"video/jpeg"。
|
||||
"""
|
||||
video = media
|
||||
if media_type.lower() == "video/jpeg":
|
||||
raise ValueError("Video in JPEG format is not supported")
|
||||
|
||||
if video_format == "JPEG":
|
||||
encode_frame = partial(
|
||||
self.image_io.encode_base64,
|
||||
image_format=video_format,
|
||||
)
|
||||
return base64.b64decode(data)
|
||||
|
||||
return ",".join(
|
||||
encode_frame(Image.fromarray(frame)) for frame in video)
|
||||
def load_file(self, filepath: str) -> bytes:
|
||||
"""
|
||||
读取文件内容,并返回bytes。
|
||||
|
||||
Args:
|
||||
filepath (str): 文件路径,表示要读取的文件。
|
||||
|
||||
Returns:
|
||||
bytes, optional: 返回字节数据,包含了文件内容。
|
||||
|
||||
Raises:
|
||||
无。
|
||||
"""
|
||||
with open(filepath, "rb") as f:
|
||||
data = f.read()
|
||||
|
||||
msg = "Only JPEG format is supported for now."
|
||||
raise NotImplementedError(msg)
|
||||
return data
|
||||
|
||||
@@ -13,8 +13,11 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from fastdeploy.engine.config import ModelConfig
|
||||
from fastdeploy.reasoning import ReasoningParserManager
|
||||
|
||||
|
||||
class InputPreprocessor:
|
||||
"""
|
||||
@@ -24,6 +27,9 @@ class InputPreprocessor:
|
||||
key in the Hugging Face Transformers' model registry (https://huggingface.co/models).
|
||||
The model will be downloaded from the Hugging Face model hub if necessary.
|
||||
If a path is provided, the model will be loaded from that path.
|
||||
reasoning_parser (str, optional):
|
||||
Reasoning parser type. Defaults to None.
|
||||
Flag specifies the reasoning parser to use for extracting reasoning content from the model output
|
||||
enable_mm (bool, optional):
|
||||
Whether to use the multi-modal model processor. Defaults to False.
|
||||
|
||||
@@ -32,15 +38,21 @@ class InputPreprocessor:
|
||||
If the model name is not found in the Hugging Face Transformers' model registry and the path does not
|
||||
exist.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_name_or_path: str,
|
||||
reasoning_parser: str = None,
|
||||
limit_mm_per_prompt: Optional[Dict[str, Any]] = None,
|
||||
mm_processor_kwargs: Optional[Dict[str, Any]] = None,
|
||||
enable_mm: bool = False,
|
||||
) -> None:
|
||||
|
||||
self.model_name_or_path = model_name_or_path
|
||||
self.reasoning_parser = reasoning_parser
|
||||
self.enable_mm = enable_mm
|
||||
|
||||
self.limit_mm_per_prompt = limit_mm_per_prompt
|
||||
self.mm_processor_kwargs = mm_processor_kwargs
|
||||
|
||||
def create_processor(self):
|
||||
"""
|
||||
@@ -53,7 +65,33 @@ class InputPreprocessor:
|
||||
Returns:
|
||||
DataProcessor or MultiModalRegistry.Processor (Union[DataProcessor, MultiModalRegistry.Processor]): 数据处理器。
|
||||
"""
|
||||
reasoning_parser_obj = None
|
||||
if self.reasoning_parser:
|
||||
reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser(
|
||||
self.reasoning_parser)
|
||||
architectures = ModelConfig(self.model_name_or_path).architectures
|
||||
from fastdeploy.input.text_processor import DataProcessor
|
||||
self.processor = DataProcessor(model_name_or_path=self.model_name_or_path)
|
||||
if not self.enable_mm:
|
||||
if "Ernie4_5_MoeForCausalLM" not in architectures \
|
||||
and "Ernie4_5_ForCausalLM" not in architectures:
|
||||
from fastdeploy.input.text_processor import DataProcessor
|
||||
self.processor = DataProcessor(
|
||||
model_name_or_path=self.model_name_or_path, reasoning_parser_obj=reasoning_parser_obj)
|
||||
else:
|
||||
from fastdeploy.input.ernie_processor import ErnieProcessor
|
||||
self.processor = ErnieProcessor(
|
||||
model_name_or_path=self.model_name_or_path, reasoning_parser_obj=reasoning_parser_obj)
|
||||
else:
|
||||
if not architectures.startswith(
|
||||
"Ernie4_5_VLMoeForConditionalGeneration"):
|
||||
raise ValueError(
|
||||
f"Model {self.model_name_or_path} is not a valid Ernie4_5_VLMoe model."
|
||||
)
|
||||
else:
|
||||
from fastdeploy.input.ernie_vl_processor import \
|
||||
ErnieMoEVLProcessor
|
||||
self.processor = ErnieMoEVLProcessor(
|
||||
model_name_or_path=self.model_name_or_path,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
reasoning_parser_obj=reasoning_parser_obj)
|
||||
return self.processor
|
||||
|
||||
@@ -14,15 +14,16 @@
|
||||
# limitations under the License.
|
||||
"""
|
||||
|
||||
import os
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
import numpy as np
|
||||
from paddlenlp.generation import GenerationConfig
|
||||
from paddlenlp.transformers import Llama3Tokenizer, LlamaTokenizer
|
||||
from paddleformers.generation import GenerationConfig
|
||||
from paddleformers.transformers import Llama3Tokenizer, LlamaTokenizer
|
||||
|
||||
from fastdeploy import envs
|
||||
from fastdeploy.utils import data_processor_logger
|
||||
|
||||
_SAMPLING_EPS = 1e-5
|
||||
|
||||
class BaseDataProcessor(ABC):
|
||||
"""base class for data processor"""
|
||||
@@ -51,6 +52,27 @@ class BaseDataProcessor(ABC):
|
||||
f"mask_token is {self.tokenizer.mask_token}, {self.tokenizer.mask_token_id}"
|
||||
))
|
||||
|
||||
def _apply_default_parameters(self, request):
|
||||
"""
|
||||
Apply default value for parameters in request
|
||||
"""
|
||||
|
||||
def set_value(req, key, value):
|
||||
value = getattr(self.generation_config, key, value)
|
||||
if isinstance(req, dict):
|
||||
if key not in req:
|
||||
req[key] = value
|
||||
else:
|
||||
if req.get(key) is None:
|
||||
req.set(key, value)
|
||||
|
||||
set_value(request, "top_p", 0.7)
|
||||
set_value(request, "temperature", 1.0)
|
||||
set_value(request, "repetition_penalty", 1.0)
|
||||
set_value(request, "frequency_penalty", 0.0)
|
||||
set_value(request, "presence_penalty", 0.0)
|
||||
return request
|
||||
|
||||
@abstractmethod
|
||||
def process_request(self, request, **kwargs):
|
||||
"""
|
||||
@@ -129,7 +151,7 @@ class BaseDataProcessor(ABC):
|
||||
|
||||
class DataProcessor(BaseDataProcessor):
|
||||
|
||||
def __init__(self, model_name_or_path):
|
||||
def __init__(self, model_name_or_path, reasoning_parser_obj=None):
|
||||
"""
|
||||
Initializes the DecodeStatus object.
|
||||
|
||||
@@ -145,6 +167,7 @@ class DataProcessor(BaseDataProcessor):
|
||||
"""
|
||||
|
||||
self.model_name_or_path = model_name_or_path
|
||||
|
||||
self._init_config()
|
||||
|
||||
self.decode_status = dict()
|
||||
@@ -154,12 +177,15 @@ class DataProcessor(BaseDataProcessor):
|
||||
eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id} "
|
||||
)
|
||||
|
||||
from paddlenlp.trl.llm_utils import get_eos_token_id
|
||||
from paddleformers.trl.llm_utils import get_eos_token_id
|
||||
|
||||
self.eos_token_ids = get_eos_token_id(self.tokenizer,
|
||||
self.generation_config)
|
||||
self.eos_token_id_len = len(self.eos_token_ids)
|
||||
self.pad_token_id = self.get_pad_id()
|
||||
self.reasoning_parser = None
|
||||
if reasoning_parser_obj:
|
||||
self.reasoning_parser = reasoning_parser_obj(self.tokenizer)
|
||||
self.tokenizer.pad_token_id = self.pad_token_id
|
||||
|
||||
def _init_config(self):
|
||||
@@ -175,7 +201,7 @@ class DataProcessor(BaseDataProcessor):
|
||||
Raises:
|
||||
无异常抛出。
|
||||
"""
|
||||
self.use_hf_tokenizer = int(os.getenv("USE_HF_TOKENIZER", "0")) == 1
|
||||
self.use_hf_tokenizer = int(envs.FD_USE_HF_TOKENIZER) == 1
|
||||
|
||||
# Generation config
|
||||
try:
|
||||
@@ -187,7 +213,7 @@ class DataProcessor(BaseDataProcessor):
|
||||
)
|
||||
self.generation_config = None
|
||||
|
||||
def process_request(self, request, max_model_len=None):
|
||||
def process_request(self, request, max_model_len=None, **kwargs):
|
||||
"""
|
||||
Preprocess the request
|
||||
|
||||
@@ -198,6 +224,7 @@ class DataProcessor(BaseDataProcessor):
|
||||
bool: Whether preprocessing is successful
|
||||
str: error message
|
||||
"""
|
||||
request = self._apply_default_parameters(request)
|
||||
if request.get("eos_token_ids") is None or len(
|
||||
request.eos_token_ids) == 0:
|
||||
request.eos_token_ids = self.eos_token_ids
|
||||
@@ -217,20 +244,23 @@ class DataProcessor(BaseDataProcessor):
|
||||
if self.tokenizer.chat_template is None:
|
||||
raise ValueError(
|
||||
"This model does not support chat_template.")
|
||||
request.prompt_token_ids = self.messages2ids(request.messages)
|
||||
task = request.to_dict()
|
||||
task['enable_thinking'] = kwargs.get("enable_thinking", True)
|
||||
request.prompt_token_ids = self.messages2ids(task)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"The request should have `input_ids`, `text` or `messages`: {request}."
|
||||
)
|
||||
|
||||
if max_model_len is not None and len(
|
||||
request.prompt_token_ids) > max_model_len:
|
||||
request.prompt_token_ids = request.prompt_token_ids[:
|
||||
max_model_len -
|
||||
1]
|
||||
if request.get("max_tokens") is None:
|
||||
request.set("max_tokens",
|
||||
max(1, max_model_len - len(request.prompt_token_ids)))
|
||||
if request.get("temperature") < _SAMPLING_EPS:
|
||||
# zero temperature is equivalent to greedy sampling
|
||||
request.set("temperature", 1)
|
||||
data_processor_logger.info(f"Processed request {request}")
|
||||
return request
|
||||
|
||||
def process_request_dict(self, request, max_model_len=None):
|
||||
def process_request_dict(self, request, max_model_len=None, **kwargs):
|
||||
"""
|
||||
Preprocess the request
|
||||
|
||||
@@ -241,6 +271,7 @@ class DataProcessor(BaseDataProcessor):
|
||||
bool: Whether preprocessing is successful
|
||||
str: error message
|
||||
"""
|
||||
request = self._apply_default_parameters(request)
|
||||
if not request.get('eos_token_ids'):
|
||||
request['eos_token_ids'] = self.eos_token_ids
|
||||
|
||||
@@ -251,6 +282,7 @@ class DataProcessor(BaseDataProcessor):
|
||||
request['stop_token_ids'] = stop_seqs
|
||||
request['stop_seqs_len'] = stop_seqs_len
|
||||
|
||||
data_processor_logger.info(f"Processing request {request}")
|
||||
# 处理prompt_token_ids
|
||||
if not request.get('prompt_token_ids'):
|
||||
if 'prompt' in request:
|
||||
@@ -261,19 +293,19 @@ class DataProcessor(BaseDataProcessor):
|
||||
if self.tokenizer.chat_template is None:
|
||||
raise ValueError(
|
||||
"This model does not support chat_template.")
|
||||
request['prompt_token_ids'] = self.messages2ids(
|
||||
request['messages']).tolist()
|
||||
request['prompt_token_ids'] = self.messages2ids(request)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}"
|
||||
)
|
||||
|
||||
# 截断超过长度限制的prompt
|
||||
if max_model_len is not None and len(
|
||||
request['prompt_token_ids']) > max_model_len:
|
||||
request['prompt_token_ids'] = request[
|
||||
'prompt_token_ids'][:max_model_len - 1]
|
||||
|
||||
if request.get("max_tokens") is None:
|
||||
request["max_tokens"] = max(
|
||||
1, max_model_len - len(request['prompt_token_ids']))
|
||||
if request.get("temperature") < _SAMPLING_EPS:
|
||||
# zero temperature is equivalent to greedy sampling
|
||||
request["temperature"] = 1
|
||||
data_processor_logger.info(f"Processed request {request}")
|
||||
return request
|
||||
|
||||
def process_response(self, response_dict, **kwargs):
|
||||
@@ -286,24 +318,26 @@ class DataProcessor(BaseDataProcessor):
|
||||
Returns:
|
||||
Dict: response contain text fields
|
||||
"""
|
||||
is_end = response_dict.finished
|
||||
req_id = response_dict.request_id
|
||||
|
||||
token_ids = response_dict.outputs.token_ids
|
||||
response_dict.outputs.text = self.ids2tokens(token_ids, req_id)
|
||||
response_dict.usage = {
|
||||
"completion_tokens": response_dict.outputs.index + 1
|
||||
}
|
||||
if token_ids[-1] == self.tokenizer.eos_token_id:
|
||||
token_ids = token_ids[:-1]
|
||||
full_text = self.tokenizer.decode(token_ids)
|
||||
|
||||
# 模型支持思考,并且支持思考
|
||||
if self.reasoning_parser:
|
||||
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
|
||||
full_text, response_dict)
|
||||
response_dict.outputs.text = text
|
||||
response_dict.outputs.reasoning_content = reasoning_content
|
||||
else:
|
||||
# 模型不支持思考,并且没单独设置enable_thinking为false
|
||||
response_dict.outputs.text = full_text
|
||||
data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
|
||||
|
||||
if is_end:
|
||||
self.clear_request_status(req_id)
|
||||
data_processor_logger.debug(
|
||||
"Request id: {} has been completed.".format(token_ids))
|
||||
response_dict.outputs.text = self.ids2tokens(token_ids, req_id)
|
||||
self.clear_request_status(req_id)
|
||||
return response_dict
|
||||
|
||||
def process_response_dict(self, response_dict, stream=True):
|
||||
def process_response_dict_normal(self, response_dict, **kwargs):
|
||||
"""
|
||||
Preprocess the response
|
||||
|
||||
@@ -313,24 +347,86 @@ class DataProcessor(BaseDataProcessor):
|
||||
Returns:
|
||||
Dict: response contain text fields
|
||||
"""
|
||||
token_ids = response_dict["outputs"]["token_ids"]
|
||||
is_end = response_dict["finished"]
|
||||
req_id = response_dict["request_id"]
|
||||
if is_end and len(token_ids) > 0:
|
||||
if token_ids[-1] == self.tokenizer.eos_token_id:
|
||||
token_ids = token_ids[:-1]
|
||||
delta_text, _, previous_texts = self.ids2tokens(token_ids, req_id)
|
||||
if is_end:
|
||||
full_text = previous_texts + delta_text
|
||||
if self.reasoning_parser:
|
||||
reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
|
||||
full_text, response_dict)
|
||||
response_dict["outputs"]["text"] = text
|
||||
response_dict["outputs"][
|
||||
"reasoning_content"] = reasoning_content
|
||||
else:
|
||||
response_dict["outputs"]["text"] = full_text
|
||||
data_processor_logger.info(
|
||||
f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}"
|
||||
)
|
||||
del self.decode_status[req_id]
|
||||
return response_dict
|
||||
|
||||
def process_response_dict_streaming(self, response_dict, **kwargs):
|
||||
"""
|
||||
Preprocess the response
|
||||
|
||||
Args:
|
||||
response_dict (Dict): response for engine, contain ids fields
|
||||
|
||||
Returns:
|
||||
Dict: response contain text fields
|
||||
"""
|
||||
enable_thinking = kwargs.get("enable_thinking")
|
||||
is_end = response_dict["finished"]
|
||||
req_id = response_dict["request_id"]
|
||||
token_ids = response_dict["outputs"]["token_ids"]
|
||||
|
||||
if is_end:
|
||||
data_processor_logger.debug(
|
||||
"Request id: {} has been completed.".format(token_ids))
|
||||
full_text = self.clear_request_status(req_id)
|
||||
if not stream:
|
||||
response_dict["outputs"]["text"] = full_text
|
||||
else:
|
||||
response_dict["outputs"]["text"] = ""
|
||||
if is_end and len(token_ids) > 0:
|
||||
if token_ids[-1] == self.tokenizer.eos_token_id:
|
||||
token_ids = token_ids[:-1]
|
||||
delta_text, previous_token_ids, previous_texts = self.ids2tokens(
|
||||
token_ids, req_id)
|
||||
|
||||
if enable_thinking and self.reasoning_parser:
|
||||
reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming(
|
||||
previous_texts, previous_texts + delta_text, delta_text,
|
||||
previous_token_ids, previous_token_ids + token_ids, token_ids)
|
||||
response_dict["outputs"]["text"] = text
|
||||
response_dict["outputs"]["reasoning_content"] = reasoning_content
|
||||
else:
|
||||
response_dict["outputs"]["text"] = self.ids2tokens(
|
||||
token_ids, req_id)
|
||||
response_dict["outputs"]["text"] = delta_text
|
||||
if is_end:
|
||||
data_processor_logger.info(
|
||||
f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}"
|
||||
)
|
||||
del self.decode_status[req_id]
|
||||
return response_dict
|
||||
|
||||
def process_response_dict(self, response_dict, **kwargs):
|
||||
"""
|
||||
Preprocess the response
|
||||
|
||||
Args:
|
||||
response_dict (Dict): response for engine, contain ids fields
|
||||
|
||||
Returns:
|
||||
Dict: response contain text fields
|
||||
"""
|
||||
enable_thinking = kwargs.pop("enable_thinking", True)
|
||||
if enable_thinking is None:
|
||||
enable_thinking = True
|
||||
stream = kwargs.get("stream", True)
|
||||
if stream:
|
||||
return self.process_response_dict_streaming(
|
||||
response_dict, enable_thinking=enable_thinking, **kwargs)
|
||||
else:
|
||||
return self.process_response_dict_normal(
|
||||
response_dict=response_dict, enable_thinking=enable_thinking)
|
||||
|
||||
def text2ids(self, text, max_model_len, raw_request=True):
|
||||
"""
|
||||
text to token ids
|
||||
@@ -349,28 +445,20 @@ class DataProcessor(BaseDataProcessor):
|
||||
truncation=True,
|
||||
)
|
||||
else:
|
||||
if not raw_request or self.tokenizer.chat_template is None:
|
||||
text = [text] if isinstance(text, str) else text
|
||||
chat_template = False
|
||||
elif self.tokenizer.chat_template is not None:
|
||||
text = [text] if isinstance(text, str) else text
|
||||
text = [
|
||||
self.tokenizer.apply_chat_template(sentence,
|
||||
tokenize=False)
|
||||
for sentence in text
|
||||
]
|
||||
chat_template = True
|
||||
text = [text] if isinstance(text, str) else text
|
||||
|
||||
tokens = self.tokenizer(
|
||||
text,
|
||||
return_tensors="np",
|
||||
padding=True,
|
||||
truncation=True,
|
||||
max_length=max_model_len,
|
||||
add_special_tokens=chat_template,
|
||||
add_special_tokens=False,
|
||||
)
|
||||
|
||||
return tokens["input_ids"][0]
|
||||
|
||||
def messages2ids(self, messages):
|
||||
def messages2ids(self, request):
|
||||
"""
|
||||
Convert multi-turn messages into ID sequences.
|
||||
|
||||
@@ -380,9 +468,21 @@ class DataProcessor(BaseDataProcessor):
|
||||
Returns:
|
||||
List[int]: ID sequences
|
||||
"""
|
||||
message_result = self.tokenizer.apply_chat_template(
|
||||
messages, return_tensors="pd")
|
||||
return np.array(message_result["input_ids"][0])
|
||||
|
||||
spliced_message = self.tokenizer.apply_chat_template(
|
||||
request,
|
||||
tokenize=False,
|
||||
split_special_tokens=False,
|
||||
add_special_tokens=False,
|
||||
return_tensors="pd")
|
||||
req_id = None
|
||||
tokens = self.tokenizer.tokenize(spliced_message)
|
||||
if isinstance(request, dict):
|
||||
req_id = request.get("request_id", None)
|
||||
token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
|
||||
data_processor_logger.info(
|
||||
f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
|
||||
return token_ids
|
||||
|
||||
def ids2tokens(self, token_id, task_id):
|
||||
"""
|
||||
@@ -417,18 +517,20 @@ class DataProcessor(BaseDataProcessor):
|
||||
else:
|
||||
if task_id not in self.decode_status:
|
||||
# prefix offset & read offset & history token ids & history token strings
|
||||
self.decode_status[task_id] = [0, 0, [], []]
|
||||
self.decode_status[task_id] = [0, 0, [], ""]
|
||||
|
||||
prefix_offset = self.decode_status[task_id][0]
|
||||
read_offset = self.decode_status[task_id][1]
|
||||
previous_token_ids = self.decode_status[task_id][2]
|
||||
previous_texts = self.decode_status[task_id][3]
|
||||
decode_str, prefix_offset, read_offset = self.tokenizer.decode_token(
|
||||
previous_token_ids + token_id, prefix_offset, read_offset)
|
||||
self.decode_status[task_id][0] = prefix_offset
|
||||
self.decode_status[task_id][1] = read_offset
|
||||
self.decode_status[task_id][2] += token_id
|
||||
self.decode_status[task_id][3].append(decode_str)
|
||||
return decode_str
|
||||
self.decode_status[task_id][3] += decode_str
|
||||
|
||||
return decode_str, previous_token_ids, previous_texts
|
||||
|
||||
def _load_tokenizer(self):
|
||||
"""
|
||||
@@ -437,13 +539,12 @@ class DataProcessor(BaseDataProcessor):
|
||||
Returns:
|
||||
tokenizer (AutoTokenizer)
|
||||
"""
|
||||
|
||||
if self.use_hf_tokenizer:
|
||||
from transformers import AutoTokenizer
|
||||
return AutoTokenizer.from_pretrained(self.model_name_or_path,
|
||||
use_fast=False)
|
||||
else:
|
||||
from paddlenlp.transformers import AutoTokenizer
|
||||
from paddleformers.transformers import AutoTokenizer
|
||||
return AutoTokenizer.from_pretrained(self.model_name_or_path,
|
||||
padding_side="left",
|
||||
use_fast=True)
|
||||
|
||||
Reference in New Issue
Block a user