mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2026-04-23 00:17:25 +08:00
[Bug Fix] fix tokenizer oom (#6287)
* fix tokenizer oom * fix unit test
This commit is contained in:
@@ -480,7 +480,8 @@ class DataProcessor(MMBaseDataProcessor):
|
||||
|
||||
def _add_text(self, tokens, outputs: Dict) -> None:
|
||||
if isinstance(tokens, str):
|
||||
tokens = self.tokenizer.encode(tokens, add_special_tokens=False)["input_ids"]
|
||||
tokens = self.tokenizer.tokenize(tokens)
|
||||
tokens = self.tokenizer.convert_tokens_to_ids(tokens)
|
||||
outputs["input_ids"].extend(tokens)
|
||||
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * len(tokens))
|
||||
|
||||
|
||||
@@ -441,8 +441,6 @@ class TestDataProcessorTargetMethods(unittest.TestCase):
|
||||
self.mock_tokenizer.convert_tokens_to_ids.side_effect = self._mock_convert_tokens_to_ids
|
||||
self.mock_tokenizer.chat_template = "mock_template"
|
||||
self.mock_tokenizer.apply_chat_template.return_value = "User: Hello<|image@placeholder|>"
|
||||
# Mock encode method for _add_text
|
||||
self.mock_tokenizer.encode = MagicMock(return_value={"input_ids": [1, 2, 3]})
|
||||
|
||||
def mock_load_tokenizer(dp_instance):
|
||||
dp_instance.tokenizer = self.mock_tokenizer
|
||||
@@ -1004,8 +1002,8 @@ class TestDataProcessor(unittest.TestCase):
|
||||
"""Test adding text and special tokens"""
|
||||
outputs = self._create_outputs()
|
||||
self.processor._add_text("hello", outputs)
|
||||
self.assertEqual(len(outputs["input_ids"]), 3)
|
||||
self.assertEqual(outputs["cur_position"], 3)
|
||||
self.assertEqual(len(outputs["input_ids"]), 2)
|
||||
self.assertEqual(outputs["cur_position"], 2)
|
||||
|
||||
outputs2 = self._create_outputs()
|
||||
self.processor._add_text([1, 2, 3, 4, 5], outputs2)
|
||||
|
||||
Reference in New Issue
Block a user