[Bug Fix] fix tokenizer oom (#6287)

* fix tokenizer oom

* fix unit test
This commit is contained in:
ApplEOFDiscord
2026-02-03 11:27:11 +08:00
committed by GitHub
parent fb374238e1
commit 6563b8307c
2 changed files with 4 additions and 5 deletions
@@ -480,7 +480,8 @@ class DataProcessor(MMBaseDataProcessor):
def _add_text(self, tokens, outputs: Dict) -> None:
if isinstance(tokens, str):
-tokens = self.tokenizer.encode(tokens, add_special_tokens=False)["input_ids"]
+tokens = self.tokenizer.tokenize(tokens)
+tokens = self.tokenizer.convert_tokens_to_ids(tokens)
outputs["input_ids"].extend(tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * len(tokens))
+2 -4
View File
@@ -441,8 +441,6 @@ class TestDataProcessorTargetMethods(unittest.TestCase):
self.mock_tokenizer.convert_tokens_to_ids.side_effect = self._mock_convert_tokens_to_ids
self.mock_tokenizer.chat_template = "mock_template"
self.mock_tokenizer.apply_chat_template.return_value = "User: Hello<|image@placeholder|>"
-# Mock encode method for _add_text
-self.mock_tokenizer.encode = MagicMock(return_value={"input_ids": [1, 2, 3]})
def mock_load_tokenizer(dp_instance):
dp_instance.tokenizer = self.mock_tokenizer
@@ -1004,8 +1002,8 @@ class TestDataProcessor(unittest.TestCase):
"""Test adding text and special tokens"""
outputs = self._create_outputs()
self.processor._add_text("hello", outputs)
-self.assertEqual(len(outputs["input_ids"]), 3)
-self.assertEqual(outputs["cur_position"], 3)
+self.assertEqual(len(outputs["input_ids"]), 2)
+self.assertEqual(outputs["cur_position"], 2)
outputs2 = self._create_outputs()
self.processor._add_text([1, 2, 3, 4, 5], outputs2)