[Bug Fix] Fix tokenizer out-of-memory (OOM) error (#6287)

* Fix tokenizer OOM

* fix unit test
This commit is contained in:
ApplEOFDiscord
2026-02-03 11:27:11 +08:00
committed by GitHub
parent fb374238e1
commit 6563b8307c
2 changed files with 4 additions and 5 deletions
@@ -480,7 +480,8 @@ class DataProcessor(MMBaseDataProcessor):
def _add_text(self, tokens, outputs: Dict) -> None:
if isinstance(tokens, str):
tokens = self.tokenizer.encode(tokens, add_special_tokens=False)["input_ids"]
tokens = self.tokenizer.tokenize(tokens)
tokens = self.tokenizer.convert_tokens_to_ids(tokens)
outputs["input_ids"].extend(tokens)
outputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * len(tokens))