Add sentencepiece processor

aeslampanah 2023-04-13 07:58:45 -04:00
parent 7c8ee5aec5
commit 902075752a


@@ -2,6 +2,7 @@ import json
 import os
 import tempfile
 import sys
+import sentencepiece as spm
 from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
 
 if len(sys.argv) < 3:
@@ -51,10 +52,11 @@ def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_confi
         tokenizer.decoder = decoders.WordPiece()
     elif tokenizer_type == "SentencePiece":
-        tokenizer = Tokenizer(models.SentencePiece.from_file(vocab_file.name))
-        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
-        tokenizer.decoder = decoders.SentencePiece()
+        sp_model = spm.SentencePieceProcessor()
+        sp_model.Load(vocab_file.name)
+        tokenizer = Tokenizer(models.Model.from_sentencepiece(sp_model))
+        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([pre_tokenizers.Metaspace(), pre_tokenizers.Split()])
+        tokenizer.decoder = decoders.Sequence([decoders.Split(), decoders.Metaspace()])
     else:
         raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}")
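For reference, a minimal standalone sketch of the `spm.SentencePieceProcessor` usage the new branch relies on, assuming an already-trained SentencePiece model file (the `tokenizer.model` path below is hypothetical, not part of this commit):

```python
import sentencepiece as spm

# Load a trained SentencePiece model file (hypothetical path).
sp = spm.SentencePieceProcessor()
sp.Load("tokenizer.model")

# Encode a sample string into subword pieces and their ids.
pieces = sp.EncodeAsPieces("Hello world")
ids = sp.EncodeAsIds("Hello world")
print(pieces)
print(ids)
```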