Add sentencepiece processor

parent 7c8ee5aec5
commit 902075752a

1 changed file with 6 additions and 4 deletions
@@ -2,6 +2,7 @@ import json
 import os
 import tempfile
 import sys
+import sentencepiece as spm
 from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

 if len(sys.argv) < 3:
@@ -51,10 +52,11 @@ def load_tokenizer_from_json(json_path, special_tokens_map_path, tokenizer_confi
         tokenizer.decoder = decoders.WordPiece()

     elif tokenizer_type == "SentencePiece":
-        tokenizer = Tokenizer(models.SentencePiece.from_file(vocab_file.name))
-        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
-        tokenizer.decoder = decoders.SentencePiece()
+        sp_model = spm.SentencePieceProcessor()
+        sp_model.Load(vocab_file.name)
+        tokenizer = Tokenizer(models.Model.from_sentencepiece(sp_model))
+        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([pre_tokenizers.Metaspace(), pre_tokenizers.Split()])
+        tokenizer.decoder = decoders.Sequence([decoders.Split(), decoders.Metaspace()])

     else:
         raise ValueError(f"Unsupported tokenizer type: {tokenizer_type}")
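For context, a minimal sketch of the same idea using only documented sentencepiece and tokenizers APIs: load the trained SentencePiece model, copy each piece and its score into a Unigram model, and let Metaspace handle the whitespace marker. This is not the code introduced by this commit (the commit builds the model via models.Model.from_sentencepiece); the model path and the unk_id=0 choice below are assumptions for illustration.

# Sketch only: rebuild a tokenizers.Tokenizer from a trained SentencePiece
# model by copying its vocabulary into a Unigram model. The model path and
# unk_id=0 (SentencePiece's default <unk> position) are assumptions.
import sentencepiece as spm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders

sp_model = spm.SentencePieceProcessor()
sp_model.Load("spiece.model")  # hypothetical path to the trained model

# Each entry is (piece, log-probability), which is what models.Unigram expects.
vocab = [(sp_model.IdToPiece(i), sp_model.GetScore(i))
         for i in range(sp_model.GetPieceSize())]

tokenizer = Tokenizer(models.Unigram(vocab, unk_id=0))
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
tokenizer.decoder = decoders.Metaspace()

print(tokenizer.encode("Hello world").tokens)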