convert-hf : Corrected sentencepiece API calls.

This commit is contained in:
Stanisław Szymczyk 2024-05-14 20:52:51 +02:00
parent 9acc3ecf34
commit f4421f7cd8

View file

@@ -2294,7 +2294,8 @@ class ArcticModel(Model):
sys.exit(1) sys.exit(1)
# Read the whole vocabulary from the tokenizer.model file # Read the whole vocabulary from the tokenizer.model file
tokenizer = SentencePieceProcessor(str(tokenizer_path)) tokenizer = SentencePieceProcessor()
tokenizer.LoadFromFile(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
@@ -2304,18 +2305,18 @@ class ArcticModel(Model):
for token_id in range(tokenizer.vocab_size()): for token_id in range(tokenizer.vocab_size()):
piece = tokenizer.id_to_piece(token_id) piece = tokenizer.IdToPiece(token_id)
text = piece.encode("utf-8") text = piece.encode("utf-8")
score = tokenizer.get_score(token_id) score = tokenizer.GetScore(token_id)
toktype = SentencePieceTokenTypes.NORMAL toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.is_unknown(token_id): if tokenizer.IsUnknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.is_control(token_id): elif tokenizer.IsControl(token_id):
toktype = SentencePieceTokenTypes.CONTROL toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.is_unused(token_id): elif tokenizer.IsUnused(token_id):
toktype = SentencePieceTokenTypes.UNUSED toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.is_byte(token_id): elif tokenizer.IsByte(token_id):
toktype = SentencePieceTokenTypes.BYTE toktype = SentencePieceTokenTypes.BYTE
tokens[token_id] = text tokens[token_id] = text