convert : update HF converter to new tokenizer voodoo magics

This commit is contained in:
Georgi Gerganov 2023-08-14 21:31:02 +03:00
parent 95d7593e4a
commit e4b8f94d6b
No known key found for this signature in database
GPG key ID: 449E073F9DC10735

View file

@@ -122,19 +122,11 @@ if Path(dir_model + "/tokenizer.model").is_file():
for i in range(tokenizer.vocab_size()): for i in range(tokenizer.vocab_size()):
text: bytes text: bytes
if tokenizer.is_unknown(i): score: float
text = " \u2047 ".encode("utf-8")
elif tokenizer.is_control(i):
text = b""
if tokenizer.is_byte(i):
piece = tokenizer.id_to_piece(i) piece = tokenizer.id_to_piece(i)
if len(piece) != 6: text = piece.encode("utf-8")
raise Exception(f"Invalid token: {piece}") score = tokenizer.get_score(i)
byte_value = int(piece[3:-1], 16)
text = struct.pack("B", byte_value)
else:
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
score: float = tokenizer.get_score(i)
tokens.append(text) tokens.append(text)
scores.append(score) scores.append(score)