convert : update HF converter to new tokenizer voodoo magics
commit e4b8f94d6b
parent 95d7593e4a

1 changed file with 5 additions and 13 deletions
|
@@ -122,19 +122,11 @@ if Path(dir_model + "/tokenizer.model").is_file():
     for i in range(tokenizer.vocab_size()):
         text: bytes
-        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
-        elif tokenizer.is_control(i):
-            text = b""
-        if tokenizer.is_byte(i):
-            piece = tokenizer.id_to_piece(i)
-            if len(piece) != 6:
-                raise Exception(f"Invalid token: {piece}")
-            byte_value = int(piece[3:-1], 16)
-            text = struct.pack("B", byte_value)
-        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
-        score: float = tokenizer.get_score(i)
+        score: float
+
+        piece = tokenizer.id_to_piece(i)
+        text = piece.encode("utf-8")
+        score = tokenizer.get_score(i)

         tokens.append(text)
         scores.append(score)
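For context, the new loop reduces to a straight dump of the SentencePiece vocabulary. Below is a minimal, self-contained sketch of that loop, assuming a SentencePiece model on disk; the model path and the tokens/scores lists are placeholders that mirror the names in the diff, not the full converter script.

from sentencepiece import SentencePieceProcessor

# Placeholder path; the real converter builds this from dir_model.
tokenizer = SentencePieceProcessor(model_file="tokenizer.model")

tokens: list[bytes] = []
scores: list[float] = []

for i in range(tokenizer.vocab_size()):
    text: bytes
    score: float

    piece = tokenizer.id_to_piece(i)   # raw piece, e.g. "▁Hello", "<s>", or "<0x0A>"
    text = piece.encode("utf-8")       # stored verbatim, no special-casing
    score = tokenizer.get_score(i)

    tokens.append(text)
    scores.append(score)

The deleted branch handled SentencePiece byte-fallback tokens, whose pieces are six-character strings of the form "<0xNN>". The sketch below reproduces that decoding on a literal piece string, purely to illustrate what int(piece[3:-1], 16) and struct.pack("B", ...) were doing; it is not part of the converter.

import struct

def decode_byte_piece(piece: str) -> bytes:
    # Byte-fallback pieces look like "<0x0A>": "<0x", two hex digits, ">".
    if len(piece) != 6:
        raise Exception(f"Invalid token: {piece}")
    byte_value = int(piece[3:-1], 16)    # "0A" -> 10
    return struct.pack("B", byte_value)  # 10   -> b"\n"

print(decode_byte_piece("<0x0A>"))  # b'\n'

After this commit the converter no longer special-cases unknown, control or byte tokens, and it also drops the replacement of the U+2581 word-boundary marker with a space; it writes the raw piece bytes and scores, and, as the commit message hints, the remaining token handling presumably happens downstream in whatever consumes the exported vocabulary.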