From e4b8f94d6bd95beb76caf5dc5811ebab8e43d81d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 14 Aug 2023 21:31:02 +0300 Subject: [PATCH] convert : update HF converter to new tokenizer voodoo magics --- convert-llama-h5-to-gguf.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/convert-llama-h5-to-gguf.py b/convert-llama-h5-to-gguf.py index d3d29916d..9d91b433b 100644 --- a/convert-llama-h5-to-gguf.py +++ b/convert-llama-h5-to-gguf.py @@ -122,19 +122,11 @@ if Path(dir_model + "/tokenizer.model").is_file(): for i in range(tokenizer.vocab_size()): text: bytes - if tokenizer.is_unknown(i): - text = " \u2047 ".encode("utf-8") - elif tokenizer.is_control(i): - text = b"" - if tokenizer.is_byte(i): - piece = tokenizer.id_to_piece(i) - if len(piece) != 6: - raise Exception(f"Invalid token: {piece}") - byte_value = int(piece[3:-1], 16) - text = struct.pack("B", byte_value) - else: - text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8") - score: float = tokenizer.get_score(i) + score: float + + piece = tokenizer.id_to_piece(i) + text = piece.encode("utf-8") + score = tokenizer.get_score(i) tokens.append(text) scores.append(score)