chore: Fix import path, token comparisons, and update token type references

parent 9dbc9571a3
commit aa28cfe6ec

1 changed file with 14 additions and 12 deletions
@@ -31,7 +31,7 @@ if TYPE_CHECKING:
     from torch import Tensor

 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+    sys.path.insert(1, str(Path('gguf-py')))
 import gguf

 from convert import LlamaHfVocab
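The two path expressions in this hunk resolve differently at runtime: `Path(__file__).parent / 'gguf-py'` is anchored to the directory containing the script, while the bare `Path('gguf-py')` is resolved against the process's current working directory. A quick sketch of the difference (directory names are illustrative):

```python
# Quick sketch of how the two expressions resolve; directory names are illustrative.
import sys
from pathlib import Path

script_relative = Path(__file__).parent / 'gguf-py'   # located next to this script
cwd_relative = Path('gguf-py')                        # resolved against the current working directory

print(script_relative.resolve())   # e.g. /path/to/repo/gguf-py
print(cwd_relative.resolve())      # e.g. /wherever/python/was/launched/gguf-py

# The CWD-relative form only finds gguf-py when the converter is launched from the repo root.
sys.path.insert(1, str(cwd_relative))
```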
@@ -375,10 +375,13 @@ class Model:
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert tokenizer.vocab_size == vocab_size
+        assert max(tokenizer.vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()

         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
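The relaxed comparison above only requires every token id to fit below `vocab_size`, so a tokenizer vocabulary smaller than the hparams value is allowed and the missing ids are filled with `[PAD{i}]` placeholders. A toy illustration of that filling (the vocabulary here is made up):

```python
# Toy illustration of the [PAD{i}] filling; this tiny vocabulary is made up.
vocab = {"a": 0, "b": 1, "c": 2}            # stand-in for tokenizer.vocab
vocab_size = 5                              # stand-in for hparams["vocab_size"]
assert max(vocab.values()) < vocab_size     # 2 < 5, so the check passes

reverse_vocab = {id_: tok for tok, id_ in vocab.items()}
tokens = [reverse_vocab.get(i, f"[PAD{i}]") for i in range(vocab_size)]
print(tokens)                               # ['a', 'b', 'c', '[PAD3]', '[PAD4]']
```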
@@ -393,7 +396,6 @@ class Model:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)

-        tokpre = self.get_vocab_base_pre(tokenizer)
         return tokens, toktypes, tokpre

     # NOTE: this function is generated by convert-hf-to-gguf-update.py
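For orientation, the `(tokens, toktypes, tokpre)` triple returned above typically ends up in the GGUF vocabulary metadata. The glue below is a hypothetical sketch using gguf-py's GGUFWriter helpers, not code from this diff, and the function name is invented:

```python
# Hypothetical glue (not from this diff): write the vocab triple into GGUF metadata.
import gguf

def write_bpe_vocab(writer: gguf.GGUFWriter, tokens: list[str], toktypes: list[int], tokpre: str) -> None:
    writer.add_tokenizer_model("gpt2")   # BPE-style tokenizer
    writer.add_tokenizer_pre(tokpre)     # pre-tokenizer identifier returned with the vocab
    writer.add_token_list(tokens)
    writer.add_token_types(toktypes)
```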
@@ -2439,7 +2441,7 @@ class ArcticModel(Model):

         tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
         scores: list[float] = [-10000.0] * vocab_size
-        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+        toktypes: list[int] = [gguf.TokenType.UNKNOWN] * vocab_size

         for token_id in range(tokenizer.vocab_size()):

@@ -2447,15 +2449,15 @@ class ArcticModel(Model):
             text = piece.encode("utf-8")
             score = tokenizer.GetScore(token_id)

-            toktype = SentencePieceTokenTypes.NORMAL
+            toktype = gguf.TokenType.NORMAL
             if tokenizer.IsUnknown(token_id):
-                toktype = SentencePieceTokenTypes.UNKNOWN
+                toktype = gguf.TokenType.UNKNOWN
             elif tokenizer.IsControl(token_id):
-                toktype = SentencePieceTokenTypes.CONTROL
+                toktype = gguf.TokenType.CONTROL
             elif tokenizer.IsUnused(token_id):
-                toktype = SentencePieceTokenTypes.UNUSED
+                toktype = gguf.TokenType.UNUSED
             elif tokenizer.IsByte(token_id):
-                toktype = SentencePieceTokenTypes.BYTE
+                toktype = gguf.TokenType.BYTE

             tokens[token_id] = text
             scores[token_id] = score
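The per-token mapping above can be read as a single function from SentencePiece piece flags to gguf-py token types. A minimal standalone sketch of that classification, assuming a SentencePieceProcessor loaded from a local model file:

```python
# Minimal standalone sketch of the flag -> token-type mapping shown above.
from sentencepiece import SentencePieceProcessor
import gguf

def sp_token_type(tokenizer: SentencePieceProcessor, token_id: int) -> gguf.TokenType:
    if tokenizer.IsUnknown(token_id):
        return gguf.TokenType.UNKNOWN
    if tokenizer.IsControl(token_id):
        return gguf.TokenType.CONTROL
    if tokenizer.IsUnused(token_id):
        return gguf.TokenType.UNUSED
    if tokenizer.IsByte(token_id):
        return gguf.TokenType.BYTE
    return gguf.TokenType.NORMAL

# Usage (model path is illustrative):
# tokenizer = SentencePieceProcessor(model_file="tokenizer.model")
# toktypes = [sp_token_type(tokenizer, i) for i in range(tokenizer.vocab_size())]
```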
@@ -2477,16 +2479,16 @@ class ArcticModel(Model):
                             continue

                         token_content = token_json["content"]
-                        token_type = SentencePieceTokenTypes.USER_DEFINED
+                        token_type = gguf.TokenType.USER_DEFINED
                         token_score = -10000.0

                         # Map unk_token to UNKNOWN, other special tokens to CONTROL
                         # Set the score to 0.0 as in the original tokenizer.model
                         if ("special" in token_json) and token_json["special"]:
                             if token_content == tokenizer_config_json["unk_token"]:
-                                token_type = SentencePieceTokenTypes.UNKNOWN
+                                token_type = gguf.TokenType.UNKNOWN
                             else:
-                                token_type = SentencePieceTokenTypes.CONTROL
+                                token_type = gguf.TokenType.CONTROL
                             token_score = 0.0

                         logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
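The override above consumes Hugging Face's `added_tokens_decoder` entries from `tokenizer_config.json`. A toy excerpt of the shape it expects, with the resulting token types noted (contents invented):

```python
# Toy excerpt of the tokenizer_config.json fields read above; contents are invented.
tokenizer_config_json = {
    "unk_token": "<unk>",
    "added_tokens_decoder": {
        "0":     {"content": "<unk>", "special": True},          # unk_token     -> gguf.TokenType.UNKNOWN, score 0.0
        "32000": {"content": "<|im_start|>", "special": True},   # other special -> gguf.TokenType.CONTROL, score 0.0
        "32001": {"content": "<extra_tok>", "special": False},   # non-special   -> gguf.TokenType.USER_DEFINED, score -10000.0
    },
}
```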
|
Loading…
Add table
Add a link
Reference in a new issue