diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 91266185f..b8ec48835 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -31,7 +31,7 @@ if TYPE_CHECKING: from torch import Tensor if 'NO_LOCAL_GGUF' not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf from convert import LlamaHfVocab @@ -375,10 +375,13 @@ class Model: from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(self.dir_model) vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) - assert tokenizer.vocab_size == vocab_size + assert max(tokenizer.vocab.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() + for i in range(vocab_size): if i not in reverse_vocab: tokens.append(f"[PAD{i}]") @@ -393,7 +396,6 @@ class Model: tokens.append(reverse_vocab[i]) toktypes.append(gguf.TokenType.NORMAL) - tokpre = self.get_vocab_base_pre(tokenizer) return tokens, toktypes, tokpre # NOTE: this function is generated by convert-hf-to-gguf-update.py @@ -2439,7 +2441,7 @@ class ArcticModel(Model): tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] scores: list[float] = [-10000.0] * vocab_size - toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + toktypes: list[int] = [gguf.TokenType.UNKNOWN] * vocab_size for token_id in range(tokenizer.vocab_size()): @@ -2447,15 +2449,15 @@ class ArcticModel(Model): text = piece.encode("utf-8") score = tokenizer.GetScore(token_id) - toktype = SentencePieceTokenTypes.NORMAL + toktype = gguf.TokenType.NORMAL if tokenizer.IsUnknown(token_id): - toktype = SentencePieceTokenTypes.UNKNOWN + toktype = gguf.TokenType.UNKNOWN elif tokenizer.IsControl(token_id): - toktype = SentencePieceTokenTypes.CONTROL + toktype = gguf.TokenType.CONTROL elif 
tokenizer.IsUnused(token_id): - toktype = SentencePieceTokenTypes.UNUSED + toktype = gguf.TokenType.UNUSED elif tokenizer.IsByte(token_id): - toktype = SentencePieceTokenTypes.BYTE + toktype = gguf.TokenType.BYTE tokens[token_id] = text scores[token_id] = score @@ -2477,16 +2479,16 @@ class ArcticModel(Model): continue token_content = token_json["content"] - token_type = SentencePieceTokenTypes.USER_DEFINED + token_type = gguf.TokenType.USER_DEFINED token_score = -10000.0 # Map unk_token to UNKNOWN, other special tokens to CONTROL # Set the score to 0.0 as in the original tokenizer.model if ("special" in token_json) and token_json["special"]: if token_content == tokenizer_config_json["unk_token"]: - token_type = SentencePieceTokenTypes.UNKNOWN + token_type = gguf.TokenType.UNKNOWN else: - token_type = SentencePieceTokenTypes.CONTROL + token_type = gguf.TokenType.CONTROL token_score = 0.0 logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")