From 4adb8b986217b789423a1c11b2d4ce3cb131fd06 Mon Sep 17 00:00:00 2001
From: wonjun Jang
Date: Sun, 5 Nov 2023 22:41:40 +0900
Subject: [PATCH] Update convert.py

---
 convert.py | 53 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/convert.py b/convert.py
index d4d2a5a9b..7e43bfb01 100755
--- a/convert.py
+++ b/convert.py
@@ -310,6 +310,8 @@ class VocabLoader:
         self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer))
         vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()}
 
+        self.added_tokens_list = []
+        self.vocab_size_base: int = len(self.tokenizer.vocab)
         self.vocab_size: int = len(self.tokenizer.vocab)
         self.fname_tokenizer = fname_tokenizer
 
@@ -317,15 +319,21 @@ class VocabLoader:
         tokenizer = self.tokenizer
         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
-        for i in range(self.vocab_size):
+        for i in range(self.vocab_size_base):
             text = reverse_vocab[i].encode("utf-8")
             yield text, 0.0, gguf.TokenType.NORMAL
-
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
     def has_newline_token(self):
         return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
 
     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.hf_tokens()
+        yield from self.added_tokens()
 
     def get_vocab_type(self) -> str:
         path_candidates = []
@@ -344,7 +352,7 @@ class VocabLoader:
         vocab_file = "tokenizer.json"
         path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file)
         if path_candidate:
-            if not self.tokenizer.can_save_slow_tokenizer():
+            if not self.has_newline_token():
                 return "gpt2"
             else:
                 return "llama"
@@ -355,7 +363,7 @@ class VocabLoader:
                 "if it's in another directory, pass the directory as --vocab-dir")
 
     def __repr__(self) -> str:
-        return f""
+        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
 Vocab: TypeAlias = 'VocabLoader'
@@ -728,17 +736,27 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
             break
         yield result
 
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
         if params.n_vocab == vocab.vocab_size:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
             vocab.vocab_size = vocab.vocab_size
             return
+
+        if pad_vocab and params.n_vocab > vocab.vocab_size:
+            pad_count = params.n_vocab - vocab.vocab_size
+            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
+                vocab.added_tokens_list.append(f'<dummy{i:05}>')
+            vocab.vocab_size = params.n_vocab
+            return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
         msg += f" has {vocab.vocab_size})."
         if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
             msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        if vocab.vocab_size < params.n_vocab:
+            msg += " Possibly try using the --padvocab option."
         raise Exception(msg)
 
 
@@ -812,8 +830,12 @@ class OutputFile:
         self.gguf.close()
 
     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_vocab_only(
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -840,8 +862,14 @@ class OutputFile:
         return dt.quantize(arr)
 
     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_all(
+        fname_out  : Path, ftype: GGMLFileType, params: Params,
+        model      : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY,
+        endianess  : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab  : bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1054,6 +1082,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
     parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
+    parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
     args = parser.parse_args(args_in)
 
     if args.dump_single:
@@ -1101,7 +1130,8 @@ def main(args_in: list[str] | None = None) -> None:
                                           load_merges = True,
                                           n_vocab = vocab.vocab_size)
         outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess = endianess, pad_vocab = args.padvocab)
         print(f"Wrote {outfile}")
         return
 
@@ -1127,7 +1157,8 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")
 
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
     print(f"Wrote {outfile}")
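
For reference, a minimal standalone sketch of the padding behaviour that check_vocab_size() performs when --padvocab is passed. The helper name pad_vocab_tokens and the sample vocab sizes are illustrative only (not part of convert.py); the <dummyNNNNN> naming follows the padding loop in the patch above.

# Illustrative sketch, not convert.py code: pad a tokenizer vocab out to the
# model's declared n_vocab with placeholder tokens.
def pad_vocab_tokens(n_vocab: int, tokens: list[str]) -> list[str]:
    pad_count = n_vocab - len(tokens)
    if pad_count <= 0:
        # Tokenizer already provides at least n_vocab entries; nothing to pad.
        return tokens
    # Same naming scheme as the padding loop in check_vocab_size above.
    return tokens + [f'<dummy{i:05}>' for i in range(1, pad_count + 1)]

# Example: model header declares 32002 tokens, tokenizer metadata only has 32000.
padded = pad_vocab_tokens(32002, [f'tok{i}' for i in range(32000)])
assert len(padded) == 32002 and padded[-1] == '<dummy00002>'

With the patch applied, the flag would be supplied on the command line roughly as (paths illustrative): python convert.py models/my-model --padvocab --outfile models/my-model/model.gguf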