From 4adb8b986217b789423a1c11b2d4ce3cb131fd06 Mon Sep 17 00:00:00 2001
From: wonjun Jang
Date: Sun, 5 Nov 2023 22:41:40 +0900
Subject: [PATCH] Update convert.py

---
 convert.py | 53 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/convert.py b/convert.py
index d4d2a5a9b..7e43bfb01 100755
--- a/convert.py
+++ b/convert.py
@@ -310,6 +310,8 @@ class VocabLoader:
         self.tokenizer = AutoTokenizer.from_pretrained(str(fname_tokenizer))
         vocab_set = {encoded_tok for encoded_tok, id in self.tokenizer.vocab.items()}
 
+        self.added_tokens_list = []
+        self.vocab_size_base: int = len(self.tokenizer.vocab)
         self.vocab_size: int = len(self.tokenizer.vocab)
         self.fname_tokenizer = fname_tokenizer
 
@@ -317,15 +319,21 @@ class VocabLoader:
         tokenizer = self.tokenizer
         reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
-        for i in range(self.vocab_size):
+        for i in range(self.vocab_size_base):
             text = reverse_vocab[i].encode("utf-8")
             yield text, 0.0, gguf.TokenType.NORMAL
-
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
     def has_newline_token(self):
         return '<0x0A>' in self.tokenizer.vocab or '\n' in self.tokenizer.vocab
 
     def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         yield from self.hf_tokens()
+        yield from self.added_tokens()
 
     def get_vocab_type(self) -> str:
         path_candidates = []
@@ -344,7 +352,7 @@ class VocabLoader:
         vocab_file = "tokenizer.json"
         path_candidate = vocab_check_and_append_path(self.fname_tokenizer, vocab_file)
         if path_candidate:
-            if not self.tokenizer.can_save_slow_tokenizer():
+            if not self.has_newline_token():
                 return "gpt2"
             else:
                 return "llama"
@@ -355,7 +363,7 @@ class VocabLoader:
                 "if it's in another directory, pass the directory as --vocab-dir")
 
     def __repr__(self) -> str:
-        return f""
+        return f"<VocabLoader with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
 
 
 Vocab: TypeAlias = 'VocabLoader'
@@ -728,17 +736,27 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
             break
         yield result
 
-def check_vocab_size(params: Params, vocab: Vocab) -> None:
+def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None:
     if params.n_vocab != vocab.vocab_size:
         if params.n_vocab == vocab.vocab_size:
             print("Ignoring added_tokens.json since model matches vocab size without it.")
             vocab.added_tokens_list = []
             vocab.vocab_size = vocab.vocab_size
             return
+
+        if pad_vocab and params.n_vocab > vocab.vocab_size:
+            pad_count = params.n_vocab - vocab.vocab_size
+            print(f'Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>')
+            for i in range(1, (params.n_vocab - vocab.vocab_size) + 1):
+                vocab.added_tokens_list.append(f'<dummy{i:05}>')
+            vocab.vocab_size = params.n_vocab
+            return
         msg = f"Vocab size mismatch (model has {params.n_vocab}, but {vocab.fname_tokenizer}"
         msg += f" has {vocab.vocab_size})."
         if vocab.vocab_size < params.n_vocab < vocab.vocab_size + 20:
             msg += f" Most likely you are missing added_tokens.json (should be in {vocab.fname_tokenizer.parent})."
+        if vocab.vocab_size < params.n_vocab:
+            msg += " Possibly try using the --padvocab option."
         raise Exception(msg)
 
 
@@ -812,8 +830,12 @@ class OutputFile:
         self.gguf.close()
 
     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_vocab_only(
+        fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
+        endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab: bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -840,8 +862,14 @@ class OutputFile:
         return dt.quantize(arr)
 
     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
-        check_vocab_size(params, vocab)
+    def write_all(
+        fname_out  : Path, ftype: GGMLFileType, params: Params,
+        model      : LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab,
+        concurrency: int = DEFAULT_CONCURRENCY,
+        endianess  : gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
+        pad_vocab  : bool = False,
+    ) -> None:
+        check_vocab_size(params, vocab, pad_vocab = pad_vocab)
 
         of = OutputFile(fname_out, endianess=endianess)
 
@@ -1054,6 +1082,7 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
     parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
+    parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
     args = parser.parse_args(args_in)
 
     if args.dump_single:
@@ -1101,7 +1130,8 @@ def main(args_in: list[str] | None = None) -> None:
                                           load_merges = True,
                                           n_vocab = vocab.vocab_size)
         outfile = args.outfile
-        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
+        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
+                                    endianess = endianess, pad_vocab = args.padvocab)
         print(f"Wrote {outfile}")
         return
 
@@ -1127,7 +1157,8 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
    print(f"Writing {outfile}, format {ftype}")
 
-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
+                         concurrency = args.concurrency, endianess = endianess, pad_vocab = args.padvocab)
     print(f"Wrote {outfile}")
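
For reference, a minimal standalone sketch of the padding behaviour that check_vocab_size() performs when --padvocab is passed. The helper name pad_vocab_tokens and the sample vocab sizes are illustrative only (not part of convert.py); the <dummyNNNNN> naming follows the padding loop in the patch above.

# Illustrative sketch, not convert.py code: pad a tokenizer vocab out to the
# model's declared n_vocab with placeholder tokens.
def pad_vocab_tokens(n_vocab: int, tokens: list[str]) -> list[str]:
    pad_count = n_vocab - len(tokens)
    if pad_count <= 0:
        # Tokenizer already provides at least n_vocab entries; nothing to pad.
        return tokens
    # Same naming scheme as the padding loop in check_vocab_size above.
    return tokens + [f'<dummy{i:05}>' for i in range(1, pad_count + 1)]

# Example: model header declares 32002 tokens, tokenizer metadata only has 32000.
padded = pad_vocab_tokens(32002, [f'tok{i}' for i in range(32000)])
assert len(padded) == 32002 and padded[-1] == '<dummy00002>'

With the patch applied, the flag would be supplied on the command line roughly as (paths illustrative): python convert.py models/my-model --padvocab --outfile models/my-model/model.gguf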