llama : validate special token ids are in range when loading GGUF model (#3635)

* Add validation for special token ids to llama.cpp Small optimization for llama_byte_to_token SPM mode * Fix BPE newline check, only I could break something so simple * Killll meeeeee * Account for GGUF_KEY_KEY only setting when the key exists * Minor code cleanups. * Fix convert.py error msg when added tokens are out of range * Make gguf SpecialVocab vocab size-aware Update conversion scripts accordingly * Avoid a string copy Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-10-22 12:14:56 -06:00 · 2023-10-22 12:14:56 -06:00 · a5e7dbd614
commit a5e7dbd614
parent d3956aea53
11 changed files with 72 additions and 32 deletions
--- a/convert.py
+++ b/convert.py
@ -369,7 +369,7 @@ class SentencePieceVocab:
        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
        actual_ids   = sorted(added_tokens.values())
        if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+            raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")

        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
        self.added_tokens_list = [text for (text, idx) in items]
@ -1163,10 +1163,13 @@ def main(args_in: list[str] | None = None) -> None:

    vocab: Vocab
    if args.vocab_only:
-        assert args.outfile, "need --outfile if using --vocab-only"
+        if not args.outfile:
+            raise ValueError("need --outfile if using --vocab-only")
        # FIXME: Try to respect vocab_dir somehow?
        vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
-        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
+        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
+            load_merges = args.vocabtype == 'bpe',
+            n_vocab = vocab.vocab_size)
        outfile = args.outfile
        OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
        print(f"Wrote {outfile}")
@ -1178,7 +1181,9 @@ def main(args_in: list[str] | None = None) -> None:
        vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
        vocab = load_vocab(vocab_dir, args.vocabtype)
    # FIXME: Try to respect vocab_dir somehow?
-    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
+    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
+        load_merges = args.vocabtype == 'bpe',
+        n_vocab = vocab.vocab_size)

    model   = model_plus.model
    model   = convert_model_names(model, params)