convert-gptneox-hf-to-gguf and convert: Only handle merges for BPE tokenizer

This commit is contained in:
KerfuffleV2 2023-08-29 04:23:09 -06:00
parent 61911ca4db
commit 0c620ef63b
2 changed files with 3 additions and 3 deletions

View file

@@ -150,7 +150,7 @@ if Path(dir_model + "/tokenizer.json").is_file():
     gguf_writer.add_token_list(tokens)
-special_vocab = gguf.SpecialVocab(Path(dir_model))
+special_vocab = gguf.SpecialVocab(Path(dir_model), load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
# TENSORS

View file

@@ -1159,7 +1159,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         assert args.outfile, "need --outfile if using --vocab-only"
         # FIXME: Try to respect vocab_dir somehow?
         vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
-        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent)
+        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
         outfile = args.outfile
         OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
         print(f"Wrote {outfile}")
@@ -1171,7 +1171,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir, args.vocabtype)
     # FIXME: Try to respect vocab_dir somehow?
-    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent)
+    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
     model = model_plus.model
     model = convert_model_names(model, params)