convert-gptneox-hf-to-gguf and convert: Only handle merges for BPE tokenizer

This commit is contained in:
KerfuffleV2 2023-08-29 04:23:09 -06:00
parent 61911ca4db
commit 0c620ef63b
2 changed files with 3 additions and 3 deletions

View file

@@ -150,7 +150,7 @@ if Path(dir_model + "/tokenizer.json").is_file():
     gguf_writer.add_token_list(tokens)
-special_vocab = gguf.SpecialVocab(Path(dir_model))
+special_vocab = gguf.SpecialVocab(Path(dir_model), load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)
 # TENSORS
# TENSORS

View file

@@ -1159,7 +1159,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         assert args.outfile, "need --outfile if using --vocab-only"
         # FIXME: Try to respect vocab_dir somehow?
         vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
-        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent)
+        special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
         outfile = args.outfile
         OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
         print(f"Wrote {outfile}")
@@ -1171,7 +1171,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
         vocab = load_vocab(vocab_dir, args.vocabtype)
     # FIXME: Try to respect vocab_dir somehow?
-    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent)
+    special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
     model = model_plus.model
     model = convert_model_names(model, params)