From ce005285aadd87403f4f6c69f1516ef2801d48ae Mon Sep 17 00:00:00 2001 From: KerfuffleV2 Date: Tue, 29 Aug 2023 12:01:59 -0600 Subject: [PATCH] convert.py: Set gpt2 as tokenizer model when using BPE --- convert.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 27d187dac..448b6f0f3 100755 --- a/convert.py +++ b/convert.py @@ -846,7 +846,12 @@ class OutputFile: scores.append(score) toktypes.append(toktype) - self.gguf.add_tokenizer_model("llama") + if isinstance(vocab, SentencePieceVocab): + self.gguf.add_tokenizer_model("llama") + elif isinstance(vocab, BpeVocab): + self.gguf.add_tokenizer_model("gpt2") + else: + raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab') self.gguf.add_token_list(tokens) self.gguf.add_token_scores(scores) self.gguf.add_token_types(toktypes)