convert : fix pre-tokenizer type writing

This commit is contained in:
Georgi Gerganov 2024-04-26 20:55:14 +03:00
parent 43e12ce8e5
commit 1b9b79dd14
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
4 changed files with 3 additions and 1 deletion

View file

@@ -408,6 +408,8 @@ class Model(ABC):
        if res is None:
            raise NotImplementedError(f"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

        return res

    def _set_vocab_gpt2(self) -> None:
        tokens, toktypes, tokpre = self.get_vocab_base()
        self.gguf_writer.add_tokenizer_model("gpt2")

View file

@@ -4290,7 +4290,7 @@ static void llm_load_vocab(
        }

        if (tokenizer_pre.empty()) {
-           LLAMA_LOG_WARN("%s: missing tokenizer pre, using default tokenizer pre: 'default'", __func__);
+           LLAMA_LOG_WARN("%s: missing tokenizer pre, using default tokenizer pre: 'default'\n", __func__);
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        } else if (tokenizer_pre == "default") {
            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;

Binary file not shown.