From 1b9b79dd14e39f5aa8afbc2ee1a924eb1900c120 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 26 Apr 2024 20:55:14 +0300 Subject: [PATCH] convert : fix pre-tokenizer type writing --- convert-hf-to-gguf.py | 2 ++ llama.cpp | 2 +- models/ggml-vocab-deepseek-coder.gguf | Bin 1157113 -> 1157165 bytes models/ggml-vocab-deepseek-llm.gguf | Bin 3970627 -> 3970677 bytes 4 files changed, 3 insertions(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 28b060ed3..06fa9996d 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -408,6 +408,8 @@ class Model(ABC): if res is None: raise NotImplementedError(f"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + return res + def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") diff --git a/llama.cpp b/llama.cpp index e05d10cdb..d8e691c18 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4290,7 +4290,7 @@ static void llm_load_vocab( } if (tokenizer_pre.empty()) { - LLAMA_LOG_WARN("%s: missing tokenizer pre, using default tokenizer pre: 'default'", __func__); + LLAMA_LOG_WARN("%s: missing tokenizer pre, using default tokenizer pre: 'default'\n", __func__); vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if (tokenizer_pre == "default") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; diff --git a/models/ggml-vocab-deepseek-coder.gguf b/models/ggml-vocab-deepseek-coder.gguf index 8ea17fa4d988de61d2db3941d3a4870fe5f3303a..60de2c4b5a62a5b132b7557d13d343c696e2d850 100644 GIT binary patch delta 137 zcmex)-fis#H$ivzP&Z}<1d!S&xQvNWXu9oUL2;R)R1TmFA5=CaHMO8PH8oo|IX@+} zsChe6`*tQqAO_LQK+FQftU$~L#Oy%K0mPg@%mu{UK+FTgyg2FoR(B5iWPlmo?QxGg-$a5@?41UO^AgrZKXQ>+57?)_EVMs delta 280 zcmYMlHxdC+0Dxip*2-z6C$neQOVx zwtd*^j(_rI?!@J3Sn{aENeKu|z=3vPJeg%5rN5JU)JL|`I{7~)7E35gWa$RLXx v@+hE)5-gNaK@~OB(LfU|w9!EqJ@hfa5F?B+!4xyhvA_~5tg*Q(v3;Gt9ZFaJ