From d53240ccc24d32b4ed25d541a23eadf2109b8aae Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 3 May 2024 17:27:12 +0300
Subject: [PATCH] refact : add tokenizer model

---
 convert-hf-to-gguf-update.py  | 1 +
 convert-hf-to-gguf.py         | 3 +++
 llama.cpp                     | 4 ++++
 llama.h                       | 1 +
 models/ggml-vocab-refact.gguf | Bin 1720666 -> 1720710 bytes
 5 files changed, 9 insertions(+)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 04ad8fb6a..9079484a3 100644
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -56,6 +56,7 @@ models = [
     { "name": "mpt",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     { "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     { "name": "gpt-2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    { "name": "refact",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
 ]
 
 # make directory "models/tokenizers" if it doesn't exist
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 2f146d730..2b07bfa45 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -306,6 +306,9 @@ class Model(ABC):
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
+            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
+            res = "refact"
 
         if res is None:
             print("\n")
diff --git a/llama.cpp b/llama.cpp
index a6cb3a8a9..b3a77ea79 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4383,6 +4383,9 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "gpt-2") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+        } else if (
+                tokenizer_pre == "refact") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -12234,6 +12237,7 @@ struct llm_tokenizer_bpe {
                 });
                 break;
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+            case LLAMA_VOCAB_PRE_TYPE_REFACT:
                 word_collection = unicode_regex_split(text, {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
diff --git a/llama.h b/llama.h
index 059d78f11..f199b5d7f 100644
--- a/llama.h
+++ b/llama.h
@@ -79,6 +79,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT       = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2      = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT    = 8,
     };
 
     // note: these values should be synchronized with ggml_rope
diff --git a/models/ggml-vocab-refact.gguf b/models/ggml-vocab-refact.gguf
index 8f26cfb76c9f287f5ff06bf0445ba9ac363f11e8..52afcf01aeb7323e0153b2817a1cbfc5a07d787b 100644
GIT binary patch
delta 166
zcmYMpu?>Pi7(iivQ9x1Q4nbrYVv5b0)A-Y;zQv~G
mp`^ff6BcYZaN)s6fDjQBVkAhRks(Kc5*2DRXfGwZrGEj<_!uq#