From d53240ccc24d32b4ed25d541a23eadf2109b8aae Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Fri, 3 May 2024 17:27:12 +0300
Subject: [PATCH] refact : add tokenizer model

---
 convert-hf-to-gguf-update.py  | 1 +
 convert-hf-to-gguf.py         | 3 +++
 llama.cpp                     | 4 ++++
 llama.h                       | 1 +
 models/ggml-vocab-refact.gguf | Bin 1720666 -> 1720710 bytes
 5 files changed, 9 insertions(+)

diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 04ad8fb6a..9079484a3 100644
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -56,6 +56,7 @@ models = [
     { "name": "mpt",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
     { "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
     { "name": "gpt-2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+    { "name": "refact",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
 ]
 
 # make directory "models/tokenizers" if it doesn't exist
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 2f146d730..2b07bfa45 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -306,6 +306,9 @@ class Model(ABC):
         if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
             # ref: https://huggingface.co/openai-community/gpt2
             res = "gpt-2"
+        if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
+            # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
+            res = "refact"
 
         if res is None:
             print("\n")
diff --git a/llama.cpp b/llama.cpp
index a6cb3a8a9..b3a77ea79 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4383,6 +4383,9 @@ static void llm_load_vocab(
         } else if (
                 tokenizer_pre == "gpt-2") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+        } else if (
+                tokenizer_pre == "refact") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -12234,6 +12237,7 @@ struct llm_tokenizer_bpe {
                 });
                 break;
             case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+            case LLAMA_VOCAB_PRE_TYPE_REFACT:
                 word_collection = unicode_regex_split(text, {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
diff --git a/llama.h b/llama.h
index 059d78f11..f199b5d7f 100644
--- a/llama.h
+++ b/llama.h
@@ -79,6 +79,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MPT       = 5,
         LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
         LLAMA_VOCAB_PRE_TYPE_GPT2      = 7,
+        LLAMA_VOCAB_PRE_TYPE_REFACT    = 8,
     };
 
     // note: these values should be synchronized with ggml_rope
diff --git a/models/ggml-vocab-refact.gguf b/models/ggml-vocab-refact.gguf
index 8f26cfb76c9f287f5ff06bf0445ba9ac363f11e8..52afcf01aeb7323e0153b2817a1cbfc5a07d787b 100644
GIT binary patch
delta 166
zcmYMpu?>Pi7(iivQ9x1Q4nbrYVv5b0)A-Y;zQv~G
mp`^ff6BcYZaN)s6fDjQBVkAhRks(Kc5*2DRXfGwZrGEj<_!uq#