Merge branch 'ggerganov:gg/bpe-preprocess' into gg/bpe-preprocess

2024-04-29 10:55:15 +02:00 · 2024-04-29 10:55:15 +02:00 · 866e3941f7
commit 866e3941f7
parent 0cf9ed3457 c21ab1833e
38 changed files with 1687 additions and 1313 deletions
--- a/.github/workflows/python-lint.yml
+++ b/.github/workflows/python-lint.yml
@ -20,5 +20,5 @@ jobs:
      - name: flake8 Lint
        uses: py-actions/flake8@v2
        with:
-            ignore: "E203,E211,E221,E222,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
-            exclude: "examples/*,examples/*/**,*/**/__init__.py"
+            ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
+            exclude: "examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py"
--- a/.gitignore
+++ b/.gitignore
@ -118,10 +118,8 @@ nppBackup
 /tests/test-quantize-fns
 /tests/test-quantize-perf
 /tests/test-sampling
-/tests/test-tokenizer-0-llama
-/tests/test-tokenizer-0-falcon
-/tests/test-tokenizer-0-deepseek-coder
-/tests/test-tokenizer-1-llama
+/tests/test-tokenizer-0
+/tests/test-tokenizer-1-spm
 /tests/test-tokenizer-1-bpe
 /tests/test-rope
 /tests/test-backend-ops
--- a/Makefile
+++ b/Makefile
@ -20,13 +20,9 @@ TEST_TARGETS = \
 	tests/test-quantize-perf \
 	tests/test-rope \
 	tests/test-sampling \
-	tests/test-tokenizer-0-deepseek-coder \
-	tests/test-tokenizer-0-deepseek-llm \
-	tests/test-tokenizer-0-falcon \
-	tests/test-tokenizer-0-llama \
-	tests/test-tokenizer-0-llama-v3 \
+	tests/test-tokenizer-0 \
 	tests/test-tokenizer-1-bpe \
-	tests/test-tokenizer-1-llama
+	tests/test-tokenizer-1-spm

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@ -65,17 +61,14 @@ default: $(BUILD_TARGETS)
 test: $(TEST_TARGETS)
 	@failures=0; \
 	for test_target in $(TEST_TARGETS); do \
-		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-0-llama-v3" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama-v3.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-coder" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-llm" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 			continue; \
@ -993,29 +986,15 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-llama-v3: tests/test-tokenizer-0-llama-v3.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-deepseek-coder: tests/test-tokenizer-0-deepseek-coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-deepseek-llm: tests/test-tokenizer-0-deepseek-llm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-
 tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@ -0,0 +1,271 @@
+# This script downloads the tokenizer models of the specified models from Huggingface and
+# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
+#
+# This is necessary in order to analyze the type of pre-tokenizer used by the model and
+# provide the necessary information to llama.cpp via the GGUF header in order to implement
+# the same pre-tokenizer.
+#
+# ref: https://github.com/ggerganov/llama.cpp/pull/6920
+#
+# Instructions:
+#
+# - Add a new model to the "models" list
+# - Run the script with your huggingface token:
+#
+#   python3 convert-hf-to-gguf-update.py <huggingface_token>
+#
+# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
+# - Update llama.cpp with the new pre-tokenizer if necessary
+#
+# TODO: generate tokenizer tests for llama.cpp
+# TODO: automate the update of convert-hf-to-gguf.py
+#
+
+import os
+import requests
+import sys
+import json
+
+from hashlib import sha256
+from enum import IntEnum, auto
+
+class TOKENIZER_TYPE(IntEnum):
+    # tokenizer family of an entry in the "models" list below
+    # SPM entries get an extra tokenizer.model file in the download step and are
+    # left out of the generated get_vocab_base_pre() hash chain
+    SPM = auto()
+    BPE = auto()
+    WPM = auto()
+
+# TODO: this string has to exercise as much pre-tokenizer functionality as possible
+#       will be updated with time - contributions welcome
+chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+
+# the Huggingface token is the one and only command-line argument
+if len(sys.argv) != 2:
+    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    sys.exit(1)
+
+token = sys.argv[1]
+
+# TODO: add models here, base models preferred
+models = [
+        { "name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf",                },
+        { "name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B",              },
+        { "name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base",        },
+        { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base",    },
+        { "name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b",                        },
+        { "name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5",                  },
+        ]
+
+# make sure the "models/tokenizers" directory exists
+# exist_ok avoids the check-then-create race of a separate os.path.exists() test
+os.makedirs("models/tokenizers", exist_ok=True)
+
+def download_file_with_auth(url, token, save_path, timeout=60):
+    """Download `url` using a bearer token and write the response body to `save_path`.
+
+    url       -- address of the file to fetch
+    token     -- token sent in the "Authorization: Bearer ..." request header
+    save_path -- local path the response body is written to (binary mode)
+    timeout   -- seconds passed to requests.get() as its timeout
+
+    A non-200 response is reported on stdout and nothing is written; the function
+    does not raise for it, so callers continue with the next file.
+    """
+    headers = {"Authorization": f"Bearer {token}"}
+    # requests applies no timeout by default - without one a stalled connection blocks forever
+    response = requests.get(url, headers=headers, timeout=timeout)
+    if response.status_code == 200:
+        with open(save_path, 'wb') as f:
+            f.write(response.content)
+        print(f"File {save_path} downloaded successfully")
+    else:
+        # name the URL so the failure can be attributed to a specific file
+        print(f"Failed to download file {url}. Status code: {response.status_code}")
+
+# download the tokenizer models
+for model in models:
+    name = model["name"]
+    repo = model["repo"]
+    tokt = model["tokt"]
+
+    # a directory left by an earlier run means this model is not downloaded again
+    if os.path.exists(f"models/tokenizers/{name}"):
+        print(f"Directory models/tokenizers/{name} already exists - skipping")
+        continue
+
+    os.makedirs(f"models/tokenizers/{name}")
+
+    print(f"Downloading {name} to models/tokenizers/{name}")
+
+    # (endpoint, file name) pairs in download order
+    # tokenizer.model is requested for SPM models only, via "resolve" instead of "raw"
+    wanted = [("raw", "config.json"), ("raw", "tokenizer.json")]
+    if tokt == TOKENIZER_TYPE.SPM:
+        wanted.append(("resolve", "tokenizer.model"))
+    wanted.append(("raw", "tokenizer_config.json"))
+
+    for endpoint, filename in wanted:
+        download_file_with_auth(f"{repo}/{endpoint}/main/{filename}", token, f"models/tokenizers/{name}/{filename}")
+
+# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
+# TODO: auto-update convert-hf-to-gguf.py with the generated function
+
+# src_ifs collects one generated "if chkhsh == ..." branch per non-SPM model
+src_ifs = ""
+for model in models:
+    name = model["name"]
+    tokt = model["tokt"]
+
+    # SPM models get no entry in the generated chain
+    if tokt == TOKENIZER_TYPE.SPM:
+        continue
+
+    # create the tokenizer
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+
+    # the hash of the token ids produced for the check string identifies the pre-tokenizer
+    chktok = tokenizer.encode(chktxt)
+    chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+    print(f"model: {name}")
+    print(f"tokt: {tokt}")
+    print(f"repo: {model['repo']}")
+    print(f"chktok: {chktok}")
+    print(f"chkhsh: {chkhsh}")
+
+    # print the "pre_tokenizer" content from the tokenizer.json
+    # JSON is UTF-8 - read it as such instead of relying on the platform's locale encoding
+    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
+        cfg = json.load(f)
+        pre_tokenizer = cfg["pre_tokenizer"]
+        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+
+    print("\n")
+
+    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
+    src_ifs += f"            # ref: {model['repo']}\n"
+    src_ifs += f"            res = \"{name}\"\n"
+
+# assemble the source text of get_vocab_base_pre() line by line
+# only two lines interpolate values computed by this script (chktxt and src_ifs);
+# every other line is emitted verbatim into the generated function
+src_func = ""
+src_func +=  "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
+src_func +=  "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
+src_func +=  "        # is specific for the BPE pre-tokenizer used by the model\n"
+src_func +=  "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
+src_func +=  "        # use in llama.cpp to implement the same pre-tokenizer\n"
+src_func +=  "\n"
+# repr() emits a Python string literal, so the generated function carries the same check string
+src_func += f"        chktxt = {repr(chktxt)}\n"
+src_func +=  "\n"
+src_func +=  "        chktok = tokenizer.encode(chktxt)\n"
+src_func +=  "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
+src_func +=  "\n"
+src_func +=  "        print(f\"chktok: {chktok}\")\n"
+src_func +=  "        print(f\"chkhsh: {chkhsh}\")\n"
+src_func +=  "\n"
+src_func +=  "        res = None\n"
+src_func +=  "\n"
+src_func +=  "        # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n"
+src_func +=  "        #       don't do this manually - use the convert-hf-to-gguf-update.py script!\n"
+# the per-model "if chkhsh == ..." branches collected in src_ifs go here
+src_func += f"{src_ifs}\n"
+src_func +=  "        if res is None:\n"
+src_func +=  "            print(\"\\n\")\n"
+src_func +=  "            print(\"**************************************************************************************\")\n"
+src_func +=  "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
+src_func +=  "            print(\"**          This means that it was not added yet or you are using an older version.\")\n"
+src_func +=  "            print(\"**          Check convert-hf-to-gguf-update.py and update it accordingly.\")\n"
+src_func +=  "            print(\"**\")\n"
+src_func +=  "            print(f\"** chkhsh:  {chkhsh}\")\n"
+src_func +=  "            print(\"**************************************************************************************\")\n"
+src_func +=  "            print(\"\\n\")\n"
+src_func +=  "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
+src_func +=  "\n"
+src_func +=  "        print(f\"tokenizer.ggml.pre: {res}\")\n"
+src_func +=  "        print(f\"chkhsh: {chkhsh}\")\n"
+src_func +=  "\n"
+src_func +=  "        return res\n"
+
+# the generated function is only printed; nothing is written to convert-hf-to-gguf.py
+print(src_func)
+
+print("\n")
+print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+print("\n")
+
+# generate tests for each tokenizer model
+
+tests = [
+    "",
+    " ",
+    "  ",
+    "   ",
+    "\t",
+    "\n",
+    "\n\n",
+    "\n\n\n",
+    "\t\n",
+    "Hello world",
+    " Hello world",
+    "Hello World",
+    " Hello World",
+    " Hello World!",
+    "Hello, world!",
+    " Hello, world!",
+    " this is 🦙.cpp",
+    "w048 7tuijk dsdfhu",
+    "нещо на Български",
+    "កាន់តែពិសេសអាចខលចេញ",
+    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+    "Hello",
+    " Hello",
+    "  Hello",
+    "   Hello",
+    "    Hello",
+    "    Hello\n    Hello",
+    " (",
+    "\n =",
+    "' era",
+    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
+    "3",
+    "33",
+    "333",
+    "3333",
+    "33333",
+    "333333",
+    "3333333",
+    "33333333",
+    "333333333",
+    chktxt,
+]
+
+# write the tests to ./models/ggml-vocab-{name}.gguf.inp
+# the format is:
+#
+# test0
+# __ggml_vocab_test__
+# test1
+# __ggml_vocab_test__
+# ...
+#
+
+# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
+# for each test, write the resulting tokens on a separate line
+
+for model in models:
+    name = model["name"]
+    tokt = model["tokt"]
+
+    # create the tokenizer
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+
+    # the test strings contain emoji, Cyrillic, Khmer and CJK text - write them as UTF-8
+    # explicitly, the locale default encoding cannot represent them on every platform
+    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
+        for text in tests:
+            f.write(f"{text}")
+            f.write("\n__ggml_vocab_test__\n")
+
+    # one line per test: the token ids, each preceded by a space
+    with open(f"models/ggml-vocab-{name}.gguf.out", "w", encoding="utf-8") as f:
+        for text in tests:
+            res = tokenizer.encode(text, add_special_tokens=False)
+            for r in res:
+                f.write(f" {r}")
+            f.write("\n")
+
+    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+
+# generate commands for creating vocab files
+
+print("\nRun the following commands to generate the vocab files for testing:\n")
+
+# one vocab-only conversion command per entry of the "models" list
+for name in (model["name"] for model in models):
+    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+
+print("\n")
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@ -11,6 +11,7 @@ import sys
 from abc import ABC, abstractmethod
 from enum import IntEnum
 from pathlib import Path
+from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast

 import numpy as np
@ -215,76 +216,6 @@ class Model(ABC):
        except KeyError:
            raise NotImplementedError(f'Architecture {arch!r} not supported!') from None

-    # @staticmethod
-    # def from_model_architecture(model_architecture):
-    #     if model_architecture == "GPTNeoXForCausalLM":
-    #         return GPTNeoXModel
-    #     if model_architecture == "BloomForCausalLM":
-    #         return BloomModel
-    #     if model_architecture == "MPTForCausalLM":
-    #         return MPTModel
-    #     if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
-    #         return BaichuanModel
-    #     if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
-    #         return FalconModel
-    #     if model_architecture == "GPTBigCodeForCausalLM":
-    #         return StarCoderModel
-    #     if model_architecture == "GPTRefactForCausalLM":
-    #         return RefactModel
-    #     if model_architecture == "PersimmonForCausalLM":
-    #         return PersimmonModel
-    #     if model_architecture == "LlamaForCausalLM":
-    #         return LlamaModel
-    #     if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
-    #         return StableLMModel
-    #     if model_architecture == "QWenLMHeadModel":
-    #         return QwenModel
-    #     if model_architecture == "Qwen2ForCausalLM":
-    #         return Model
-    #     if model_architecture == "MixtralForCausalLM":
-    #         return MixtralModel
-    #     if model_architecture == "GPT2LMHeadModel":
-    #         return GPT2Model
-    #     if model_architecture == "PhiForCausalLM":
-    #         return Phi2Model
-    #     if model_architecture == "PlamoForCausalLM":
-    #         return PlamoModel
-    #     if model_architecture == "CodeShellForCausalLM":
-    #         return CodeShellModel
-    #     if model_architecture == "OrionForCausalLM":
-    #         return OrionModel
-    #     if model_architecture == "InternLM2ForCausalLM":
-    #         return InternLM2Model
-    #     if model_architecture == "MiniCPMForCausalLM":
-    #         return MiniCPMModel
-    #     if model_architecture == "BertModel":
-    #         return BertModel
-
-    @staticmethod
-    def from_model_name(model_name: str):
-        model_name_lower = model_name.lower()
-        if model_name_lower in ("stablelmepoch", "llavastablelmepoch"):
-            return StableLMModel
-        if model_name_lower == "gptneox":
-            return GPTNeoXModel
-        if model_name_lower == "bloom":
-            return BloomModel
-        if model_name_lower == "mpt":
-            return MPTModel
-        if model_name_lower in ("baichuan"):
-            return BaichuanModel
-        if model_name_lower in ("falcon", "rw"):
-            return FalconModel
-        if model_name_lower == "gptbigcode":
-            return StarCoderModel
-        if model_name_lower == "gptrefact":
-            return RefactModel
-        if model_name_lower == "persimmon":
-            return PersimmonModel
-        if model_name_lower in ("llama", "deepseekcoder", "deepseekllm"):
-            return LlamaModel
-        return Model
-
    def _is_model_safetensors(self) -> bool:
        return Model.count_model_parts(self.dir_model, ".safetensors") > 0

@ -298,53 +229,6 @@ class Model(ABC):
            return ("pytorch_model.bin",)
        return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))

-    def _get_model_architecture(self) -> gguf.MODEL_ARCH:
-        arch = self.hparams["architectures"][0]
-        if arch == "GPTNeoXForCausalLM":
-            return gguf.MODEL_ARCH.GPTNEOX
-        if arch == "BloomForCausalLM":
-            return gguf.MODEL_ARCH.BLOOM
-        if arch == "MPTForCausalLM":
-            return gguf.MODEL_ARCH.MPT
-        if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
-            return gguf.MODEL_ARCH.BAICHUAN
-        if arch in ("FalconForCausalLM", "RWForCausalLM"):
-            return gguf.MODEL_ARCH.FALCON
-        if arch == "GPTBigCodeForCausalLM":
-            return gguf.MODEL_ARCH.STARCODER
-        if arch == "GPTRefactForCausalLM":
-            return gguf.MODEL_ARCH.REFACT
-        if arch == "PersimmonForCausalLM":
-            return gguf.MODEL_ARCH.PERSIMMON
-        if arch == "LlamaForCausalLM":
-            return gguf.MODEL_ARCH.LLAMA
-        if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"):
-            return gguf.MODEL_ARCH.STABLELM
-        if arch == "QWenLMHeadModel":
-            return gguf.MODEL_ARCH.QWEN
-        if arch == "Qwen2ForCausalLM":
-            return gguf.MODEL_ARCH.QWEN2
-        if arch == "MixtralForCausalLM":
-            return gguf.MODEL_ARCH.LLAMA
-        if arch == "GPT2LMHeadModel":
-            return gguf.MODEL_ARCH.GPT2
-        if arch == "PhiForCausalLM":
-            return gguf.MODEL_ARCH.PHI2
-        if arch == "PlamoForCausalLM":
-            return gguf.MODEL_ARCH.PLAMO
-        if arch == "CodeShellForCausalLM":
-            return gguf.MODEL_ARCH.CODESHELL
-        if arch == "OrionForCausalLM":
-            return gguf.MODEL_ARCH.ORION
-        if arch == "InternLM2ForCausalLM":
-            return gguf.MODEL_ARCH.INTERNLM2
-        if arch == "MiniCPMForCausalLM":
-            return gguf.MODEL_ARCH.MINICPM
-        if arch == "BertModel":
-            return gguf.MODEL_ARCH.BERT
-
-        raise NotImplementedError(f'Architecture "{arch}" not supported!')
-
    # used for GPT-2 BPE and WordPiece vocabs
    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
        tokens: list[str] = []
@ -376,16 +260,19 @@ class Model(ABC):

        return tokens, toktypes, tokpre

+    # NOTE: this function is generated by convert-hf-to-gguf-update.py
+    #       do not modify it manually!
+    # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
    def get_vocab_base_pre(self, tokenizer) -> str:
        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
        # is specific for the BPE pre-tokenizer used by the model
        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer

-        chktxt = "\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български what's ''''''```````\"\"\"\"......!!!!!!??????"
+        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

        chktok = tokenizer.encode(chktxt)
-        chkhsh = hash(tuple(chktok))
+        chkhsh = sha256(str(chktok).encode()).hexdigest()

        print(f"chktok: {chktok}")
        print(f"chkhsh: {chkhsh}")
@ -393,21 +280,38 @@ class Model(ABC):
        res = None

        # NOTE: if you get an error here, you need to add the model to the if-elif chain below
-        #       observe the stdout for the chkhsh value and add it to the chain
-        if self.model_arch == gguf.MODEL_ARCH.LLAMA:
-            if chkhsh == -3290901550109860290:
-                # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer.json
-                res = "llama3"
-            if chkhsh ==  5332289095291046364:
-                # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat/blob/main/tokenizer.json
-                res = "deepseek-llm"
-            if chkhsh ==  4190561703949727616:
-                # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-instruct/blob/main/tokenizer.json
-                res = "deepseek-coder"
+        #       don't do this manually - use the convert-hf-to-gguf-update.py script!
+        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
+            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+            res = "llama-bpe"
+        if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
+            res = "deepseek-llm"
+        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
+            res = "deepseek-coder"
+        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
+            # ref: https://huggingface.co/tiiuae/falcon-7b
+            res = "falcon"
+        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
+            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
+            res = "bert-bge"

        if res is None:
+            print("\n")
+            print("**************************************************************************************")
+            print("** WARNING: The BPE pre-tokenizer was not recognized!")
+            print("**          This means that it was not added yet or you are using an older version.")
+            print("**          Check convert-hf-to-gguf-update.py and update it accordingly.")
+            print("**")
+            print(f"** chkhsh:  {chkhsh}")
+            print("**************************************************************************************")
+            print("\n")
            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

+        print(f"tokenizer.ggml.pre: {res}")
+        print(f"chkhsh: {chkhsh}")
+
        return res

    def _set_vocab_gpt2(self) -> None:
--- a/llama.cpp
+++ b/llama.cpp
@ -4330,19 +4330,33 @@ static void llm_load_vocab(
            vocab.special_mask_id = -1;
        }

-        if (tokenizer_pre.empty()) {
-            LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (tokenizer_pre == "default") {
-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-        } else if (tokenizer_pre == "llama3") {
-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
-        } else if (tokenizer_pre == "deepseek-llm") {
-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
-        } else if (tokenizer_pre == "deepseek-coder") {
-            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+        // for now, only BPE models have pre-tokenizers
+        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "default") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3"   ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else {
+                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            }
        } else {
-            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
        }
    }

@ -12102,6 +12116,14 @@ struct llm_tokenizer_bpe {
                            "\\p{N}+",
                        });
                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                        word_collection = unicode_regex_split(text, {
+                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]",
+                        });
+                        break;
                    default:
                        // default regex for BPE tokenization pre-processing
                        word_collection = unicode_regex_split(text, {
@ -12562,7 +12584,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
            } break;
        case LLAMA_VOCAB_TYPE_BPE:
            {
-                if (add_special && vocab.special_add_bos == 1) {
+                if (add_special && vocab.special_add_bos != 0) {
                    GGML_ASSERT(vocab.special_bos_id != -1);
                    output.push_back(vocab.special_bos_id);
                }
--- a/llama.h
+++ b/llama.h
@ -75,6 +75,7 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
    };

    // note: these values should be synchronized with ggml_rope
--- a/models/ggml-vocab-bert-bge.gguf
+++ b/models/ggml-vocab-bert-bge.gguf
--- a/models/ggml-vocab-bert-bge.gguf.inp
+++ b/models/ggml-vocab-bert-bge.gguf.inp
@ -0,0 +1,102 @@
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
--- a/models/ggml-vocab-bert-bge.gguf.out
+++ b/models/ggml-vocab-bert-bge.gguf.out
@ -0,0 +1,41 @@
+
+
+
+
+
+
+
+
+
+ 7592 2088
+ 7592 2088
+ 7592 2088
+ 7592 2088
+ 7592 2088 999
+ 7592 1010 2088 999
+ 7592 1010 2088 999
+ 2023 2003 100 1012 18133 2361
+ 1059 2692 18139 1021 8525 28418 2243 16233 20952 6979
+ 1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325
+ 100
+ 100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 1006 2069 7861 29147 2072 2008 2038 2049 2219 19204 1007
+ 7592
+ 7592
+ 7592
+ 7592
+ 7592
+ 7592 7592
+ 1006
+ 1027
+ 1005 3690
+ 7592 1010 1061 1005 2035 999 2129 2024 2017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995
+ 1017
+ 3943
+ 21211
+ 21211 2509
+ 21211 22394
+ 21211 22394 2509
+ 21211 22394 22394
+ 21211 22394 22394 2509
+ 21211 22394 22394 22394
+ 100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 100 1017 3943 21211 21211 2509 21211 22394 21211 22394 2509 21211 22394 22394 21211 22394 22394 2509 1017 1012 1017 1017 1012 1012 1017 1017 1012 1012 1012 1017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995 1011 1011 1011 1011 1011 1011 1027 1027 1027 1027 1027 1027 1027 1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325 1005 1005 1005 1005 1005 1005 1036 1036 1036 1036 1036 1036 1036 1000 1000 1000 1000 1012 1012 1012 1012 1012 1012 999 999 999 999 999 999 1029 1029 1029 1029 1029 1029 1045 1005 2310 2042 1005 2409 2002 1005 1055 2045 1010 1005 2128 2017 2469 1029 1005 1049 2025 2469 1045 1005 2222 2191 2009 1010 1005 1040 2017 2066 2070 5572 1029 2057 1005 2310 1037 1005 2222
--- a/models/ggml-vocab-deepseek-coder.gguf
+++ b/models/ggml-vocab-deepseek-coder.gguf
--- a/models/ggml-vocab-deepseek-coder.gguf.inp
+++ b/models/ggml-vocab-deepseek-coder.gguf.inp
@ -0,0 +1,102 @@
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
--- a/models/ggml-vocab-deepseek-coder.gguf.out
+++ b/models/ggml-vocab-deepseek-coder.gguf.out
@ -0,0 +1,41 @@
+
+ 207
+ 243
+ 315
+ 184
+ 185
+ 185 185
+ 185 185 185
+ 184 185
+ 17535 1835
+ 414 9489 1835
+ 17535 5414
+ 414 9489 5414
+ 414 9489 5414 0
+ 17535 11 1835 0
+ 414 9489 11 1835 0
+ 437 317 12394 99 234 13 14789
+ 86 15 19 23 207 22 83 3963 27659 26078 3934 14072
+ 1593 6478 616 2251 14994
+ 155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 155 239 210 155 239 236 155 239 214 155 240 210 155 239 218
+ 10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 334 5950 992 78 12896 344 638 891 1372 10736 8
+ 17535
+ 414 9489
+ 207 414 9489
+ 243 414 9489
+ 315 414 9489
+ 315 414 9489 185 315 414 9489
+ 334
+ 185 405
+ 6 2895
+ 17535 11 320 6 435 0 1717 417 340 12394 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239
+ 18
+ 18 18
+ 18 18 18
+ 18 18 18 18
+ 18 18 18 18 18
+ 18 18 18 18 18 18
+ 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18 18
+ 185 207 185 185 207 185 185 185 207 12405 459 22758 185 243 185 315 185 251 185 730 185 10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 12394 99 234 10047 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 524 18 207 18 1202 18 207 155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 10047 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239 18155 374 17194 28 2861 6478 616 2251 14994 31269 4191 6 4686 4686 10252 3358 3358 3409 524 15330 3023 15031 5668 303 6 312 798 651 83 839 362 6 82 741 11 651 1369 340 2037 30 651 44 441 2037 303 6 642 1098 359 11 651 35 340 833 738 10860 30 998 6 10709 245 6 75 43
--- a/models/ggml-vocab-deepseek-llm.gguf
+++ b/models/ggml-vocab-deepseek-llm.gguf
--- a/models/ggml-vocab-deepseek-llm.gguf.inp
+++ b/models/ggml-vocab-deepseek-llm.gguf.inp
@ -0,0 +1,102 @@
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
--- a/models/ggml-vocab-deepseek-llm.gguf.out
+++ b/models/ggml-vocab-deepseek-llm.gguf.out
@ -0,0 +1,41 @@
+
+ 207
+ 243
+ 300
+ 184
+ 185
+ 185 185
+ 185 185 185
+ 184 185
+ 17464 1843
+ 37727 1843
+ 17464 5427
+ 37727 5427
+ 37727 5427 0
+ 17464 11 1843 0
+ 37727 11 1843 0
+ 437 317 12356 99 234 13 14743
+ 86 15 19 23 207 22 83 3970 27519 26016 3944 14025
+ 1603 6476 620 91754
+ 71374 209 71374 114 71374 228 155 240 220 71374 224 155 240 211 71374 231 71374 115 71374 240 155 240 210 71374 240 71374 95 71374 114 71374 214 71374 210 71374 236 71374 214 155 240 210 71374 218
+ 10044 95300 334 8754 8 33701 114 350 222 10044 221 104 46713 334 34732 996 24250 262 80923 8 207 37103 214 334 5956 89213 344 643 895 1377 10728 8
+ 17464
+ 37727
+ 207 37727
+ 243 37727
+ 300 37727
+ 300 37727 185 300 37727
+ 334
+ 185 403
+ 6 2906
+ 17464 11 320 6 436 0 1724 418 340 33701 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239
+ 18
+ 18 18
+ 18 18 18
+ 18 18 18 18
+ 18 18 18 18 18
+ 18 18 18 18 18 18
+ 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18 18
+ 185 207 185 185 207 185 185 185 207 11969 486 22504 185 243 185 300 185 251 185 663 185 10044 95300 334 8754 8 33701 114 350 222 10044 221 104 46713 334 34732 996 24250 262 80923 8 207 37103 214 12356 99 234 10044 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 526 18 207 18 1204 18 207 71374 209 71374 114 71374 228 155 240 220 71374 224 155 240 211 71374 231 71374 115 71374 240 155 240 210 71374 240 71374 95 71374 114 71374 214 71899 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239 78827 55170 76659 620 91754 31116 36804 4885 4885 10897 4390 4390 41047 15278 3033 14986 5675 304 6 313 803 655 33326 362 6 82 745 11 655 1374 340 2049 30 655 44 441 2049 304 6 647 1099 359 11 655 35 340 837 742 10842 30 1003 6 10699 245 6 75 43
--- a/models/ggml-vocab-falcon.gguf
+++ b/models/ggml-vocab-falcon.gguf
--- a/models/ggml-vocab-falcon.gguf.inp
+++ b/models/ggml-vocab-falcon.gguf.inp
@ -0,0 +1,102 @@
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
--- a/models/ggml-vocab-falcon.gguf.out
+++ b/models/ggml-vocab-falcon.gguf.out
@ -0,0 +1,41 @@
+
+ 204
+ 258
+ 466
+ 192
+ 193
+ 1001
+ 11331
+ 19125
+ 9856 1079
+ 23090 1079
+ 9856 2889
+ 23090 2889
+ 23090 2889 12
+ 9856 23 1079 12
+ 23090 23 1079 12
+ 414 304 3346 111 231 25 29247
+ 98 55866 204 34 16682 7149 36190 6869 11481
+ 150 133 6207 151 215 150 134 5052 133 6279 5052 223 151 216 49679 123 53110 47043 7795
+ 38154 206 38154 126 38154 225 167 237 217 38154 221 167 237 208 38154 228 38154 127 38154 237 167 237 207 38154 237 38154 107 38154 126 38154 211 38154 207 38154 233 38154 211 167 237 207 38154 215
+ 2571 232 206 204 19 11003 20 8196 126 283 219 48778 116 13392 204 19 51831 732 63209 1741 7955 522 20 22438 211 204 19 7927 53360 325 504 701 946 10930 20
+ 9856
+ 23090
+ 204 23090
+ 258 23090
+ 466 23090
+ 466 23090 742 23090
+ 204 19
+ 1212 40
+ 18 4932
+ 9856 23 291 18 436 12 1265 362 299 8196 207 204 42 50087 123 2727 20300 32022 133 234 17419 30137 28 7858 181 133 236
+ 30
+ 3138
+ 22287
+ 22287 30
+ 22287 3138
+ 22287 22287
+ 22287 22287 30
+ 22287 22287 3138
+ 22287 22287 22287
+ 1212 4824 1001 1212 192 204 663 49453 2069 742 561 1501 193 2571 232 206 204 19 11003 20 8196 126 283 219 48778 116 13392 204 19 51831 732 63209 1741 7955 522 20 22438 211 3346 111 231 2571 111 231 204 30 204 3138 204 22287 204 22287 30 204 22287 3138 204 22287 22287 204 22287 22287 30 204 22287 22287 3138 204 30 25 30 204 30 513 30 204 30 951 30 27171 236 206 38154 126 38154 225 167 237 217 38154 221 167 237 208 38154 228 38154 127 38154 237 167 237 207 38154 237 38154 107 38154 126 38154 211 20589 207 204 42 50087 123 2727 20300 32022 133 234 17419 30137 28 7858 181 133 236 204 37057 2228 10666 5052 133 6207 151 215 150 134 5052 133 6279 5052 223 151 216 49679 123 53110 47043 7795 204 7544 7544 7544 8543 8543 17593 3513 3513 12844 51520 17664 4247 295 18 298 650 204 18 95 693 332 18 94 629 23 204 18 1553 299 1310 42 204 18 56 416 1310 295 18 567 717 334 23 204 18 47 299 606 596 6696 42 703 18 16139 241 18 87 55
--- a/models/ggml-vocab-llama-bpe.gguf
+++ b/models/ggml-vocab-llama-bpe.gguf
--- a/models/ggml-vocab-llama-bpe.gguf.inp
+++ b/models/ggml-vocab-llama-bpe.gguf.inp
@ -0,0 +1,102 @@
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
--- a/models/ggml-vocab-llama-bpe.gguf.out
+++ b/models/ggml-vocab-llama-bpe.gguf.out
@ -0,0 +1,41 @@
+
+ 220
+ 256
+ 262
+ 197
+ 198
+ 271
+ 1432
+ 1602
+ 9906 1917
+ 22691 1917
+ 9906 4435
+ 22691 4435
+ 22691 4435 0
+ 9906 11 1917 0
+ 22691 11 1917 0
+ 420 374 11410 99 247 13 11055
+ 86 23904 220 22 83 2005 42908 11729 3013 17156
+ 79862 102118 13373 64571 34694 3114 112203 80112
+ 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 21549 223 21549 249 21549 227 45358 223 21549 231
+ 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 320 3323 43465 430 706 1202 1866 4037 8
+ 9906
+ 22691
+ 220 22691
+ 256 22691
+ 262 22691
+ 262 22691 198 262 22691
+ 320
+ 198 284
+ 6 11639
+ 9906 11 379 65948 0 2650 527 499 27623 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909
+ 18
+ 1644
+ 8765
+ 8765 18
+ 8765 1644
+ 8765 8765
+ 8765 8765 18
+ 8765 8765 1644
+ 8765 8765 8765
+ 198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43
--- a/models/ggml-vocab-llama-spm.gguf
+++ b/models/ggml-vocab-llama-spm.gguf
--- a/models/ggml-vocab-llama-spm.gguf.inp
+++ b/models/ggml-vocab-llama-spm.gguf.inp
@ -0,0 +1,102 @@
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+	
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+	
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+ 	 		 	
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
--- a/models/ggml-vocab-llama-spm.gguf.out
+++ b/models/ggml-vocab-llama-spm.gguf.out
@ -0,0 +1,41 @@
+
+ 259
+ 1678
+ 268
+ 29871 12
+ 29871 13
+ 29871 13 13
+ 29871 13 13 13
+ 29871 12 13
+ 15043 3186
+ 29871 15043 3186
+ 15043 2787
+ 29871 15043 2787
+ 29871 15043 2787 29991
+ 15043 29892 3186 29991
+ 29871 15043 29892 3186 29991
+ 29871 445 338 29871 243 162 169 156 29889 8223
+ 281 29900 29946 29947 29871 29955 9161 13535 18031 2176 6905
+ 1538 4851 665 1386 29713 1305
+ 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 228 161 132 228 161 158 228 161 136 228 162 132 228 161 140
+ 29871 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 313 6194 953 29877 2397 393 756 967 1914 5993 29897
+ 15043
+ 29871 15043
+ 259 15043
+ 1678 15043
+ 268 15043
+ 268 15043 13 1678 15043
+ 29871 313
+ 29871 13 353
+ 525 3152
+ 15043 29892 343 29915 497 29991 1128 526 366 29871 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739
+ 29871 29941
+ 29871 29941 29941
+ 29871 29941 29941 29941
+ 29871 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941 29941 29941
+ 29871 29941 29941 29941 29941 29941 29941 29941 29941 29941
+ 29871 13 29871 13 13 29871 13 13 13 29871 12 29871 12 12 29871 12 13 259 13 1678 13 268 13 418 13 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 29871 243 162 169 156 243 162 169 156 29871 29941 29871 29941 29941 29871 29941 29941 29941 29871 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29941 29871 29941 29889 29941 29871 29941 636 29941 29871 29941 856 29941 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739 448 23648 2751 25512 1538 4851 665 1386 29713 1305 14550 4907 11120 16159 16159 16159 15945 15945 3045 636 6824 6824 6824 8773 8773 8773 306 29915 345 1063 525 29873 1025 540 29915 29879 727 29892 525 1525 366 1854 29973 525 29924 451 1854 306 29915 645 1207 372 29892 525 29928 366 763 777 23429 29973 1334 29915 29963 29872 263 29915 29880 29931
--- a/models/ggml-vocab-stablelm-3b-4e1t.gguf
+++ b/models/ggml-vocab-stablelm-3b-4e1t.gguf
--- a/requirements.txt
+++ b/requirements.txt
@ -7,6 +7,7 @@
 -r ./requirements/requirements-convert.txt

 -r ./requirements/requirements-convert-hf-to-gguf.txt
+-r ./requirements/requirements-convert-hf-to-gguf-update.txt
 -r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
 -r ./requirements/requirements-convert-lora-to-ggml.txt
 -r ./requirements/requirements-convert-persimmon-to-gguf.txt
--- a/requirements/requirements-convert-hf-to-gguf-update.txt
+++ b/requirements/requirements-convert-hf-to-gguf-update.txt
@ -0,0 +1,3 @@
+-r ./requirements-convert.txt
+torch~=2.1.1
+einops~=0.7.0
--- a/scripts/check-requirements.sh
+++ b/scripts/check-requirements.sh
@ -168,6 +168,11 @@ fi

 check_convert_script convert.py
 for py in convert-*.py; do
+    # skip convert-hf-to-gguf-update.py
+    # TODO: the check is failing for some reason:
+    #       https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920
+    [[ $py == convert-hf-to-gguf-update.py ]] && continue
+
    check_convert_script "$py"
 done

--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -1,10 +1,40 @@
+# Registers a CTest test for an executable target that is already defined.
+# Optional args: NAME (defaults to the target name), LABEL (defaults to main),
+# WORKING_DIRECTORY (defaults to .) and ARGS (passed to the test executable).
+function(llama_test target)
+    include(CMakeParseArguments)
+    cmake_parse_arguments(LLAMA_TEST "" "NAME;LABEL;WORKING_DIRECTORY" "ARGS" ${ARGN})
+
+    # the test is named after the target unless NAME overrides it
+    if (NOT DEFINED LLAMA_TEST_NAME)
+        set(test_name ${target})
+    else()
+        set(test_name ${LLAMA_TEST_NAME})
+    endif()
+    # defaults for the remaining optional keywords
+    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
+        set(LLAMA_TEST_WORKING_DIRECTORY .)
+    endif()
+    if (NOT DEFINED LLAMA_TEST_LABEL)
+        set(LLAMA_TEST_LABEL "main")
+    endif()
+
+    # run the binary built for the target, forwarding ARGS unchanged
+    add_test(
+        NAME ${test_name}
+        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
+        COMMAND $<TARGET_FILE:${target}> ${LLAMA_TEST_ARGS})
+
+    set_property(TEST ${test_name} PROPERTY LABELS ${LLAMA_TEST_LABEL})
+endfunction()
+
 # Builds and runs a test source file.
 # Optional args:
 # - NAME: name of the executable & test target (defaults to the source file name without extension)
 # - LABEL: label for the test (defaults to main)
 # - ARGS: arguments to pass to the test executable
 # - WORKING_DIRECTORY
-function(llama_test source)
+function(llama_target_and_test source)
    include(CMakeParseArguments)
    set(options)
    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
@ -35,45 +65,60 @@ function(llama_test source)
    set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
 endfunction()

-# llama_test(test-double-float.cpp) # SLOW
-llama_test(test-quantize-fns.cpp)
-llama_test(test-quantize-perf.cpp)
-llama_test(test-sampling.cpp)
-llama_test(test-chat-template.cpp)
+# build test-tokenizer-0 target once and add many tests
+add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
+target_link_libraries(test-tokenizer-0 PRIVATE common)
+install(TARGETS test-tokenizer-0 RUNTIME)

-llama_test(test-tokenizer-0-llama.cpp    NAME test-tokenizer-0-llama                          ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3                       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
-llama_test(test-tokenizer-0-falcon.cpp   NAME test-tokenizer-0-falcon                         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge          ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)

-llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder           ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-llama_test(test-tokenizer-0-deepseek-llm.cpp   NAME test-tokenizer-0-deepseek-llm             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+# build test-tokenizer-1-bpe target once and add many tests
+add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
+target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
+install(TARGETS test-tokenizer-1-bpe RUNTIME)

-llama_test(test-tokenizer-1-llama.cpp  NAME test-tokenizer-1-llama                            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test(test-tokenizer-1-llama.cpp  NAME test-tokenizer-1-baichuan                         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2      ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG

-llama_test(test-tokenizer-1-bpe.cpp    NAME test-tokenizer-1-falcon                           ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test(test-tokenizer-1-bpe.cpp    NAME test-tokenizer-1-aquila                           ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-llama_test(test-tokenizer-1-bpe.cpp    NAME test-tokenizer-1-mpt                              ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test(test-tokenizer-1-bpe.cpp    NAME test-tokenizer-1-stablelm-3b-4e1t                 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
-llama_test(test-tokenizer-1-bpe.cpp    NAME test-tokenizer-1-gpt-neox                         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-llama_test(test-tokenizer-1-bpe.cpp    NAME test-tokenizer-1-refact                           ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test(test-tokenizer-1-bpe.cpp    NAME test-tokenizer-1-starcoder                        ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-llama_test(test-tokenizer-1-bpe.cpp    NAME test-tokenizer-1-gpt2                             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
-#llama_test(test-tokenizer-1-bpe.cpp    NAME test-tokenizer-1-bloom                           ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+# build test-tokenizer-1-spm target once and add many tests
+add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
+target_link_libraries(test-tokenizer-1-spm PRIVATE common)
+install(TARGETS test-tokenizer-1-spm RUNTIME)

-llama_test(test-grammar-parser.cpp)
-llama_test(test-llama-grammar.cpp)
-llama_test(test-grammar-integration.cpp)
-llama_test(test-grad0.cpp)
-# llama_test(test-opt.cpp) # SLOW
-llama_test(test-backend-ops.cpp)
+llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-baichuan  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)

-llama_test(test-rope.cpp)
+# llama_target_and_test(test-double-float.cpp) # SLOW
+llama_target_and_test(test-quantize-fns.cpp)
+llama_target_and_test(test-quantize-perf.cpp)
+llama_target_and_test(test-sampling.cpp)
+llama_target_and_test(test-chat-template.cpp)

-llama_test(test-model-load-cancel.cpp  LABEL "model")
-llama_test(test-autorelease.cpp        LABEL "model")
+llama_target_and_test(test-grammar-parser.cpp)
+llama_target_and_test(test-llama-grammar.cpp)
+llama_target_and_test(test-grammar-integration.cpp)
+llama_target_and_test(test-grad0.cpp)
+# llama_target_and_test(test-opt.cpp) # SLOW
+llama_target_and_test(test-backend-ops.cpp)

-llama_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+llama_target_and_test(test-rope.cpp)
+
+llama_target_and_test(test-model-load-cancel.cpp  LABEL "model")
+llama_target_and_test(test-autorelease.cpp        LABEL "model")
+
+llama_target_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
 target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)

 # dummy executable - not installed
--- a/tests/test-tokenizer-0-deepseek-coder.cpp
+++ b/tests/test-tokenizer-0-deepseek-coder.cpp
@ -1,188 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "console.h"
-
-#include <cstdio>
-#include <string>
-#include <map>
-#include <vector>
-#include <fstream>
-
-// generate using test-tokenizer-0-falcon.py
-static const std::map<std::string, std::vector<llama_token>> & k_tests() {
-    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { ""                      , {    }, },
-        { " "                     , {       207, }, },
-        { "  "                    , {       243, }, },
-        { "   "                   , {       315, }, },
-        { "\t"                    , {       184, }, },
-        { "\n"                    , {       185, }, },
-        { "\t\n"                  , {       184,    185, }, },
-        { "Hello world"           , {     17535,   1835, }, },
-        { " Hello world"          , {       414,   9489,   1835, }, },
-        { "Hello World"           , {     17535,   5414, }, },
-        { " Hello World"          , {       414,   9489,   5414, }, },
-        { " Hello World!"         , {       414,   9489,   5414,      0, }, },
-        { "Hello, world!"         , {     17535,     11,   1835,      0, }, },
-        { " Hello, world!"        , {       414,   9489,     11,   1835,      0, }, },
-        { " this is 🦙.cpp"        , {       437,    317,  12394,     99,    234,     13,  14789, }, },
-        { "w048 7tuijk dsdfhu"    , {        86,     15,     19,     23,    207,     22,     83,   3963,  27659,  26078,   3934,  14072, }, },
-        { "нещо на Български"     , {      1593,   6478,    616,   2251,  14994, }, },
-        { "កាន់តែពិសេសអាចខលចេញ"   , {       155,    239,    209,    155,    239,    114,    155,    239,    228,    155,    240,    220,    155,    239,    224,    155,    240,    211,    155,    239,    231,    155,    239,    115,    155,    239,    240,    155,    240,    210,    155,    239,    240,    155,    239,     95,    155,    239,    114,    155,    239,    214,    155,    239,    210,    155,    239,    236,    155,    239,    214,    155,    240,    210,    155,    239,    218, }, },
-        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {     10047,    235,    209,    334,   8760,      8,  12394,    233,    114,    350,    222,  10047,    221,    104,    169,    116,    224,    334,   4684,   3909,    992,  24330,    262,  29651,    612,      8,    207,    156,    237,    214,    334,   5950,    992,     78,  12896,    344,    638,    891,   1372,  10736,      8, }, },
-        { "Hello"                 , {     17535, }, },
-        { " Hello"                , {       414,   9489, }, },
-        { "  Hello"               , {       207,    414,   9489, }, },
-        { "   Hello"              , {       243,    414,   9489, }, },
-        { "    Hello"             , {       315,    414,   9489, }, },
-        { "    Hello\n    Hello"  , {       315,    414,   9489,    185,    315,    414,   9489, }, },
-        { "\n ="                  , {       185,    405, }, },
-        { "' era"                 , {         6,   2895, }, },
-        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～", {     17535,     11,    320,      6,    435,      0,   1717,    417,    340,  12394,    233,    210,   3015,  19100,    608,   9413,   2668,     16,     18,     16,     19,     16,     20,     16,   1393,    169,    121,    239, }, },
-
-    };
-
-    return _k_tests;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    std::string fname_text;
-    if (argc > 2) {
-        fname_text = argv[2];
-    }
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
-        fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
-        llama_free_model(model);
-        llama_free(ctx);
-        return 2;
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    bool success = true;
-
-    for (const auto & test_kv : k_tests()) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
-
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
-        printf("tok: ");
-        for (const auto & tok : res) {
-            printf("%d ", tok);
-        }
-        printf("\n");
-
-        bool correct = res.size() == test_kv.second.size();
-        for (int i = 0; i < (int) res.size() && correct; ++i) {
-            if (test_kv.second[i] != res[i]) {
-                correct = false;
-            }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize_bpe(ctx, res).c_str(),
-                llama_detokenize_bpe(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-
-            success = false;
-        }
-    }
-
-    if (!fname_text.empty()) {
-        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
-
-        std::string text;
-        {
-            std::ifstream ifs(fname_text);
-            if (!ifs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
-                return 1;
-            }
-            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
-        }
-
-        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
-
-        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
-
-        {
-            const std::string fname_out = fname_text + ".tokcpp";
-
-            std::ofstream ofs(fname_out);
-            if (!ofs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
-                return 1;
-            }
-
-            for (const auto & tok : res) {
-                ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
-            }
-        }
-
-        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
-    }
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return success ? 0 : 3;
-}
--- a/tests/test-tokenizer-0-deepseek-llm.cpp
+++ b/tests/test-tokenizer-0-deepseek-llm.cpp
@ -1,186 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "console.h"
-
-#include <cstdio>
-#include <string>
-#include <map>
-#include <vector>
-#include <fstream>
-
-// generate using test-tokenizer-0-falcon.py
-static const std::map<std::string, std::vector<llama_token>> & k_tests() {
-    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { ""                      , {   }, },
-        { " "                     , {      207, }, },
-        { "  "                    , {      243, }, },
-        { "   "                   , {      300, }, },
-        { "\t"                    , {      184, }, },
-        { "\n"                    , {      185, }, },
-        { "\t\n"                  , {      184,    185, }, },
-        { "Hello world"           , {    17464,   1843, }, },
-        { " Hello world"          , {    37727,   1843, }, },
-        { "Hello World"           , {    17464,   5427, }, },
-        { " Hello World"          , {    37727,   5427, }, },
-        { " Hello World!"         , {    37727,   5427,      0, }, },
-        { "Hello, world!"         , {    17464,     11,   1843,      0, }, },
-        { " Hello, world!"        , {    37727,     11,   1843,      0, }, },
-        { " this is 🦙.cpp"        , {      437,    317,  12356,     99,    234,     13,  14743, }, },
-        { "w048 7tuijk dsdfhu"    , {       86,     15,     19,     23,    207,     22,     83,   3970,  27519,  26016,   3944,  14025, }, },
-        { "нещо на Български"     , {     1603,   6476,    620,  91754, }, },
-        { "កាន់តែពិសេសអាចខលចេញ"   , {    71374,    209,  71374,    114,  71374,    228,    155,    240,    220,  71374,    224,    155,    240,    211,  71374,    231,  71374,    115,  71374,    240,    155,    240,    210,  71374,    240,  71374,     95,  71374,    114,  71374,    214,  71374,    210,  71374,    236,  71374,    214,    155,    240,    210,  71374,    218, }, },
-        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {    10044,  95300,    334,   8754,      8,  33701,    114,    350,    222,  10044,    221,    104,  46713,    334,  34732,    996,  24250,    262,  80923,      8,    207,  37103,    214,    334,   5956,  89213,    344,    643,    895,   1377,  10728,      8, }, },
-        { "Hello"                 , {    17464, }, },
-        { " Hello"                , {    37727, }, },
-        { "  Hello"               , {      207,  37727, }, },
-        { "   Hello"              , {      243,  37727, }, },
-        { "    Hello"             , {      300,  37727, }, },
-        { "    Hello\n    Hello"  , {      300,  37727,    185,    300,  37727, }, },
-        { "\n ="                  , {      185,    403, }, },
-        { "' era"                 , {        6,   2906, }, },
-        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～", {    17464,     11,    320,      6,    436,      0,   1724,    418,    340,  33701,    210,   3025,  19017,    612,   9407,   2681,     16,     18,     16,     19,     16,     20,     16,   1398,  68940,    239, }, },
-
-    };
-
-    return _k_tests;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    std::string fname_text;
-    if (argc > 2) {
-        fname_text = argv[2];
-    }
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
-        fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
-        llama_free_model(model);
-        llama_free(ctx);
-        return 2;
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    bool success = true;
-
-    for (const auto & test_kv : k_tests()) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
-
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
-        printf("tok: ");
-        for (const auto & tok : res) {
-            printf("%d ", tok);
-        }
-        printf("\n");
-
-        bool correct = res.size() == test_kv.second.size();
-        for (int i = 0; i < (int) res.size() && correct; ++i) {
-            if (test_kv.second[i] != res[i]) {
-                correct = false;
-            }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize_bpe(ctx, res).c_str(),
-                llama_detokenize_bpe(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-        }
-    }
-
-    if (!fname_text.empty()) {
-        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
-
-        std::string text;
-        {
-            std::ifstream ifs(fname_text);
-            if (!ifs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
-                return 1;
-            }
-            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
-        }
-
-        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
-
-        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
-
-        {
-            const std::string fname_out = fname_text + ".tokcpp";
-
-            std::ofstream ofs(fname_out);
-            if (!ofs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
-                return 1;
-            }
-
-            for (const auto & tok : res) {
-                ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
-            }
-        }
-
-        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
-    }
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return success ? 0 : 3;
-}
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@ -1,199 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "console.h"
-
-#include <cstdio>
-#include <string>
-#include <map>
-#include <vector>
-#include <fstream>
-
-// generate using test-tokenizer-0-falcon.py
-static const std::map<std::string, std::vector<llama_token>> & k_tests() {
-    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { ""                      , {  }, },
-        { " "                     , {     204, }, },
-        { "  "                    , {     258, }, },
-        { "   "                   , {     466, }, },
-        { "\t"                    , {     192, }, },
-        { "\n"                    , {     193, }, },
-        { "\n\n"                  , {    1001, }, },
-        { "\n\n\n"                , {   11331, }, },
-        { "\t\n"                  , {   19125, }, },
-        { "Hello world"           , {    9856,   1079, }, },
-        { " Hello world"          , {   23090,   1079, }, },
-        { "Hello World"           , {    9856,   2889, }, },
-        { " Hello World"          , {   23090,   2889, }, },
-        { " Hello World!"         , {   23090,   2889,     12, }, },
-        { "Hello, world!"         , {    9856,     23,   1079,     12, }, },
-        { " Hello, world!"        , {   23090,     23,   1079,     12, }, },
-        { " this is 🦙.cpp"        , {     414,    304,   3346,    111,    231,     25,  29247, }, },
-        { "w048 7tuijk dsdfhu"    , {      98,  55866,    204,     34,  16682,   7149,  36190,   6869,  11481, }, },
-        { "нещо на Български"     , {     150,    133,   6207,    151,    215,    150,    134,   5052,    133,   6279,   5052,    223,    151,    216,  49679,    123,  53110,  47043,   7795, }, },
-        { "កាន់តែពិសេសអាចខលចេញ"   , {   38154,    206,  38154,    126,  38154,    225,    167,    237,    217,  38154,    221,    167,    237,    208,  38154,    228,  38154,    127,  38154,    237,    167,    237,    207,  38154,    237,  38154,    107,  38154,    126,  38154,    211,  38154,    207,  38154,    233,  38154,    211,    167,    237,    207,  38154,    215, }, },
-        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {    2571,    232,    206,    204,     19,  11003,     20,   8196,    126,    283,    219,  48778,    116,  13392,    204,     19,  51831,    732,  63209,   1741,   7955,    522,     20,  22438,    211,    204,     19,   7927,  53360,    325,    504,    701,    946,  10930,     20, }, },
-        { "Hello"                 , {    9856, }, },
-        { " Hello"                , {   23090, }, },
-        { "  Hello"               , {     204,  23090, }, },
-        { "   Hello"              , {     258,  23090, }, },
-        { "    Hello"             , {     466,  23090, }, },
-        { "    Hello\n    Hello"  , {     466,  23090,    742,  23090, }, },
-        { " ("                    , {     204,     19, }, },
-        { "\n ="                  , {    1212,     40, }, },
-        { "' era"                 , {      18,   4932, }, },
-        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～", {    9856,     23,    291,     18,    436,     12,   1265,    362,    299,   8196,    207,    204,     42,  50087,    123,   2727,  20300,  32022,    133,    234,  17419,  30137,     28,   7858,    181,    133,    236, }, },
-        { "3"                     , {      30, }, },
-        { "33"                    , {    3138, }, },
-        { "333"                   , {   22287, }, },
-        { "3333"                  , {   22287,     30, }, },
-        { "33333"                 , {   22287,   3138, }, },
-        { "333333"                , {   22287,  22287, }, },
-        { "3333333"               , {   22287,  22287,     30, }, },
-        { "33333333"              , {   22287,  22287,   3138, }, },
-        { "333333333"             , {   22287,  22287,  22287, }, },
-    };
-
-    return _k_tests;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    std::string fname_text;
-    if (argc > 2) {
-        fname_text = argv[2];
-    }
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
-        fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
-        llama_free_model(model);
-        llama_free(ctx);
-        return 2;
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    bool success = true;
-
-    for (const auto & test_kv : k_tests()) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
-
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
-        printf("tok: ");
-        for (const auto & tok : res) {
-            printf("%d ", tok);
-        }
-        printf("\n");
-
-        bool correct = res.size() == test_kv.second.size();
-        for (int i = 0; i < (int) res.size() && correct; ++i) {
-            if (test_kv.second[i] != res[i]) {
-                correct = false;
-            }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize_bpe(ctx, res).c_str(),
-                llama_detokenize_bpe(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-
-            success = false;
-        }
-    }
-
-    if (!fname_text.empty()) {
-        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
-
-        std::string text;
-        {
-            std::ifstream ifs(fname_text);
-            if (!ifs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
-                return 1;
-            }
-            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
-        }
-
-        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
-
-        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
-
-        {
-            const std::string fname_out = fname_text + ".tokcpp";
-
-            std::ofstream ofs(fname_out);
-            if (!ofs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
-                return 1;
-            }
-
-            for (const auto & tok : res) {
-                ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
-            }
-        }
-
-        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
-    }
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return success ? 0 : 3;
-}
--- a/tests/test-tokenizer-0-llama-v3.cpp
+++ b/tests/test-tokenizer-0-llama-v3.cpp
@ -1,199 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "console.h"
-
-#include <cstdio>
-#include <string>
-#include <map>
-#include <vector>
-#include <fstream>
-
-// generate using test-tokenizer-0-llama.py
-static const std::map<std::string, std::vector<llama_token>> & k_tests() {
-    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { ""                      , {  }, },
-        { " "                     , {     220, }, },
-        { "  "                    , {     256, }, },
-        { "   "                   , {     262, }, },
-        { "\t"                    , {     197, }, },
-        { "\n"                    , {     198, }, },
-        { "\n\n"                  , {     271, }, },
-        { "\n\n\n"                , {    1432, }, },
-        { "\t\n"                  , {    1602, }, },
-        { "Hello world"           , {    9906,   1917, }, },
-        { " Hello world"          , {   22691,   1917, }, },
-        { "Hello World"           , {    9906,   4435, }, },
-        { " Hello World"          , {   22691,   4435, }, },
-        { " Hello World!"         , {   22691,   4435,      0, }, },
-        { "Hello, world!"         , {    9906,     11,   1917,      0, }, },
-        { " Hello, world!"        , {   22691,     11,   1917,      0, }, },
-        { " this is 🦙.cpp"        , {     420,    374,  11410,     99,    247,     13,  11055, }, },
-        { "w048 7tuijk dsdfhu"    , {      86,  23904,    220,     22,     83,   2005,  42908,  11729,   3013,  17156, }, },
-        { "нещо на Български"     , {   79862, 102118,  13373,  64571,  34694,   3114, 112203,  80112, }, },
-        { "កាន់តែពិសេសអាចខលចេញ"   , {   21549,    222,  98629,    241,  45358,    233,  21549,    237,  45358,    224,  21549,    244,  21549,    115,  21549,    253,  45358,    223,  21549,    253,  21549,     95,  98629,    227,  21549,    223,  21549,    249,  21549,    227,  45358,    223,  21549,    231, }, },
-        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {    9468,    248,    222,    320,   8416,      8,  27623,    114, 102470,   9468,    234,    104,  31643,    320,  36773, 100166,  98634,      8,  26602,    227,    320,   3323,  43465,    430,    706,   1202,   1866,   4037,      8, }, },
-        { "Hello"                 , {    9906, }, },
-        { " Hello"                , {   22691, }, },
-        { "  Hello"               , {     220,  22691, }, },
-        { "   Hello"              , {     256,  22691, }, },
-        { "    Hello"             , {     262,  22691, }, },
-        { "    Hello\n    Hello"  , {     262,  22691,    198,    262,  22691, }, },
-        { " ("                    , {     320, }, },
-        { "\n ="                  , {     198,    284, }, },
-        { "' era"                 , {       6,  11639, }, },
-        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～", {    9906,     11,    379,  65948,      0,   2650,    527,    499,  27623,    223,    949,  37046, 101067,  19000,  23182, 102301,   9263,  18136,     16,  36827,  21909, }, },
-        { "3"                     , {      18, }, },
-        { "33"                    , {    1644, }, },
-        { "333"                   , {    8765, }, },
-        { "3333"                  , {    8765,     18, }, },
-        { "33333"                 , {    8765,   1644, }, },
-        { "333333"                , {    8765,   8765, }, },
-        { "3333333"               , {    8765,   8765,     18, }, },
-        { "33333333"              , {    8765,   8765,   1644, }, },
-        { "333333333"             , {    8765,   8765,   8765, }, },
-    };
-
-    return _k_tests;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    std::string fname_text;
-    if (argc > 2) {
-        fname_text = argv[2];
-    }
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
-        fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
-        llama_free_model(model);
-        llama_free(ctx);
-        return 2;
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    bool success = true;
-
-    for (const auto & test_kv : k_tests()) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
-
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
-        printf("tok: ");
-        for (const auto & tok : res) {
-            printf("%d ", tok);
-        }
-        printf("\n");
-
-        bool correct = res.size() == test_kv.second.size();
-        for (int i = 0; i < (int) res.size() && correct; ++i) {
-            if (test_kv.second[i] != res[i]) {
-                correct = false;
-            }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize_bpe(ctx, res).c_str(),
-                llama_detokenize_bpe(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-
-            success = false;
-        }
-    }
-
-    if (!fname_text.empty()) {
-        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
-
-        std::string text;
-        {
-            std::ifstream ifs(fname_text);
-            if (!ifs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
-                return 1;
-            }
-            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
-        }
-
-        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
-
-        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
-
-        {
-            const std::string fname_out = fname_text + ".tokcpp";
-
-            std::ofstream ofs(fname_out);
-            if (!ofs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
-                return 1;
-            }
-
-            for (const auto & tok : res) {
-                ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
-            }
-        }
-
-        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
-    }
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return success ? 0 : 3;
-}
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@ -1,204 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "console.h"
-
-#include <cstdio>
-#include <string>
-#include <map>
-#include <vector>
-#include <fstream>
-
-// generate using test-tokenizer-0-llama.py
-static const std::map<std::string, std::vector<llama_token>> & k_tests() {
-    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { ""                      , {  }, },
-        { " "                     , {     259, }, },
-        { "  "                    , {    1678, }, },
-        { "   "                   , {     268, }, },
-        { "\t"                    , {   29871,     12, }, },
-        { "\n"                    , {   29871,     13, }, },
-        { "\n\n"                  , {   29871,     13,     13, }, },
-        { "\n\n\n"                , {   29871,     13,     13,     13, }, },
-        { "\t\n"                  , {   29871,     12,     13, }, },
-        { "Hello world"           , {   15043,   3186, }, },
-        { " Hello world"          , {   29871,  15043,   3186, }, },
-        { "Hello World"           , {   15043,   2787, }, },
-        { " Hello World"          , {   29871,  15043,   2787, }, },
-        { " Hello World!"         , {   29871,  15043,   2787,  29991, }, },
-        { "Hello, world!"         , {   15043,  29892,   3186,  29991, }, },
-        { " Hello, world!"        , {   29871,  15043,  29892,   3186,  29991, }, },
-        { " this is 🦙.cpp"        , {   29871,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
-        { "w048 7tuijk dsdfhu"    , {     281,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
-        { "нещо на Български"     , {    1538,   4851,    665,   1386,  29713,   1305, }, },
-        { "កាន់តែពិសេសអាចខលចេញ"   , {   29871,  31849,  31324,  31934,    228,    162,    142,    228,    161,    146,    228,    162,    133,    228,    161,    153,    228,    161,    186,  31708,    228,    162,    132,  31708,    228,    161,    165,  31324,    228,    161,    136,    228,    161,    132,    228,    161,    158,    228,    161,    136,    228,    162,    132,    228,    161,    140, }, },
-        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {   29871,    243,    162,    157,    131,    313,   8945,  29897,  29871,    243,    162,    155,    185,  30722,    243,    162,    143,    174,  30598,    313,  20787,    953,   3848,    275,  16125,    630,  29897,  29871,  31681,    313,   6194,    953,  29877,   2397,    393,    756,    967,   1914,   5993,  29897, }, },
-        { "Hello"                 , {   15043, }, },
-        { " Hello"                , {   29871,  15043, }, },
-        { "  Hello"               , {     259,  15043, }, },
-        { "   Hello"              , {    1678,  15043, }, },
-        { "    Hello"             , {     268,  15043, }, },
-        { "    Hello\n    Hello"  , {     268,  15043,     13,   1678,  15043, }, },
-        { " ("                    , {   29871,    313, }, },
-        { "\n ="                  , {   29871,     13,    353, }, },
-        { "' era"                 , {     525,   3152, }, },
-        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～", {   15043,  29892,    343,  29915,    497,  29991,   1128,    526,    366,  29871,    243,    162,    155,    132,   1577,  30672,  31522,  30505,  11548,  31041,  30732,  29896,  29941,  29896,  29946,  29896,  29945,  29896,  30408,  30739, }, },
-        { "3"                     , {   29871,  29941, }, },
-        { "33"                    , {   29871,  29941,  29941, }, },
-        { "333"                   , {   29871,  29941,  29941,  29941, }, },
-        { "3333"                  , {   29871,  29941,  29941,  29941,  29941, }, },
-        { "33333"                 , {   29871,  29941,  29941,  29941,  29941,  29941, }, },
-        { "333333"                , {   29871,  29941,  29941,  29941,  29941,  29941,  29941, }, },
-        { "3333333"               , {   29871,  29941,  29941,  29941,  29941,  29941,  29941,  29941, }, },
-        { "33333333"              , {   29871,  29941,  29941,  29941,  29941,  29941,  29941,  29941,  29941, }, },
-        { "333333333"             , {   29871,  29941,  29941,  29941,  29941,  29941,  29941,  29941,  29941,  29941, }, },
-    };
-
-    return _k_tests;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    std::string fname_text;
-    if (argc > 2) {
-        fname_text = argv[2];
-    }
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) {
-        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
-        llama_free_model(model);
-        llama_free(ctx);
-        return 2;
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    bool success = true;
-
-    for (const auto & test_kv : k_tests()) {
-        const std::vector<llama_token> res_bos   = llama_tokenize(ctx, test_kv.first, true);
-        const std::vector<llama_token> res_nobos = llama_tokenize(ctx, test_kv.first, false);
-
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str());
-        printf("tok: ");
-        for (const auto & tok : res_bos) {
-            printf("%d ", tok);
-        }
-        printf("\n");
-
-        bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == llama_token_bos(model);
-
-        for (int i = 0; i < (int) res_nobos.size() && correct; ++i) {
-            if (test_kv.second[i] != res_bos[i + 1]) {
-                correct = false;
-            }
-            if (test_kv.second[i] != res_nobos[i]) {
-                correct = false;
-            }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize_spm(ctx, res_nobos).c_str(),
-                llama_detokenize_spm(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res_nobos) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-
-            success = false;
-        }
-    }
-
-    if (!fname_text.empty()) {
-        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
-
-        std::string text;
-        {
-            std::ifstream ifs(fname_text);
-            if (!ifs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
-                return 1;
-            }
-            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
-        }
-
-        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
-
-        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
-
-        {
-            const std::string fname_out = fname_text + ".tokcpp";
-
-            std::ofstream ofs(fname_out);
-            if (!ofs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
-                return 1;
-            }
-
-            for (const auto & tok : res) {
-                ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
-            }
-        }
-
-        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
-    }
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return success ? 0 : 3;
-}
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@ -0,0 +1,263 @@
+#include "llama.h"
+#include "common.h"
+#include "console.h"
+
+#include <cstdio>
+#include <string>
+#include <map>
+#include <vector>
+#include <fstream>
+
+//static const std::map<std::string, std::vector<llama_token>> & k_tests() {
+//    static std::map<std::string, std::vector<llama_token>> _k_tests = {
+//        { ""                      , {  }, },
+//        { " "                     , {     220, }, },
+//        { "  "                    , {     256, }, },
+//        { "   "                   , {     262, }, },
+//        { "\t"                    , {     197, }, },
+//        { "\n"                    , {     198, }, },
+//        { "\n\n"                  , {     271, }, },
+//        { "\n\n\n"                , {    1432, }, },
+//        { "\t\n"                  , {    1602, }, },
+//        { "Hello world"           , {    9906,   1917, }, },
+//        { " Hello world"          , {   22691,   1917, }, },
+//        { "Hello World"           , {    9906,   4435, }, },
+//        { " Hello World"          , {   22691,   4435, }, },
+//        { " Hello World!"         , {   22691,   4435,      0, }, },
+//        { "Hello, world!"         , {    9906,     11,   1917,      0, }, },
+//        { " Hello, world!"        , {   22691,     11,   1917,      0, }, },
+//        { " this is 🦙.cpp"        , {     420,    374,  11410,     99,    247,     13,  11055, }, },
+//        { "w048 7tuijk dsdfhu"    , {      86,  23904,    220,     22,     83,   2005,  42908,  11729,   3013,  17156, }, },
+//        { "нещо на Български"     , {   79862, 102118,  13373,  64571,  34694,   3114, 112203,  80112, }, },
+//        { "កាន់តែពិសេសអាចខលចេញ"   , {   21549,    222,  98629,    241,  45358,    233,  21549,    237,  45358,    224,  21549,    244,  21549,    115,  21549,    253,  45358,    223,  21549,    253,  21549,     95,  98629,    227,  21549,    223,  21549,    249,  21549,    227,  45358,    223,  21549,    231, }, },
+//        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {    9468,    248,    222,    320,   8416,      8,  27623,    114, 102470,   9468,    234,    104,  31643,    320,  36773, 100166,  98634,      8,  26602,    227,    320,   3323,  43465,    430,    706,   1202,   1866,   4037,      8, }, },
+//        { "Hello"                 , {    9906, }, },
+//        { " Hello"                , {   22691, }, },
+//        { "  Hello"               , {     220,  22691, }, },
+//        { "   Hello"              , {     256,  22691, }, },
+//        { "    Hello"             , {     262,  22691, }, },
+//        { "    Hello\n    Hello"  , {     262,  22691,    198,    262,  22691, }, },
+//        { " ("                    , {     320, }, },
+//        { "\n ="                  , {     198,    284, }, },
+//        { "' era"                 , {       6,  11639, }, },
+//        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～", {    9906,     11,    379,  65948,      0,   2650,    527,    499,  27623,    223,    949,  37046, 101067,  19000,  23182, 102301,   9263,  18136,     16,  36827,  21909, }, },
+//        { "3"                     , {      18, }, },
+//        { "33"                    , {    1644, }, },
+//        { "333"                   , {    8765, }, },
+//        { "3333"                  , {    8765,     18, }, },
+//        { "33333"                 , {    8765,   1644, }, },
+//        { "333333"                , {    8765,   8765, }, },
+//        { "3333333"               , {    8765,   8765,     18, }, },
+//        { "33333333"              , {    8765,   8765,   1644, }, },
+//        { "333333333"             , {    8765,   8765,   8765, }, },
+//    };
+//
+//    return _k_tests;
+//}
+
+// Load the tokenizer test cases that belong to a vocab file.
+//
+//   fname_inp : file with the raw test inputs, separated by "\n__ggml_vocab_test__\n"
+//   fname_out : file with the expected results, one line per input; each line is a
+//               space-separated list of integer token ids
+//
+// Returns a map from input text to expected tokens. If either file cannot be opened, or the
+// number of inputs differs from the number of result lines, an error is printed to stderr and
+// an empty map is returned.
+//
+// Notes:
+//   - identical inputs collapse into a single map entry (the last one wins), and a std::map is
+//     iterated in key order, not in file order
+//   - std::stoi throws on a token that is not a number (e.g. the empty token between two
+//     consecutive spaces); the exception is not handled here
+static std::map<std::string, std::vector<llama_token>> read_tests(const std::string & fname_inp, const std::string & fname_out) {
+    std::map<std::string, std::vector<llama_token>> tests;
+
+    std::ifstream ifs_inp(fname_inp);
+    if (!ifs_inp) {
+        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str());
+        return tests;
+    }
+
+    // slurp the whole input file - the test strings may contain newlines themselves
+    std::string sraw((std::istreambuf_iterator<char>(ifs_inp)), std::istreambuf_iterator<char>());
+
+    std::ifstream ifs_out(fname_out);
+    if (!ifs_out) {
+        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+        return tests;
+    }
+
+    // expected results: exactly one line per test
+    std::vector<std::string> sout;
+    for (std::string line; std::getline(ifs_out, line);) {
+        sout.push_back(line);
+    }
+
+    const std::string sep = "\n__ggml_vocab_test__\n";
+
+    std::vector<std::string> sinp;
+
+    // split the raw input on the separator; a separator at the very end of the file does not
+    // produce an additional (empty) trailing input because the loop stops once pos reaches the end
+    size_t pos = 0;
+    while (pos < sraw.size()) {
+        const size_t next = sraw.find(sep, pos);
+        if (next == std::string::npos) {
+            sinp.push_back(sraw.substr(pos));
+            break;
+        }
+        sinp.push_back(sraw.substr(pos, next - pos));
+        pos = next + sep.size();
+    }
+
+    if (sinp.size() != sout.size()) {
+        fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__);
+        return tests;
+    }
+
+    for (size_t i = 0; i < sinp.size(); ++i) {
+        const std::string & s = sinp[i];
+        // string_strip() is declared outside this file - presumably it trims surrounding whitespace
+        const std::string & o = string_strip(sout[i]);
+
+        std::vector<llama_token> toks;
+
+        // parse the space-separated token ids (this pos shadows the outer one on purpose-free basis:
+        // the outer pos is no longer used at this point)
+        size_t pos = 0;
+        while (pos < o.size()) {
+            size_t next = o.find(' ', pos);
+            if (next == std::string::npos) {
+                next = o.size();
+            }
+            const std::string stok = o.substr(pos, next - pos);
+            toks.push_back(std::stoi(stok));
+            pos = next + 1;
+        }
+
+        tests[s] = toks;
+    }
+
+    return tests;
+}
+
+// Tokenizer regression test.
+//
+// Usage: test-tokenizer-0 vocab-file [text-file]
+//
+// Loads the vocab from <vocab-file>, reads the test cases from <vocab-file>.inp and
+// <vocab-file>.out and checks that tokenizing every input yields the expected tokens.
+// If <text-file> is given, it is tokenized as well and the tokens are written to
+// <text-file>.tokcpp.
+//
+// Exit codes:
+//   0 - all tests passed
+//   1 - usage error, the vocab could not be loaded, a file could not be opened, or no test
+//       cases could be loaded while no text file was given
+//   3 - at least one test case produced unexpected tokens
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    // the test cases live next to the vocab file
+    const std::string fname_inp = fname + ".inp";
+    const std::string fname_out = fname + ".out";
+
+    std::string fname_text;
+    if (argc > 2) {
+        fname_text = argv[2];
+    }
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_model * model;
+    llama_context * ctx;
+
+    llama_backend_init();
+
+    // load the vocab
+    {
+        auto mparams = llama_model_default_params();
+
+        mparams.vocab_only = true;
+
+        model = llama_load_model_from_file(fname.c_str(), mparams);
+
+        if (model == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            return 1;
+        }
+
+        auto cparams = llama_context_default_params();
+
+        ctx = llama_new_context_with_model(model, cparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
+            llama_free_model(model);
+            return 1;
+        }
+    }
+
+#ifdef _WIN32
+    // We need this for unicode console support
+    console::init(false, false);
+    atexit([]() { console::cleanup(); });
+#endif
+
+    bool success = true;
+
+    const auto k_tests = read_tests(fname_inp, fname_out);
+
+    // read_tests() returns an empty map when the .inp/.out files are missing or inconsistent.
+    // Without a text file to tokenize there is then nothing left to do, so report a failure
+    // instead of exiting with 0 and letting a broken test setup pass silently.
+    if (k_tests.empty() && fname_text.empty()) {
+        fprintf(stderr, "%s : error: no tests found\n", __func__);
+        llama_free_model(model);
+        llama_free(ctx);
+        llama_backend_free();
+        return 1;
+    }
+
+    // the expected tokens are compared against a tokenization without special tokens
+    const bool add_special = false;
+
+    for (const auto & test_kv : k_tests) {
+        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special);
+
+        printf("\n");
+        printf("src: '%s'\n", test_kv.first.c_str());
+        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
+        printf("tok: ");
+        for (const auto & tok : res) {
+            printf("%d ", tok);
+        }
+        printf("\n");
+
+        // same length and same token at every position
+        bool correct = res.size() == test_kv.second.size();
+        for (int i = 0; i < (int) res.size() && correct; ++i) {
+            if (test_kv.second[i] != res[i]) {
+                correct = false;
+            }
+        }
+
+        if (!correct) {
+            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
+            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
+                llama_detokenize_bpe(ctx, res).c_str(),
+                llama_detokenize_bpe(ctx, test_kv.second).c_str());
+            fprintf(stderr, "%s : expected tokens: ", __func__);
+            for (const auto & t : test_kv.second) {
+                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+            }
+            fprintf(stderr, "\n");
+            fprintf(stderr, "%s : got tokens:      ", __func__);
+            for (const auto & t : res) {
+                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
+            }
+            fprintf(stderr, "\n");
+
+            // keep going so that all failing cases are reported in one run
+            success = false;
+        }
+    }
+
+    // optional: tokenize a whole text file and dump the tokens next to it
+    if (!fname_text.empty()) {
+        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
+
+        std::string text;
+        {
+            std::ifstream ifs(fname_text);
+            if (!ifs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
+                return 1;
+            }
+            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
+        }
+
+        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
+
+        const std::vector<llama_token> res = llama_tokenize(ctx, text, add_special);
+
+        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
+
+        {
+            const std::string fname_out = fname_text + ".tokcpp";
+
+            std::ofstream ofs(fname_out);
+            if (!ofs) {
+                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+                return 1;
+            }
+
+            // one line per token: the id followed by the detokenized piece
+            for (const auto & tok : res) {
+                ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
+            }
+        }
+
+        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
+    }
+
+    llama_free_model(model);
+    llama_free(ctx);
+
+    llama_backend_free();
+
+    return success ? 0 : 3;
+}
--- a/tests/test-tokenizer-1-llama.cpp
+++ b/tests/test-tokenizer-1-llama.cpp
@ -12,7 +12,7 @@
 #include <thread>
 #include <vector>

-int main(int argc, char **argv) {
+int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
        return 1;
--- a/unicode.cpp
+++ b/unicode.cpp
@ -56,23 +56,22 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
        offset += 4;
        return result;
    }
-    throw std::invalid_argument("invalid string");
+    throw std::invalid_argument("failed to convert utf8 to codepoint");
 }

-static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
-    std::vector<uint16_t> result;
-    if (/* 0x0000 <= cp && */ cp <= 0xffff) {
-        result.emplace_back(cp);
-    }
-    else if (0x10000 <= cp && cp <= 0x10ffff) {
-        result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
-        result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
-    }
-    else {
-        throw std::invalid_argument("invalid cpt");
-    }
-    return result;
-}
+//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
+//    std::vector<uint16_t> result;
+//    if (/* 0x0000 <= cp && */ cp <= 0xffff) {
+//        result.emplace_back(cp);
+//        return result;
+//    }
+//    if (0x10000 <= cp && cp <= 0x10ffff) {
+//        result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
+//        result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
+//        return result;
+//    }
+//    throw std::invalid_argument("failed to convert codepoint to utf16");
+//}

 //static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) {
 //    std::vector<uint16_t> result;
@ -83,28 +82,28 @@ static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
 //    return result;
 //}

-static uint32_t cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
-    assert(offset < utf16.size());
-    if (((utf16[0] >> 10) << 10) != 0xd800) {
-        auto result = utf16[offset + 0];
-        offset += 1;
-        return result;
-    }
-
-    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
-        throw std::invalid_argument("invalid character");
-    }
-
-    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
-    offset += 2;
-    return result;
-}
+//static uint32_t unicode_cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
+//    assert(offset < utf16.size());
+//    if (((utf16[0] >> 10) << 10) != 0xd800) {
+//        auto result = utf16[offset + 0];
+//        offset += 1;
+//        return result;
+//    }
+//
+//    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
+//        throw std::invalid_argument("invalid character");
+//    }
+//
+//    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
+//    offset += 2;
+//    return result;
+//}

 //static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) {
 //    std::vector<uint32_t> result;
 //    size_t offset = 0;
 //    while (offset < utf16.size()) {
-//        result.push_back(cpt_from_utf16(utf16, offset));
+//        result.push_back(unicode_cpt_from_utf16(utf16, offset));
 //    }
 //    return result;
 //}
@ -499,7 +498,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 }

 // use std::wregex to split the text
-static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::vector<size_t> & offsets, const std::wstring & regex_expr) {
+static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
    std::wregex expr(regex_expr);
    std::vector<size_t> bpe_offsets; // store the offset of each word
    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
@ -529,7 +528,7 @@ static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, c
 }

 // use std::regex to split the text
-static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::vector<size_t> & offsets, const std::string & regex_expr) {
+static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::regex expr(regex_expr);
    std::vector<size_t> bpe_offsets; // store the offset of each word
    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
@ -558,10 +557,10 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
    return bpe_offsets;
 }

-static std::vector<size_t> unicode_regex_split_custom(const std::string & regex, const std::string & text, const std::vector<size_t> & offsets) {
+// Dispatch to a hand-written splitter when regex_expr is exactly one of the known
+// pre-tokenizer patterns; every comparison must use the renamed regex_expr parameter.
+static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;

-    if (regex == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
+    if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
        bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
-    } else if (regex == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
+    } else if (regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
@ -576,28 +575,31 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & regex,

+// Encode a single Unicode codepoint as UTF-8.
+//
+// Returns a string of 1 to 4 bytes, depending on the range cp falls into.
+// Throws std::invalid_argument if cp is larger than 0x10ffff.
+// Note: values in the surrogate range (0xd800 - 0xdfff) are not rejected - they fall into the
+// 3-byte branch and are encoded like any other codepoint.
 std::string unicode_cpt_to_utf8(uint32_t cp) {
    std::string result;
+
+    // 1 byte:  0xxxxxxx
    if (/* 0x00 <= cp && */ cp <= 0x7f) {
        result.push_back(cp);
+        return result;
    }
-    else if (0x80 <= cp && cp <= 0x7ff) {
+    // 2 bytes: 110xxxxx 10xxxxxx
+    if (0x80 <= cp && cp <= 0x7ff) {
        result.push_back(0xc0 | ((cp >> 6) & 0x1f));
        result.push_back(0x80 | (cp & 0x3f));
+        return result;
    }
-    else if (0x800 <= cp && cp <= 0xffff) {
+    // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+    if (0x800 <= cp && cp <= 0xffff) {
        result.push_back(0xe0 | ((cp >> 12) & 0x0f));
        result.push_back(0x80 | ((cp >> 6) & 0x3f));
        result.push_back(0x80 | (cp & 0x3f));
+        return result;
    }
-    else if (0x10000 <= cp && cp <= 0x10ffff) {
+    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    if (0x10000 <= cp && cp <= 0x10ffff) {
        result.push_back(0xf0 | ((cp >> 18) & 0x07));
        result.push_back(0x80 | ((cp >> 12) & 0x3f));
        result.push_back(0x80 | ((cp >> 6) & 0x3f));
        result.push_back(0x80 | (cp & 0x3f));
+        return result;
    }
-    else {
-        throw std::invalid_argument("invalid codepoint");
-    }
-    return result;
+
+    // anything above 0x10ffff is outside the Unicode range
+    throw std::invalid_argument("invalid codepoint");
 }

 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
@ -686,7 +688,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std

    const auto cpts = unicode_cpts_from_utf8(text);

-    // generated a "collapsed" representation of the text, where all codepoints are replaced by a single byte
+    // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
    std::string text_collapsed;
    if (need_collapse) {
@ -714,92 +716,90 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std

    for (auto & regex_expr : regex_exprs) {
        // first, see if we have an efficient custom regex implementation
-        auto tmp = unicode_regex_split_custom(regex_expr, text, bpe_offsets);
+        auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);

        if (!tmp.empty()) {
            bpe_offsets = std::move(tmp);
-        } else {
-            // fallback to general-purpose std::regex / std::wregex
-            try {
-                // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
-                // with the corresponding collapsed representation
-                bool use_collapsed = false;
-                for (auto & ucat : k_ucat_enum) {
-                    if (std::string::npos != regex_expr.find(ucat.first)) {
-                        use_collapsed = true;
-                        break;
-                    }
+            continue;
+        }
+
+        // fallback to general-purpose std::regex / std::wregex
+        try {
+            // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
+            // with the corresponding collapsed representation
+            bool use_collapsed = false;
+            for (auto & ucat : k_ucat_enum) {
+                if (std::string::npos != regex_expr.find(ucat.first)) {
+                    use_collapsed = true;
+                    break;
                }
-
-                if (use_collapsed) {
-                    // sanity-check that the original regex does not contain any non-ASCII characters
-                    const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
-                    for (size_t i = 0; i < cpts_regex.size(); ++i) {
-                        if (cpts_regex[i] >= 128) {
-                            throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
-                        }
-                    }
-
-                    // generate a collapsed representation of the regex
-                    std::string regex_expr_collapsed;
-
-                    // track if we are inside [], because nested [] are not allowed
-                    bool inside = false;
-                    for (size_t i = 0; i < regex_expr.size(); ++i) {
-                        if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) {
-                            regex_expr_collapsed += '[';
-                            inside = true;
-                            continue;
-                        }
-
-                        if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') {
-                            regex_expr_collapsed += ']';
-                            inside = false;
-                            continue;
-                        }
-
-                        if (regex_expr[i] == '\\' && i + 1 < regex_expr.size()) {
-                            if (regex_expr[i + 1] == 'p') {
-                                if (i + 3 < regex_expr.size() && regex_expr[i + 2] == '{') {
-                                    if (regex_expr[i + 4] == '}') {
-                                        const std::string pat = regex_expr.substr(i, 5);
-                                        if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
-                                            if (!inside) {
-                                                regex_expr_collapsed += '[';
-                                            }
-                                            regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
-                                            regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
-                                            if (!inside) {
-                                                regex_expr_collapsed += ']';
-                                            }
-                                            i += 4;
-                                            continue;
-                                        }
-                                    }
-                                }
-                            }
-                        }
-
-                        regex_expr_collapsed += regex_expr[i];
-                    }
-
-                    //printf("text_collapsed: %s\n", text_collapsed.c_str());
-                    //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str());
-                    bpe_offsets = unicode_regex_split_stl(text_collapsed, bpe_offsets, regex_expr_collapsed);
-                } else {
-                    // no unicode category used, we can use std::wregex directly
-                    const std::wstring wtext       = unicode_wstring_from_utf8(text);
-                    const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
-
-                    //printf("text: %s\n", text.c_str());
-                    //printf("regex_expr: %s\n", regex_expr.c_str());
-                    bpe_offsets = unicode_regex_split_stl(wtext, bpe_offsets, wregex_expr);
-                }
-            } catch (std::regex_error & e) {
-                fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());
-                fprintf(stderr, "Regex error: %s\n", e.what());
-                throw std::runtime_error("Failed to process regex");
            }
+
+            if (use_collapsed) {
+                // sanity-check that the original regex does not contain any non-ASCII characters
+                const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
+                for (size_t i = 0; i < cpts_regex.size(); ++i) {
+                    if (cpts_regex[i] >= 128) {
+                        throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
+                    }
+                }
+
+                // generate a collapsed representation of the regex
+                std::string regex_expr_collapsed;
+
+                // track if we are inside [], because nested [] are not allowed
+                bool inside = false;
+                for (size_t i = 0; i < regex_expr.size(); ++i) {
+                    if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) {
+                        regex_expr_collapsed += '[';
+                        inside = true;
+                        continue;
+                    }
+
+                    if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') {
+                        regex_expr_collapsed += ']';
+                        inside = false;
+                        continue;
+                    }
+
+                    if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
+                        regex_expr[i + 1] == 'p' &&
+                        regex_expr[i + 2] == '{' &&
+                        regex_expr[i + 4] == '}') {
+                        const std::string pat = regex_expr.substr(i, 5);
+                        if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
+                            if (!inside) {
+                                regex_expr_collapsed += '[';
+                            }
+                            regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
+                            regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
+                            if (!inside) {
+                                regex_expr_collapsed += ']';
+                            }
+                            i += 4;
+                            continue;
+                        }
+                    }
+
+                    regex_expr_collapsed += regex_expr[i];
+                }
+
+                //printf("text_collapsed: %s\n", text_collapsed.c_str());
+                //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str());
+                bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
+            } else {
+                // no unicode category used, we can use std::wregex directly
+                const std::wstring wtext       = unicode_wstring_from_utf8(text);
+                const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
+
+                //printf("text: %s\n", text.c_str());
+                //printf("regex_expr: %s\n", regex_expr.c_str());
+                bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
+            }
+        } catch (std::regex_error & e) {
+            fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());
+            fprintf(stderr, "Regex error: %s\n", e.what());
+            throw std::runtime_error("Failed to process regex");
        }
    }