commit a1616e9f72

    Merge branch 'master' into gg/flash-attn

    ggml-ci

82 changed files with 3896 additions and 1063 deletions
.github/workflows/bench.yml (2 changes, vendored)

@@ -32,7 +32,7 @@ on:
     - cron: '04 2 * * *'
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}-${{ github.event.inputs.sha }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
   cancel-in-progress: true
 
 jobs:
.github/workflows/build.yml (57 changes, vendored)

@@ -593,6 +593,63 @@ jobs:
         run: |
           make swift
 
+  windows-msys2:
+    runs-on: windows-latest
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { sys: UCRT64,  env: ucrt-x86_64,  build: Release }
+          - { sys: CLANG64, env: clang-x86_64, build: Release }
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+
+      - name: Setup ${{ matrix.sys }}
+        uses: msys2/setup-msys2@v2
+        with:
+          update: true
+          msystem: ${{matrix.sys}}
+          install: >-
+            base-devel
+            mingw-w64-${{matrix.env}}-toolchain
+            mingw-w64-${{matrix.env}}-cmake
+            mingw-w64-${{matrix.env}}-openblas
+
+      - name: Build using make
+        shell: msys2 {0}
+        run: |
+            make -j $(nproc)
+
+      - name: Clean after building using make
+        shell: msys2 {0}
+        run: |
+            make clean
+
+      - name: Build using make w/ OpenBLAS
+        shell: msys2 {0}
+        run: |
+            make LLAMA_OPENBLAS=1 -j $(nproc)
+
+      - name: Build using CMake
+        shell: msys2 {0}
+        run: |
+            cmake -B build
+            cmake --build build --config ${{ matrix.build }} -j $(nproc)
+
+      - name: Clean after building using CMake
+        shell: msys2 {0}
+        run: |
+            rm -rf build
+
+      - name: Build using CMake w/ OpenBLAS
+        shell: msys2 {0}
+        run: |
+            cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
+            cmake --build build --config ${{ matrix.build }} -j $(nproc)
+
   windows-latest-cmake:
     runs-on: windows-latest
.github/workflows/python-lint.yml (2 changes, vendored)

@@ -21,4 +21,4 @@ jobs:
         uses: py-actions/flake8@v2
         with:
           ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
-          exclude: "examples/*,examples/*/**,*/**/__init__.py"
+          exclude: "examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py"
.github/workflows/server.yml (29 changes, vendored)

@@ -23,7 +23,7 @@ on:
     - cron: '2 4 * * *'
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref || github.run_id }}
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 
 jobs:

@@ -41,23 +41,16 @@ jobs:
             sanitizer: ""
       fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
 
-    container:
-      image: ubuntu:latest
-      ports:
-        - 8888
-      options: --cpus 4
-
     steps:
       - name: Dependencies
         id: depends
         run: |
-          apt-get update
-          apt-get -y install \
+          sudo apt-get update
+          sudo apt-get -y install \
             build-essential \
             xxd \
             git \
             cmake \
-            python3-pip \
             curl \
             wget \
             language-pack-en \

@@ -70,6 +63,17 @@ jobs:
           fetch-depth: 0
           ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
 
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r examples/server/tests/requirements.txt
+
       - name: Verify server deps
         id: verify_server_deps
         run: |

@@ -100,10 +104,6 @@ jobs:
             -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
           cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
 
-      - name: Tests dependencies
-        id: test_dependencies
-        run: |
-          pip install -r examples/server/tests/requirements.txt
-
       - name: Tests
         id: server_integration_tests

@@ -129,6 +129,7 @@ jobs:
         uses: actions/checkout@v4
         with:
           fetch-depth: 0
+          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
 
       - name: libCURL
         id: get_libcurl
.gitignore (15 changes, vendored)

@@ -108,3 +108,18 @@ examples/server/*.mjs.hpp
 poetry.lock
 poetry.toml
 nppBackup
+
+# Test binaries
+/tests/test-grammar-parser
+/tests/test-llama-grammar
+/tests/test-double-float
+/tests/test-grad0
+/tests/test-opt
+/tests/test-quantize-fns
+/tests/test-quantize-perf
+/tests/test-sampling
+/tests/test-tokenizer-0
+/tests/test-tokenizer-1-spm
+/tests/test-tokenizer-1-bpe
+/tests/test-rope
+/tests/test-backend-ops
CMakeLists.txt

@@ -43,6 +43,8 @@ else()
     set(LLAMA_METAL_DEFAULT OFF)
 endif()
 
+set(LLAMA_LLAMAFILE_DEFAULT ON)
+
 # general
 option(BUILD_SHARED_LIBS "build shared libraries" OFF)
 option(LLAMA_STATIC      "llama: static link libraries" OFF)
Makefile (57 changes)

@@ -6,11 +6,23 @@ BUILD_TARGETS = \
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
-	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
-	tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \
-	tests/test-json-schema-to-grammar tests/test-grammar-integration
+	tests/test-autorelease \
+	tests/test-backend-ops \
+	tests/test-double-float \
+	tests/test-grad0 \
+	tests/test-grammar-integration \
+	tests/test-grammar-parser \
+	tests/test-json-schema-to-grammar \
+	tests/test-llama-grammar \
+	tests/test-model-load-cancel \
+	tests/test-opt \
+	tests/test-quantize-fns \
+	tests/test-quantize-perf \
+	tests/test-rope \
+	tests/test-sampling \
+	tests/test-tokenizer-0 \
+	tests/test-tokenizer-1-bpe \
+	tests/test-tokenizer-1-spm
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report

@@ -27,6 +39,17 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
+# In GNU make default CXX is g++ instead of c++. Let's fix that so that users
+# of non-gcc compilers don't have to provide g++ alias or wrapper.
+DEFCC  := cc
+DEFCXX := c++
+ifeq ($(origin CC),default)
+CC  := $(DEFCC)
+endif
+ifeq ($(origin CXX),default)
+CXX := $(DEFCXX)
+endif
+
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)

@@ -49,11 +72,17 @@ default: $(BUILD_TARGETS)
 
 test: $(TEST_TARGETS)
 	@failures=0; \
 	for test_target in $(TEST_TARGETS); do \
-		if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \
-			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
+		if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
+			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
+			./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
 			continue; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 			continue; \

@@ -768,7 +797,7 @@ batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -971,11 +1000,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

@@ -983,7 +1008,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
+tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
README.md

@@ -20,7 +20,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ### Hot topics
 
-- **MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387**
+- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
+- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
 - Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017

@@ -138,6 +139,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
+- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
 
 **HTTP server**
common/common.cpp

@@ -234,8 +234,54 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return result;
 }
 
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+    const char * sep = strchr(data, '=');
+    if (sep == nullptr || sep - data >= 128) {
+        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+        return false;
+    }
+    llama_model_kv_override kvo;
+    std::strncpy(kvo.key, data, sep - data);
+    kvo.key[sep - data] = 0;
+    sep++;
+    if (strncmp(sep, "int:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+        kvo.val_i64 = std::atol(sep);
+    } else if (strncmp(sep, "float:", 6) == 0) {
+        sep += 6;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+        kvo.val_f64 = std::atof(sep);
+    } else if (strncmp(sep, "bool:", 5) == 0) {
+        sep += 5;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+        if (std::strcmp(sep, "true") == 0) {
+            kvo.val_bool = true;
+        } else if (std::strcmp(sep, "false") == 0) {
+            kvo.val_bool = false;
+        } else {
+            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+            return false;
+        }
+    } else if (strncmp(sep, "str:", 4) == 0) {
+        sep += 4;
+        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+        if (strlen(sep) > 127) {
+            fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+            return false;
+        }
+        strncpy(kvo.val_str, sep, 127);
+        kvo.val_str[127] = '\0';
+    } else {
+        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+        return false;
+    }
+    overrides.emplace_back(std::move(kvo));
+    return true;
+}
+
 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
-    llama_sampling_params& sparams = params.sparams;
+    llama_sampling_params & sparams = params.sparams;
 
     if (arg == "-s" || arg == "--seed") {
         if (++i >= argc) {
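Note: a minimal standalone sketch of the KEY=TYPE:VALUE grammar the new parse_kv_override() helper accepts. KvOverride below is an illustrative stand-in for llama_model_kv_override from llama.h, not the real struct:

    // Sketch only: split "KEY=TYPE:VALUE" into key, type tag and raw value text.
    #include <cstdio>
    #include <cstring>
    #include <string>

    struct KvOverride {
        std::string key;
        std::string type; // "int", "float", "bool" or "str"
        std::string raw;  // unparsed value text
    };

    static bool parse_override_sketch(const char * data, KvOverride & out) {
        const char * sep = strchr(data, '=');
        if (sep == nullptr || sep - data >= 128) {
            fprintf(stderr, "malformed KV override '%s'\n", data);
            return false;
        }
        out.key.assign(data, sep - data);
        sep++; // skip '='
        for (const char * t : { "int:", "float:", "bool:", "str:" }) {
            const size_t n = strlen(t);
            if (strncmp(sep, t, n) == 0) {
                out.type.assign(t, n - 1); // type name without the ':'
                out.raw = sep + n;         // value text, converted later per tag
                return true;
            }
        }
        fprintf(stderr, "invalid type for KV override '%s'\n", data);
        return false;
    }

    int main() {
        KvOverride kvo;
        // mirrors the documented usage: --override-kv tokenizer.ggml.add_bos_token=bool:false
        if (parse_override_sketch("tokenizer.ggml.add_bos_token=bool:false", kvo)) {
            printf("key=%s type=%s value=%s\n", kvo.key.c_str(), kvo.type.c_str(), kvo.raw.c_str());
        }
        return 0;
    }

With this commit the helper also accepts str: values of up to 127 characters, in addition to int, float and bool.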
@@ -1093,6 +1139,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.n_print = std::stoi(argv[i]);
         return true;
     }
+    if (arg == "--check-tensors") {
+        params.check_tensors = true;
+        return true;
+    }
     if (arg == "--ppl-output-type") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1244,47 +1294,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
-        char * sep = strchr(argv[i], '=');
-        if (sep == nullptr || sep - argv[i] >= 128) {
-            fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-            invalid_param = true;
-            return true;
-        }
-        struct llama_model_kv_override kvo;
-        std::strncpy(kvo.key, argv[i], sep - argv[i]);
-        kvo.key[sep - argv[i]] = 0;
-        sep++;
-        if (strncmp(sep, "int:", 4) == 0) {
-            sep += 4;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-            kvo.int_value = std::atol(sep);
-        }
-        else if (strncmp(sep, "float:", 6) == 0) {
-            sep += 6;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-            kvo.float_value = std::atof(sep);
-        }
-        else if (strncmp(sep, "bool:", 5) == 0) {
-            sep += 5;
-            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-            if (std::strcmp(sep, "true") == 0) {
-                kvo.bool_value = true;
-            }
-            else if (std::strcmp(sep, "false") == 0) {
-                kvo.bool_value = false;
-            }
-            else {
-                fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                invalid_param = true;
-                return true;
-            }
-        }
-        else {
+        if (!parse_kv_override(argv[i], params.kv_overrides)) {
             fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
             invalid_param = true;
             return true;
         }
-        params.kv_overrides.push_back(kvo);
         return true;
     }
 #ifndef LOG_DISABLE_LOGS
@@ -1556,9 +1570,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("                        types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf("  -ptc N, --print-token-count N\n");
     printf("                        print token count every N tokens (default: %d)\n", params.n_print);
+    printf("  --check-tensors       check model tensor data for invalid values\n");
     printf("\n");
 #ifndef LOG_DISABLE_LOGS
     log_print_usage();
@@ -1683,6 +1698,18 @@ std::vector<std::string> string_split(std::string input, char separator) {
     return parts;
 }
 
+std::string string_strip(const std::string & str) {
+    size_t start = 0;
+    size_t end = str.size();
+    while (start < end && std::isspace(str[start])) {
+        start++;
+    }
+    while (end > start && std::isspace(str[end - 1])) {
+        end--;
+    }
+    return str.substr(start, end - start);
+}
+
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
     std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
         {"top_k", llama_sampler_type::TOP_K},
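Note: string_strip() trims whitespace from both ends without modifying its input. A self-contained copy of the new helper with a quick usage check (the asserts are illustrative):

    #include <cassert>
    #include <cctype>
    #include <string>

    // same body as the helper added above
    static std::string string_strip(const std::string & str) {
        size_t start = 0;
        size_t end   = str.size();
        while (start < end && std::isspace(str[start])) { start++; }
        while (end > start && std::isspace(str[end - 1])) { end--; }
        return str.substr(start, end - start);
    }

    int main() {
        assert(string_strip("  hello world \t\n") == "hello world");
        assert(string_strip("\n\n") == "");  // all-whitespace input yields ""
        return 0;
    }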
@@ -1779,6 +1806,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
     mparams.tensor_split  = params.tensor_split;
     mparams.use_mmap      = params.use_mmap;
     mparams.use_mlock     = params.use_mlock;
+    mparams.check_tensors = params.check_tensors;
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
     } else {
common/common.h

@@ -162,6 +162,7 @@ struct gpt_params {
     bool dump_kv_cache     = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload     = false; // disable KV offloading
     bool warmup            = true;  // warmup run
+    bool check_tensors     = false; // validate tensor data
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V

@@ -171,6 +172,8 @@ struct gpt_params {
     std::string image = ""; // path to an image file
 };
 
+bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

@@ -194,6 +197,7 @@ bool validate_file_name(const std::string & filename);
 std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
 std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
 std::vector<std::string> string_split(std::string input, char separator);
+std::string string_strip(const std::string & str);
 std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
 
 //
common/log.h

@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 //  USE LOG() INSTEAD
 //
-#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
     #define LOG_IMPL(str, ...) \
     do { \
         if (LOG_TARGET != nullptr) \

@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
 // INTERNAL, DO NOT USE
 //  USE LOG_TEE() INSTEAD
 //
-#if !defined(_MSC_VER) or defined(__INTEL_LLVM_COMPILER)
+#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
     #define LOG_TEE_IMPL(str, ...) \
     do { \
         if (LOG_TARGET != nullptr) \
common/sampling.cpp

@@ -68,7 +68,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
 
 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
     if (seed == LLAMA_DEFAULT_SEED) {
-        seed = time(NULL);
+        seed = std::random_device{}();
     }
     ctx->rng.seed(seed);
 }
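Note on the seeding change, as a sketch: time(NULL) ticks once per second, so two contexts created in the same second previously received identical RNG streams, while std::random_device draws from the OS entropy source:

    #include <cstdio>
    #include <random>

    int main() {
        // two generators seeded back-to-back from the entropy source
        std::mt19937 a(std::random_device{}());
        std::mt19937 b(std::random_device{}());
        // with time(NULL) seeding these streams would almost certainly match;
        // here they differ with overwhelming probability
        printf("%u %u\n", a(), b());
        return 0;
    }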
convert-hf-to-gguf-update.py (new file, 275 lines)

@@ -0,0 +1,275 @@
+# This script downloads the tokenizer models of the specified models from Huggingface and
+# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
+#
+# This is necessary in order to analyze the type of pre-tokenizer used by the model and
+# provide the necessary information to llama.cpp via the GGUF header in order to implement
+# the same pre-tokenizer.
+#
+# ref: https://github.com/ggerganov/llama.cpp/pull/6920
+#
+# Instructions:
+#
+# - Add a new model to the "models" list
+# - Run the script with your huggingface token:
+#
+#   python3 convert-hf-to-gguf-update.py <huggingface_token>
+#
+# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py
+# - Update llama.cpp with the new pre-tokenizer if necessary
+#
+# TODO: generate tokenizer tests for llama.cpp
+# TODO: automate the update of convert-hf-to-gguf.py
+#
+
+import os
+import requests
+import sys
+import json
+
+from hashlib import sha256
+from enum import IntEnum, auto
+
+class TOKENIZER_TYPE(IntEnum):
+    SPM = auto()
+    BPE = auto()
+    WPM = auto()
+
+# TODO: this string has to exercise as much pre-tokenizer functionality as possible
+#       will be updated with time - contributions welcome
+chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+
+if len(sys.argv) == 2:
+    token = sys.argv[1]
+else:
+    print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+    sys.exit(1)
+
+# TODO: add models here, base models preferred
+models = [
+    { "name": "llama-spm",      "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
+    { "name": "llama-bpe",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
+    { "name": "phi-3",          "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
+    { "name": "deepseek-llm",   "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
+    { "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
+    { "name": "falcon",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
+    { "name": "bert-bge",       "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
+    { "name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
+    { "name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
+    { "name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
+]
+
+# make directory "models/tokenizers" if it doesn't exist
+if not os.path.exists("models/tokenizers"):
+    os.makedirs("models/tokenizers")
+
+def download_file_with_auth(url, token, save_path):
+    headers = {"Authorization": f"Bearer {token}"}
+    response = requests.get(url, headers=headers)
+    if response.status_code == 200:
+        with open(save_path, 'wb') as f:
+            f.write(response.content)
+        print(f"File {save_path} downloaded successfully")
+    else:
+        print(f"Failed to download file. Status code: {response.status_code}")
+
+# download the tokenizer models
+for model in models:
+    name = model["name"]
+    repo = model["repo"]
+    tokt = model["tokt"]
+
+    if not os.path.exists(f"models/tokenizers/{name}"):
+        os.makedirs(f"models/tokenizers/{name}")
+    else:
+        print(f"Directory models/tokenizers/{name} already exists - skipping")
+        continue
+
+    print(f"Downloading {name} to models/tokenizers/{name}")
+
+    url = f"{repo}/raw/main/config.json"
+    save_path = f"models/tokenizers/{name}/config.json"
+    download_file_with_auth(url, token, save_path)
+
+    url = f"{repo}/raw/main/tokenizer.json"
+    save_path = f"models/tokenizers/{name}/tokenizer.json"
+    download_file_with_auth(url, token, save_path)
+
+    if tokt == TOKENIZER_TYPE.SPM:
+        url = f"{repo}/resolve/main/tokenizer.model"
+        save_path = f"models/tokenizers/{name}/tokenizer.model"
+        download_file_with_auth(url, token, save_path)
+
+    url = f"{repo}/raw/main/tokenizer_config.json"
+    save_path = f"models/tokenizers/{name}/tokenizer_config.json"
+    download_file_with_auth(url, token, save_path)
+
+# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
+# TODO: auto-update convert-hf-to-gguf.py with the generated function
+
+src_ifs = ""
+for model in models:
+    name = model["name"]
+    tokt = model["tokt"]
+
+    if tokt == TOKENIZER_TYPE.SPM:
+        continue
+
+    # create the tokenizer
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+
+    chktok = tokenizer.encode(chktxt)
+    chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+    print(f"model: {name}")
+    print(f"tokt: {tokt}")
+    print(f"repo: {model['repo']}")
+    print(f"chktok: {chktok}")
+    print(f"chkhsh: {chkhsh}")
+
+    # print the "pre_tokenizer" content from the tokenizer.json
+    with open(f"models/tokenizers/{name}/tokenizer.json", "r") as f:
+        cfg = json.load(f)
+        pre_tokenizer = cfg["pre_tokenizer"]
+        print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
+
+    print(f"\n")
+
+    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
+    src_ifs += f"            # ref: {model['repo']}\n"
+    src_ifs += f"            res = \"{name}\"\n"
+
+src_func = ""
+src_func += "    def get_vocab_base_pre(self, tokenizer) -> str:\n"
+src_func += "        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
+src_func += "        # is specific for the BPE pre-tokenizer used by the model\n"
+src_func += "        # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
+src_func += "        # use in llama.cpp to implement the same pre-tokenizer\n"
+src_func += "\n"
+src_func += f"        chktxt = {repr(chktxt)}\n"
+src_func += "\n"
+src_func += "        chktok = tokenizer.encode(chktxt)\n"
+src_func += "        chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
+src_func += "\n"
+src_func += "        print(f\"chktok: {chktok}\")\n"
+src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
+src_func += "\n"
+src_func += "        res = None\n"
+src_func += "\n"
+src_func += "        # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n"
+src_func += "        #       don't do this manually - use the convert-hf-to-gguf-update.py script!\n"
+src_func += f"{src_ifs}\n"
+src_func += "        if res is None:\n"
+src_func += "            print(\"\\n\")\n"
+src_func += "            print(\"**************************************************************************************\")\n"
+src_func += "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
+src_func += "            print(\"** This means that it was not added yet or you are using an older version.\")\n"
+src_func += "            print(\"** Check convert-hf-to-gguf-update.py and update it accordingly.\")\n"
+src_func += "            print(\"**\")\n"
+src_func += "            print(f\"** chkhsh: {chkhsh}\")\n"
+src_func += "            print(\"**************************************************************************************\")\n"
+src_func += "            print(\"\\n\")\n"
+src_func += "            raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
+src_func += "\n"
+src_func += "        print(f\"tokenizer.ggml.pre: {res}\")\n"
+src_func += "        print(f\"chkhsh: {chkhsh}\")\n"
+src_func += "\n"
+src_func += "        return res\n"
+
+print(src_func)
+
+print("\n")
+print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
+print("\n")
+
+# generate tests for each tokenizer model
+
+tests = [
+    "",
+    " ",
+    "  ",
+    "   ",
+    "\t",
+    "\n",
+    "\n\n",
+    "\n\n\n",
+    "\t\n",
+    "Hello world",
+    " Hello world",
+    "Hello World",
+    " Hello World",
+    " Hello World!",
+    "Hello, world!",
+    " Hello, world!",
+    " this is 🦙.cpp",
+    "w048 7tuijk dsdfhu",
+    "нещо на Български",
+    "កាន់តែពិសេសអាចខលចេញ",
+    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
+    "Hello",
+    " Hello",
+    "  Hello",
+    "   Hello",
+    "    Hello",
+    "    Hello\n    Hello",
+    " (",
+    "\n =",
+    "' era",
+    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
+    "3",
+    "33",
+    "333",
+    "3333",
+    "33333",
+    "333333",
+    "3333333",
+    "33333333",
+    "333333333",
+    chktxt,
+]
+
+# write the tests to ./models/ggml-vocab-{name}.gguf.inp
+# the format is:
+#
+#   test0
+#   __ggml_vocab_test__
+#   test1
+#   __ggml_vocab_test__
+#   ...
+#
+
+# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
+# for each test, write the resulting tokens on a separate line
+
+for model in models:
+    name = model["name"]
+    tokt = model["tokt"]
+
+    # create the tokenizer
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
+
+    with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
+        for text in tests:
+            f.write(f"{text}")
+            f.write("\n__ggml_vocab_test__\n")
+
+    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
+        for text in tests:
+            res = tokenizer.encode(text, add_special_tokens=False)
+            for r in res:
+                f.write(f" {r}")
+            f.write("\n")
+
+    print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
+
+# generate commands for creating vocab files
+
+print("\nRun the following commands to generate the vocab files for testing:\n")
+
+for model in models:
+    name = model["name"]
+
+    print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
+
+print("\n")
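Note: the chkhsh fingerprint above is just SHA-256 over the Python str() of the encoded token list. A sketch of the same computation in C++, where the token ids are made-up stand-ins and OpenSSL's SHA256() is assumed (link with -lcrypto):

    #include <cstdio>
    #include <string>
    #include <vector>
    #include <openssl/sha.h>

    int main() {
        std::vector<int> chktok = {9707, 11, 1879}; // stand-in token ids
        // build the Python-style repr "[9707, 11, 1879]" so the digest
        // matches sha256(str(chktok).encode()).hexdigest() in the script
        std::string repr = "[";
        for (size_t i = 0; i < chktok.size(); i++) {
            repr += (i ? ", " : "") + std::to_string(chktok[i]);
        }
        repr += "]";

        unsigned char md[SHA256_DIGEST_LENGTH];
        SHA256((const unsigned char *) repr.data(), repr.size(), md);
        for (unsigned char c : md) { printf("%02x", c); } // hex digest
        printf("\n");
        return 0;
    }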
convert-hf-to-gguf.py

@@ -11,6 +11,7 @@ import sys
 from abc import ABC, abstractmethod
 from enum import IntEnum
 from pathlib import Path
+from hashlib import sha256
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast
 
 import numpy as np

@@ -229,7 +230,7 @@ class Model(ABC):
         return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1))
 
     # used for GPT-2 BPE and WordPiece vocabs
-    def get_basic_vocab(self) -> tuple[list[str], list[int]]:
+    def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         tokens: list[str] = []
         toktypes: list[int] = []
 

@@ -238,6 +239,8 @@ class Model(ABC):
         vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
         assert max(tokenizer.vocab.values()) < vocab_size
 
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()

@@ -255,11 +258,75 @@ class Model(ABC):
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)
 
-        return tokens, toktypes
+        return tokens, toktypes, tokpre
+
+    # NOTE: this function is generated by convert-hf-to-gguf-update.py
+    #       do not modify it manually!
+    # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
+    def get_vocab_base_pre(self, tokenizer) -> str:
+        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
+        # is specific for the BPE pre-tokenizer used by the model
+        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
+        # use in llama.cpp to implement the same pre-tokenizer
+
+        chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
+
+        chktok = tokenizer.encode(chktxt)
+        chkhsh = sha256(str(chktok).encode()).hexdigest()
+
+        print(f"chktok: {chktok}")
+        print(f"chkhsh: {chkhsh}")
+
+        res = None
+
+        # NOTE: if you get an error here, you need to add the model to the if-elif chain below
+        #       don't do this manually - use the convert-hf-to-gguf-update.py script!
+        if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
+            # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
+            res = "llama-bpe"
+        if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base
+            res = "deepseek-llm"
+        if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821":
+            # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base
+            res = "deepseek-coder"
+        if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed":
+            # ref: https://huggingface.co/tiiuae/falcon-7b
+            res = "falcon"
+        if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
+            # ref: https://huggingface.co/BAAI/bge-small-en-v1.5
+            res = "bert-bge"
+        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+            # ref: https://huggingface.co/mosaicml/mpt-7b
+            res = "mpt"
+        if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34":
+            # ref: https://huggingface.co/bigcode/starcoder2-3b
+            res = "starcoder"
+        if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
+            # ref: https://huggingface.co/openai-community/gpt2
+            res = "gpt-2"
+
+        if res is None:
+            print("\n")
+            print("**************************************************************************************")
+            print("** WARNING: The BPE pre-tokenizer was not recognized!")
+            print("** This means that it was not added yet or you are using an older version.")
+            print("** Check convert-hf-to-gguf-update.py and update it accordingly.")
+            print("**")
+            print(f"** chkhsh: {chkhsh}")
+            print("**************************************************************************************")
+            print("\n")
+            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
+
+        print(f"tokenizer.ggml.pre: {res}")
+        print(f"chkhsh: {chkhsh}")
+
+        return res
 
     def _set_vocab_gpt2(self) -> None:
-        tokens, toktypes = self.get_basic_vocab()
+        tokens, toktypes, tokpre = self.get_vocab_base()
         self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)

@@ -277,6 +344,8 @@ class Model(ABC):
         vocab_size = hparams["vocab_size"]
         assert max(tokenizer.get_vocab().values()) < vocab_size
 
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
         merges = []
         vocab = {}
         mergeable_ranks = tokenizer.mergeable_ranks

@@ -304,6 +373,7 @@ class Model(ABC):
             toktypes.append(gguf.TokenType.NORMAL)
 
         self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)

@@ -376,6 +446,7 @@ class Model(ABC):
         assert len(tokens) == vocab_size
 
         self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)

@@ -397,6 +468,7 @@ class Model(ABC):
         assert len(tokens) == vocab.vocab_size
 
         self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)

@@ -840,6 +912,7 @@ class XverseModel(Model):
             toktypes.append(toktype)
 
         self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)

@@ -1335,6 +1408,11 @@ class LlamaModel(Model):
         self.gguf_writer.add_vocab_size(hparams["vocab_size"])
         self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
 
+        if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+            if self.hparams["rope_scaling"].get("type") == "linear":
+                self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
+                self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+
     # Same as super class, but permuting q_proj, k_proj
     def write_tensors(self):
         block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))

@@ -2052,6 +2130,7 @@ class Phi3MiniModel(Model):
                 toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
         self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)

@@ -2294,6 +2373,7 @@ class InternLM2Model(Model):
             toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
 
         self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)

@@ -2443,7 +2523,7 @@ class BertModel(Model):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        tokens, toktypes = self.get_basic_vocab()
+        tokens, toktypes, tokpre = self.get_vocab_base()
         self.vocab_size = len(tokens)
 
         # we need this to validate the size of the token_type embeddings

@@ -2461,6 +2541,7 @@ class BertModel(Model):
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
         self.gguf_writer.add_token_types(toktypes)

@@ -2482,6 +2563,10 @@ class BertModel(Model):
             print(f"Can not map tensor {name!r}")
             sys.exit()
 
+        # convert any unsupported data types to float32
+        if data_torch.dtype not in (torch.float16, torch.float32):
+            data_torch = data_torch.to(torch.float32)
+
         data = data_torch.squeeze().numpy()
         n_dims = len(data.shape)
         new_dtype: type[np.floating[Any]]

@@ -2638,6 +2723,9 @@ class MambaModel(Model):
         field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
         self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
 
+        field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE)
+        self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]))
+
         field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
         self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])

@@ -2843,6 +2931,7 @@ def parse_args() -> argparse.Namespace:
         help="directory containing model file",
     )
     parser.add_argument("--use-temp-file", action="store_true", help="use the tempfile library while processing (helpful when running out of memory, process killed)")
+    parser.add_argument("--model-name", type=str, default=None, help="name of the model")
 
     return parser.parse_args()
convert-llama-ggml-to-gguf.py

@@ -281,6 +281,7 @@ class GGMLToGGUF:
     def add_vocab(self, gguf_writer):
         hp = self.model.hyperparameters
        gguf_writer.add_tokenizer_model('llama')
+        gguf_writer.add_tokenizer_pre('default')
        tokens = []
        scores = []
        toktypes = []
convert-persimmon-to-gguf.py

@@ -99,6 +99,7 @@ def main():
 
     tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
     gguf_writer.add_tokenizer_model('llama')
+    gguf_writer.add_tokenizer_pre('default')
     gguf_writer.add_token_list(tokens)
     gguf_writer.add_token_scores(scores)
     gguf_writer.add_token_types(toktypes)
examples/imatrix/imatrix.cpp

@@ -23,6 +23,7 @@ struct Stats {
 };
 
 struct StatParams {
+    std::string dataset;
     std::string ofile = "imatrix.dat";
     int         n_output_frequency = 10;
     int         verbosity = 1;

@@ -46,7 +47,7 @@ private:
     std::vector<float> m_src1_data;
     std::vector<char>  m_ids; // the expert ids from ggml_mul_mat_id
     //
-    void save_imatrix(const char * file_name) const;
+    void save_imatrix(const char * file_name, const char * dataset) const;
     void keep_imatrix(int ncall) const;
 };
 

@@ -199,7 +200,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 }
 
 void IMatrixCollector::save_imatrix() const {
-    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
+    save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
 }
 
 void IMatrixCollector::keep_imatrix(int ncall) const {

@@ -207,24 +208,33 @@ void IMatrixCollector::keep_imatrix(int ncall) const {
     if (file_name.empty()) file_name = "imatrix.dat";
     file_name += ".at_";
     file_name += std::to_string(ncall);
-    save_imatrix(file_name.c_str());
+    save_imatrix(file_name.c_str(), m_params.dataset.c_str());
 }
 
-void IMatrixCollector::save_imatrix(const char * fname) const {
+void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
     std::ofstream out(fname, std::ios::binary);
     int n_entries = m_stats.size();
-    out.write((const char*)&n_entries, sizeof(n_entries));
-    for (auto& p : m_stats) {
+    out.write((const char *) &n_entries, sizeof(n_entries));
+    for (const auto & p : m_stats) {
         int len = p.first.size();
-        out.write((const char*)&len, sizeof(len));
+        out.write((const char *) &len, sizeof(len));
         out.write(p.first.c_str(), len);
-        out.write((const char*)&p.second.ncall, sizeof(p.second.ncall));
+        out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
         int nval = p.second.values.size();
-        out.write((const char*)&nval, sizeof(nval));
-        if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float));
+        out.write((const char *) &nval, sizeof(nval));
+        if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
     }
 
+    // Write the number of call the matrix was computed with
+    out.write((const char *) &m_last_call, sizeof(m_last_call));
+
+    // Write the dataset name at the end of the file to later on specify it in quantize
+    int n_dataset = strlen(dataset);
+    out.write((const char *) &n_dataset, sizeof(n_dataset));
+    out.write(dataset, n_dataset);
+
     if (m_params.verbosity > 0) {
         fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
|
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
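Taken together, the writes in save_imatrix() define the on-disk layout: an entry count, then per entry a length-prefixed name, the call count and the value array, and now a trailer carrying m_last_call and the length-prefixed dataset name. A hedged reader sketch that mirrors those writes one for one — older files simply stop before the trailer, which is why the quantize-side loader further down peeks for EOF first:

// sketch: reader mirroring the imatrix writes above (not part of the diff)
#include <cstdio>
#include <fstream>
#include <string>
#include <vector>

bool read_imatrix(const char * fname) {
    std::ifstream in(fname, std::ios::binary);
    int n_entries = 0;
    in.read((char *) &n_entries, sizeof(n_entries));
    for (int e = 0; e < n_entries && in; ++e) {
        int len = 0;
        in.read((char *) &len, sizeof(len));
        std::string name(len, 0);
        in.read(&name[0], len);
        int ncall = 0, nval = 0;
        in.read((char *) &ncall, sizeof(ncall));
        in.read((char *) &nval, sizeof(nval));
        std::vector<float> values(nval);
        in.read((char *) values.data(), nval * sizeof(float));
    }
    // trailer added by this change; older files end here
    if (in.peek() != EOF) {
        int last_call = 0, n_dataset = 0;
        in.read((char *) &last_call, sizeof(last_call));
        in.read((char *) &n_dataset, sizeof(n_dataset));
        std::string dataset(n_dataset, 0);
        in.read(&dataset[0], n_dataset);
        printf("dataset = %s, chunks = %d\n", dataset.c_str(), last_call);
    }
    return (bool) in;
}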

@@ -547,6 +557,29 @@ int main(int argc, char ** argv) {
         }
     }

+    gpt_params params;
+    params.n_batch = 512;
+    if (!gpt_params_parse(args.size(), args.data(), params)) {
+        return 1;
+    }
+
+    params.logits_all = true;
+    params.n_batch = std::min(params.n_batch, params.n_ctx);
+
+    print_build_info();
+
+    if (params.seed == LLAMA_DEFAULT_SEED) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    sparams.dataset = params.prompt_file;
     g_collector.set_parameters(std::move(sparams));

     if (!combine_files.empty()) {
@@ -585,28 +618,6 @@ int main(int argc, char ** argv) {
         }
     }

-    gpt_params params;
-    params.n_batch = 512;
-    if (!gpt_params_parse(args.size(), args.data(), params)) {
-        return 1;
-    }
-
-    params.logits_all = true;
-    params.n_batch = std::min(params.n_batch, params.n_ctx);
-
-    print_build_info();
-
-    if (params.seed == LLAMA_DEFAULT_SEED) {
-        params.seed = time(NULL);
-    }
-
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
-
-    std::mt19937 rng(params.seed);
-    if (params.random_prompt) {
-        params.prompt = gpt_random_prompt(rng);
-    }
-
     llama_backend_init();
     llama_numa_init(params.numa);


@@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD "%s.position_embd.weight"
 #define TN_CLASS_EMBD "v.class_embd"
 #define TN_PATCH_EMBD "v.patch_embd.weight"
+#define TN_PATCH_BIAS "v.patch_embd.bias"
 #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@@ -425,6 +426,7 @@ struct clip_vision_model {
     // embeddings
     struct ggml_tensor * class_embedding;
     struct ggml_tensor * patch_embeddings;
+    struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;

     struct ggml_tensor * pre_ln_w;
@@ -501,6 +503,11 @@ struct clip_ctx {
     bool use_gelu = false;
     int32_t ftype = 1;

+    bool has_class_embedding = true;
+    bool has_pre_norm = true;
+    bool has_post_norm = false;
+    bool has_patch_bias = false;
+
     struct gguf_context * ctx_gguf;
     struct ggml_context * ctx_data;

@@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
     const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions = num_patches + 1;
+    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
@@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
     inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

+    if (ctx->has_patch_bias) {
+        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+    }
+
     // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+    struct ggml_tensor * embeddings = inp;
+    if (ctx->has_class_embedding) {
+        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+        embeddings = ggml_acc(ctx0, embeddings, inp,
+                embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+    }
     ggml_set_name(embeddings, "embeddings");
     ggml_set_input(embeddings);

-    embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
-
-    embeddings = ggml_acc(ctx0, embeddings, inp,
-            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);

     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
@@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

     // pre-layernorm
-    {
+    if (ctx->has_pre_norm) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");

@@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = cur;
     }

+    // post-layernorm
+    if (ctx->has_post_norm) {
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "post_ln");
+
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
+    }
+
     // llava projector
     {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1149,11 +1171,38 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }

     try {
-        vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
         vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-        vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+        new_clip->has_class_embedding = true;
+    } catch (const std::exception& e) {
+        new_clip->has_class_embedding = false;
+    }
+
+    try {
         vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
         vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+        new_clip->has_pre_norm = true;
+    } catch (std::exception & e) {
+        new_clip->has_pre_norm = false;
+    }
+
+    try {
+        vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
+        vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
+        new_clip->has_post_norm = true;
+    } catch (std::exception & e) {
+        new_clip->has_post_norm = false;
+    }
+
+    try {
+        vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
+        new_clip->has_patch_bias = true;
+    } catch (std::exception & e) {
+        new_clip->has_patch_bias = false;
+    }
+
+    try {
+        vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+        vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
     } catch(const std::exception& e) {
         LOG_TEE("%s: failed to load vision model tensors\n", __func__);
     }
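
The load hunk above repeats one pattern four times: probe for an optional tensor with get_tensor inside a try/catch and record a has_* flag instead of aborting the load. Distilled into a hedged helper sketch — get_tensor here stands in for clip.cpp's own helper, and its throwing-on-missing behavior is an assumption taken from the hunk:

// sketch: probe an optional tensor, recording presence instead of failing
#include <exception>
#include <functional>

struct ggml_tensor;
struct ggml_context;

static bool try_get_tensor(const std::function<ggml_tensor * (ggml_context *, const char *)> & get_tensor,
                           ggml_context * ctx, const char * name, ggml_tensor ** out) {
    try {
        *out = get_tensor(ctx, name); // assumed to throw when the tensor is absent
        return true;
    } catch (const std::exception &) {
        *out = nullptr;
        return false;
    }
}

// usage, mirroring the hunk above:
//   new_clip->has_patch_bias = try_get_tensor(get_tensor, new_clip->ctx_data,
//                                             TN_PATCH_BIAS, &vision_model.patch_bias);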

@@ -1,6 +1,6 @@
 set(TARGET quantize)
 add_executable(${TARGET} quantize.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
 target_include_directories(${TARGET} PRIVATE ../../common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)

@@ -8,7 +8,6 @@
 #include <unordered_map>
 #include <fstream>
 #include <cmath>
-#include <algorithm>

 struct quant_option {
     std::string name;
@@ -53,6 +52,10 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
 };

+static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count";
+static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count";
+
 static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
     std::string ftype_str;
@@ -113,7 +116,7 @@ static void usage(const char * executable) {
     exit(1);
 }

-static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
     std::ifstream in(imatrix_file.c_str(), std::ios::binary);
     if (!in) {
         printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
@@ -160,18 +163,33 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
             printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
         }
     }
-    printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
+
+    // latest imatrix version contains the dataset filename at the end of the file
+    int m_last_call = 0;
+    if (in.peek() != EOF) {
+        in.read((char *)&m_last_call, sizeof(m_last_call));
+        int dataset_len;
+        in.read((char *)&dataset_len, sizeof(dataset_len));
+        std::vector<char> dataset_as_vec(dataset_len);
+        in.read(dataset_as_vec.data(), dataset_len);
+        imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end());
+        printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str());
+    }
+    printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call);
+    return m_last_call;
 }

-static void prepare_imatrix(const std::string & imatrix_file,
+static int prepare_imatrix(const std::string & imatrix_file,
+        std::string & imatrix_dataset,
         const std::vector<std::string> & included_weights,
         const std::vector<std::string> & excluded_weights,
         std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
+    int m_last_call = -1;
     if (!imatrix_file.empty()) {
-        load_imatrix(imatrix_file, imatrix_data);
+        m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data);
     }
     if (imatrix_data.empty()) {
-        return;
+        return m_last_call;
     }
     if (!excluded_weights.empty()) {
         for (auto& name : excluded_weights) {
@@ -197,6 +215,7 @@ static void prepare_imatrix(const std::string & imatrix_file,
     if (!imatrix_data.empty()) {
         printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size()));
     }
+    return m_last_call;
 }

 static ggml_type parse_ggml_type(const char * arg) {

@@ -211,43 +230,6 @@ static ggml_type parse_ggml_type(const char * arg) {
     return result;
 }

-static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
-    const char* sep = strchr(data, '=');
-    if (sep == nullptr || sep - data >= 128) {
-        fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
-        return false;
-    }
-    llama_model_kv_override kvo;
-    std::strncpy(kvo.key, data, sep - data);
-    kvo.key[sep - data] = 0;
-    sep++;
-    if (strncmp(sep, "int:", 4) == 0) {
-        sep += 4;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-        kvo.int_value = std::atol(sep);
-    } else if (strncmp(sep, "float:", 6) == 0) {
-        sep += 6;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-        kvo.float_value = std::atof(sep);
-    } else if (strncmp(sep, "bool:", 5) == 0) {
-        sep += 5;
-        kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-        if (std::strcmp(sep, "true") == 0) {
-            kvo.bool_value = true;
-        } else if (std::strcmp(sep, "false") == 0) {
-            kvo.bool_value = false;
-        } else {
-            fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
-            return false;
-        }
-    } else {
-        fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
-        return false;
-    }
-    overrides.emplace_back(std::move(kvo));
-    return true;
-}

 int main(int argc, char ** argv) {
     if (argc < 3) {
         usage(argv[0]);

@@ -316,10 +298,43 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }

+    std::string imatrix_dataset;
     std::unordered_map<std::string, std::vector<float>> imatrix_data;
-    prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data);
+    int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data);
     if (!imatrix_data.empty()) {
         params.imatrix = &imatrix_data;
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_file.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+        if (!imatrix_dataset.empty()) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+            strncpy(kvo.val_str, imatrix_dataset.c_str(), 127);
+            kvo.val_str[127] = '\0';
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = imatrix_data.size();
+            kv_overrides.emplace_back(std::move(kvo));
+        }
+
+        if (m_last_call > 0) {
+            llama_model_kv_override kvo;
+            std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
+            kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+            kvo.val_i64 = m_last_call;
+            kv_overrides.emplace_back(std::move(kvo));
+        }
     }
     if (!kv_overrides.empty()) {
         kv_overrides.emplace_back();
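
Once quantize has run, the four overrides above land in the output model as ordinary GGUF metadata. A hedged sketch of reading them back at run time — this assumes llama_model_meta_val_str() from llama.h, which returns a negative value when a key is absent (for example when the model was quantized without an imatrix):

// sketch: report which imatrix (if any) a quantized model was made with
#include "llama.h"
#include <cstdio>

static void print_imatrix_meta(const struct llama_model * model) {
    char buf[128];
    if (llama_model_meta_val_str(model, "quantize.imatrix.file", buf, sizeof(buf)) >= 0) {
        printf("imatrix file   : %s\n", buf);
    }
    if (llama_model_meta_val_str(model, "quantize.imatrix.dataset", buf, sizeof(buf)) >= 0) {
        printf("imatrix dataset: %s\n", buf);
    }
    if (llama_model_meta_val_str(model, "quantize.imatrix.chunks_count", buf, sizeof(buf)) >= 0) {
        printf("imatrix chunks : %s\n", buf);
    }
}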

@@ -90,7 +90,8 @@ export default function () {
         "model": model,
         "stream": true,
         "seed": 42,
-        "max_tokens": max_tokens
+        "max_tokens": max_tokens,
+        "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
     }

     const params = {method: 'POST', body: JSON.stringify(payload)};

@@ -1207,6 +1207,27 @@ struct server_context {
             LOG_VERBOSE("eos token found", {});
         }

+        auto n_ctx_train = llama_n_ctx_train(model);
+        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1
+                && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
+            LOG_WARNING("n_predict is not set and self-context extend is disabled."
+                        " Limiting generated tokens to n_ctx_train to avoid EOS-less generation infinite loop", {
+                { "id_slot", slot.id },
+                { "params.n_predict", slot.params.n_predict },
+                { "slot.n_prompt_tokens", slot.n_prompt_tokens },
+                { "slot.n_decoded", slot.n_decoded },
+                { "slot.n_predict", slot.n_predict },
+                { "n_slots", params.n_parallel },
+                { "slot.n_ctx", slot.n_ctx },
+                { "n_ctx", n_ctx },
+                { "n_ctx_train", n_ctx_train },
+                { "ga_n", slot.ga_n },
+            });
+            slot.truncated = true;
+            slot.stopped_limit = true;
+            slot.has_next_token = false; // stop prediction
+        }
+
         LOG_VERBOSE("next token", {
             {"id_slot", slot.id},
             {"id_task", slot.id_task},
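The new guard fires only in a narrow case: no token limit was requested at either the request or the slot level, self-extend is off (ga_n == 1), and the slot has already consumed the model's training context. Restated as a standalone predicate — a sketch using the same names as the hunk:

// sketch of the stop condition added above
static bool should_force_stop(int params_n_predict, int slot_n_predict, int ga_n,
                              int n_prompt_tokens, int n_decoded, int n_ctx_train) {
    return params_n_predict < 1 && slot_n_predict < 1 && ga_n == 1
        && n_prompt_tokens + n_decoded >= n_ctx_train;
}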

@@ -2141,7 +2162,7 @@ struct server_context {
         });

         // process the created batch of tokens
-        for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
+        for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

             for (auto & slot : slots) {

@@ -2372,7 +2393,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");
     printf("                            advanced option to override model metadata by key. may be specified multiple times.\n");
-    printf("                            types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
+    printf("                            types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
     printf("  -gan N, --grp-attn-n N    set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n");
     printf("  -gaw N, --grp-attn-w N    set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n");
     printf("  --chat-template JINJA_TEMPLATE\n");

@@ -2805,43 +2826,11 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 invalid_param = true;
                 break;
             }
-            char * sep = strchr(argv[i], '=');
-            if (sep == nullptr || sep - argv[i] >= 128) {
-                fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
-                invalid_param = true;
-                break;
-            }
-
-            struct llama_model_kv_override kvo;
-            std::strncpy(kvo.key, argv[i], sep - argv[i]);
-            kvo.key[sep - argv[i]] = 0;
-            sep++;
-            if (strncmp(sep, "int:", 4) == 0) {
-                sep += 4;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
-                kvo.int_value = std::atol(sep);
-            } else if (strncmp(sep, "float:", 6) == 0) {
-                sep += 6;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
-                kvo.float_value = std::atof(sep);
-            } else if (strncmp(sep, "bool:", 5) == 0) {
-                sep += 5;
-                kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
-                if (std::strcmp(sep, "true") == 0) {
-                    kvo.bool_value = true;
-                } else if (std::strcmp(sep, "false") == 0) {
-                    kvo.bool_value = false;
-                } else {
-                    fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
-                    invalid_param = true;
-                    break;
-                }
-            } else {
+            if (!parse_kv_override(argv[i], params.kv_overrides)) {
                 fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
                 invalid_param = true;
                 break;
             }
-            params.kv_overrides.push_back(kvo);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             server_print_usage(argv[0], default_params, default_sparams);
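
With this hunk the server reuses the shared parse_kv_override() helper — the same logic whose private copy was deleted from quantize.cpp above — and per the updated usage text it now also accepts str: values. A hedged usage sketch against the signature visible in the removed code (it assumes the common/ helper is in scope):

// sketch: building overrides from KEY=TYPE:VALUE strings
std::vector<llama_model_kv_override> overrides;
bool ok = parse_kv_override("tokenizer.ggml.add_bos_token=bool:false", overrides)
       && parse_kv_override("quantize.imatrix.chunks_count=int:100", overrides);
// on success each entry carries kvo.key, kvo.tag and the parsed value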

6  flake.lock  generated
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1713537308,
-        "narHash": "sha256-XtTSSIB2DA6tOv+l0FhvfDMiyCmhoRbNB+0SeInZkbk=",
+        "lastModified": 1714076141,
+        "narHash": "sha256-Drmja/f5MRHZCskS6mvzFqxEaZMeciScCTFxWVLqWEY=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "5c24cf2f0a12ad855f444c30b2421d044120c66f",
+        "rev": "7bb2ccd8cdc44c91edba16c48d2c8f331fb3d856",
         "type": "github"
       },
       "original": {

@@ -1784,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {

 void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     // reset state for the next run
+    if (!sched->is_reset) {
         size_t hash_size = sched->hash_set.size;
         memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
         memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
         memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);

         sched->is_reset = true;
+    }
     sched->is_alloc = false;
 }

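The is_reset flag makes repeated resets nearly free: the three memsets over hash-table-sized arrays run only when the scheduler state is actually dirty. The shape of the guard in miniature — a sketch, not ggml's real structures:

#include <stdbool.h>
#include <string.h>

struct sched_state {
    int  tensor_backend_id[4096];
    bool is_reset;
    bool is_alloc;
};

static void sched_reset(struct sched_state * s) {
    if (!s->is_reset) { // skip the expensive part when already clean
        memset(s->tensor_backend_id, -1, sizeof(s->tensor_backend_id));
        s->is_reset = true;
    }
    s->is_alloc = false; // cheap flag, cleared unconditionally
}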

@@ -5,16 +5,16 @@

 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
 static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) {
-    const int64_t i = 2*(blockDim.x*blockIdx.x + threadIdx.x);
+    const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x);

     if (i >= k) {
         return;
     }

     const int64_t ib = i/qk; // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
+    const int64_t iqs = (i%qk)/qr; // quant index
+    const int64_t iybs = i - i%qk; // y block start index
+    const int64_t y_offset = qr == 1 ? 1 : qk/2;

     // dequantize
     dfloat2 v;
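
The casts matter because CUDA's blockDim/blockIdx/threadIdx components are 32-bit: without the (int64_t) promotion the arithmetic wraps modulo 2^32 before the assignment widens it, so sufficiently large tensors index the wrong elements. A host-side illustration of the wrap, with values chosen so the inner product still fits in 32 bits but the doubled index does not:

#include <cstdint>
#include <cstdio>

int main() {
    const unsigned blockDim_x = 256, blockIdx_x = 10000000, threadIdx_x = 0;
    // old form: computed entirely in 32-bit unsigned arithmetic, wraps at 2^32
    const int64_t bad  = 2 * (blockDim_x * blockIdx_x + threadIdx_x);
    // new form: promoting one operand makes the whole multiply 64-bit
    const int64_t good = (int64_t) 2 * (blockDim_x * blockIdx_x + threadIdx_x);
    printf("bad = %lld, good = %lld\n", (long long) bad, (long long) good);
    // prints: bad = 825032704, good = 5120000000
}

The same pattern repeats throughout the dequantize kernels below: thread-local indices (tid, il, ib, ...) are widened to int64_t so that products like i*QK_K stay exact.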

@@ -29,7 +29,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
 #if __CUDA_ARCH__ >= CC_PASCAL
     constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;

-    const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
+    const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
     const int * x0 = ((int *) vx) + blockIdx.x * nint;
     half2 * y2 = (half2 *) (y + i0);

@@ -73,9 +73,9 @@ static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t
     const int64_t i = blockIdx.x;

     // assume 32 threads
-    const int tid = threadIdx.x;
-    const int il = tid/8;
-    const int ir = tid%8;
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8;
+    const int64_t ir = tid%8;
     const int64_t ib = 8*i + ir;
     if (ib >= nb32) {
         return;
@@ -101,9 +101,9 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
     const int64_t i = blockIdx.x;

     // assume 32 threads
-    const int tid = threadIdx.x;
-    const int il = tid/8;
-    const int ir = tid%8;
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8;
+    const int64_t ir = tid%8;
     const int64_t ib = 8*i + ir;
     if (ib >= nb32) {
         return;
@@ -127,14 +127,14 @@ static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
     const block_q2_K * x = (const block_q2_K *) vx;

-    const int tid = threadIdx.x;
+    const int64_t tid = threadIdx.x;
 #if QK_K == 256
-    const int n = tid/32;
-    const int l = tid - 32*n;
-    const int is = 8*n + l/16;
+    const int64_t n = tid/32;
+    const int64_t l = tid - 32*n;
+    const int64_t is = 8*n + l/16;

     const uint8_t q = x[i].qs[32*n + l];
     dst_t * y = yy + i*QK_K + 128*n;
@@ -146,8 +146,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
     y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
 #else
-    const int is = tid/16; // 0 or 1
-    const int il = tid%16; // 0...15
+    const int64_t is = tid/16; // 0 or 1
+    const int64_t il = tid%16; // 0...15
     const uint8_t q = x[i].qs[il] >> (2*is);
     dst_t * y = yy + i*QK_K + 16*is + il;
     float dall = __low2half(x[i].dm);
@@ -161,19 +161,19 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
     const block_q3_K * x = (const block_q3_K *) vx;

 #if QK_K == 256
-    const int r = threadIdx.x/4;
-    const int tid = r/2;
-    const int is0 = r%2;
-    const int l0 = 16*is0 + 4*(threadIdx.x%4);
-    const int n = tid / 4;
-    const int j = tid - 4*n;
+    const int64_t r = threadIdx.x/4;
+    const int64_t tid = r/2;
+    const int64_t is0 = r%2;
+    const int64_t l0 = 16*is0 + 4*(threadIdx.x%4);
+    const int64_t n = tid / 4;
+    const int64_t j = tid - 4*n;

     uint8_t m = 1 << (4*n + j);
-    int is = 8*n + 2*j + is0;
+    int64_t is = 8*n + 2*j + is0;
     int shift = 2*j;

     int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
@@ -189,11 +189,11 @@ static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t

     for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
 #else
-    const int tid = threadIdx.x;
-    const int is = tid/16; // 0 or 1
-    const int il = tid%16; // 0...15
-    const int im = il/8;   // 0...1
-    const int in = il%8;   // 0...7
+    const int64_t tid = threadIdx.x;
+    const int64_t is = tid/16; // 0 or 1
+    const int64_t il = tid%16; // 0...15
+    const int64_t im = il/8;   // 0...1
+    const int64_t in = il%8;   // 0...7

     dst_t * y = yy + i*QK_K + 16*is + il;

@@ -227,15 +227,15 @@ template<typename dst_t>
 static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const block_q4_K * x = (const block_q4_K *) vx;

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;

 #if QK_K == 256
     // assume 32 threads
-    const int tid = threadIdx.x;
-    const int il = tid/8;
-    const int ir = tid%8;
-    const int is = 2*il;
-    const int n = 4;
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8;
+    const int64_t ir = tid%8;
+    const int64_t is = 2*il;
+    const int64_t n = 4;

     dst_t * y = yy + i*QK_K + 64*il + n*ir;

@@ -254,7 +254,7 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t
         y[l +32] = d2 * (q[l] >> 4) - m2;
     }
 #else
-    const int tid = threadIdx.x;
+    const int64_t tid = threadIdx.x;
     const uint8_t * q = x[i].qs;
     dst_t * y = yy + i*QK_K;
     const float d = (float)x[i].dm[0];
@@ -268,14 +268,14 @@ template<typename dst_t>
 static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) {
     const block_q5_K * x = (const block_q5_K *) vx;

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;

 #if QK_K == 256
     // assume 64 threads - this is very slightly better than the one below
-    const int tid = threadIdx.x;
-    const int il = tid/16; // il is in 0...3
-    const int ir = tid%16; // ir is in 0...15
-    const int is = 2*il;   // is is in 0...6
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/16; // il is in 0...3
+    const int64_t ir = tid%16; // ir is in 0...15
+    const int64_t is = 2*il;   // is is in 0...6

     dst_t * y = yy + i*QK_K + 64*il + 2*ir;

@@ -298,11 +298,11 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t
     y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2;
     y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2;
 #else
-    const int tid = threadIdx.x;
+    const int64_t tid = threadIdx.x;
     const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;  // 0...3
-    const int in = tid%8;  // 0...7
-    const int is = tid/16; // 0 or 1
+    const int64_t im = tid/8;  // 0...3
+    const int64_t in = tid%8;  // 0...7
+    const int64_t is = tid/16; // 0 or 1
     const uint8_t h = x[i].qh[in] >> im;
     const float d = x[i].d;
     dst_t * y = yy + i*QK_K + tid;
@@ -359,13 +359,13 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
     const block_iq2_xxs * x = (const block_iq2_xxs *) vx;

-    const int tid = threadIdx.x;
+    const int64_t tid = threadIdx.x;
 #if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint16_t * q2 = x[i].qs + 4*ib;
     const uint8_t * aux8 = (const uint8_t *)q2;
@@ -383,13 +383,13 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
     const block_iq2_xs * x = (const block_iq2_xs *) vx;

-    const int tid = threadIdx.x;
+    const int64_t tid = threadIdx.x;
 #if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint16_t * q2 = x[i].qs + 4*ib;
     const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511));
@@ -405,13 +405,13 @@ static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst
 template<typename dst_t>
 static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
     const block_iq2_s * x = (const block_iq2_s *) vx;

-    const int tid = threadIdx.x;
+    const int64_t tid = threadIdx.x;
 #if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300)));
     const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f;
@@ -426,13 +426,13 @@ static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
     const block_iq3_xxs * x = (const block_iq3_xxs *) vx;

-    const int tid = threadIdx.x;
+    const int64_t tid = threadIdx.x;
 #if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint8_t * q3 = x[i].qs + 8*ib;
     const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib;
@@ -454,13 +454,13 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
 template<typename dst_t>
 static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
     const block_iq3_s * x = (const block_iq3_s *) vx;

-    const int tid = threadIdx.x;
+    const int64_t tid = threadIdx.x;
 #if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint8_t * qs = x[i].qs + 8*ib;
     const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
@@ -480,13 +480,13 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
     const block_iq1_s * x = (const block_iq1_s *) vx;

-    const int tid = threadIdx.x;
+    const int64_t tid = threadIdx.x;
 #if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA;
     const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1);
@@ -506,18 +506,18 @@ static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
     const block_iq1_m * x = (const block_iq1_m *) vx;

-    const int tid = threadIdx.x;
+    const int64_t tid = threadIdx.x;
 #if QK_K == 256
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 8*il;
     const uint16_t * sc = (const uint16_t *)x[i].scales;
     iq1m_scale_t scale;
     scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
-    const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
+    const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4);
     const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1);
     const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA;
     uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32;
@@ -537,12 +537,12 @@ static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) {

-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
     const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL);

-    const int tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 4*il;
     const uint8_t * q4 = x[ib].qs + 4*il;
     const float d = (float)x[ib].d;
@@ -556,12 +556,12 @@ static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst
 #if QK_K != 64
 template<typename dst_t>
 static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) {
-    const int i = blockIdx.x;
+    const int64_t i = blockIdx.x;
     const block_iq4_xs * x = (const block_iq4_xs *)vx;

-    const int tid = threadIdx.x;
-    const int il = tid/8; // 0...3
-    const int ib = tid%8; // 0...7
+    const int64_t tid = threadIdx.x;
+    const int64_t il = tid/8; // 0...3
+    const int64_t ib = tid%8; // 0...7
     dst_t * y = yy + i*QK_K + 32*ib + 4*il;
     const uint8_t * q4 = x[i].qs + 16*ib + 4*il;
     const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32);

@@ -38,7 +38,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
     extern __shared__ float data_soft_max_f32[];
     float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication
     // shared memory buffer to cache values between iterations:
-    float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols;
+    float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols;

     float max_val = -INFINITY;

@@ -50,8 +50,8 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
             break;
         }

-        const int ix = rowx*ncols + col;
-        const int iy = rowy*ncols + col;
+        const int64_t ix = (int64_t)rowx*ncols + col;
+        const int64_t iy = (int64_t)rowy*ncols + col;

         const float val = x[ix]*scale + (mask ? t2f32(mask[iy]) : 0.0f) + (pos ? slope*t2f32(pos[col]) : 0.0f);

@@ -119,7 +119,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
         return;
     }

-    const int idst = rowx*ncols + col;
+    const int64_t idst = (int64_t)rowx*ncols + col;
     dst[idst] = vals[col] * inv_sum;
 }
 }
|
|
284
ggml-quants.c
284
ggml-quants.c
|
@@ -12383,3 +12383,287 @@ void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k)
     block_iq2_s * restrict y = vy;
     quantize_row_iq2_s_reference(x, y, k);
 }
+
+static bool validate_float(float f, size_t i) {
+    if (isinf(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
+        return false;
+    }
+
+    if (isnan(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
+        return false;
+    }
+
+    return true;
+}
+
+static bool isinf_fp16(ggml_fp16_t f) {
+    return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0;
+}
+
+static bool isnan_fp16(ggml_fp16_t f) {
+    return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0;
+}
+
+static bool validate_fp16(ggml_fp16_t f, size_t i) {
+    if (isinf_fp16(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
+        return false;
+    }
+
+    if (isnan_fp16(f)) {
+        fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i);
+        return false;
+    }
+
+    return true;
+}
+
+#define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \
+    const type * q = (const type *) (data); \
+    for (size_t i = 0; i < (nb); ++i) { \
+        if (!validate_fp16(q[i].d, i)) { \
+            return false; \
+        } \
+    }
+
+#define VALIDATE_ROW_DATA_DM_F16_IMPL(type, data, nb, d, m) \
+    const type * q = (const type *) (data); \
+    for (size_t i = 0; i < (nb); ++i) { \
+        if (!validate_fp16(q[i].d, i) || !validate_fp16(q[i].m, i)) { \
+            return false; \
+        } \
+    }
+
+bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) {
+    if (type < 0 || type >= GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid type %d\n", __func__, type);
+        return false;
+    }
+
+    if (nbytes % ggml_type_size(type) != 0) {
+        fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type);
+        return false;
+    }
+
+    const size_t nb = nbytes/ggml_type_size(type);
+
+    switch (type) {
+        case GGML_TYPE_F16:
+            {
+                const ggml_fp16_t * f = (const ggml_fp16_t *) data;
+                size_t i = 0;
+#if defined(__AVX2__)
+                for (; i + 15 < nb; i += 16) {
+                    __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
+                    __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00));
+                    __m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00));
+                    int mask = _mm256_movemask_epi8(cmp);
+                    if (mask) {
+                        for (size_t j = 0; j < 16; ++j) {
+                            if (!validate_fp16(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#elif defined(__ARM_NEON)
+                for (; i + 7 < nb; i += 8) {
+                    uint16x8_t v = vld1q_u16(f + i);
+                    uint16x8_t vexp = vandq_u16(v, vdupq_n_u16(0x7c00));
+                    uint16x8_t cmp = vceqq_u16(vexp, vdupq_n_u16(0x7c00));
+                    uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(cmp, 4)), 0);
+                    if (mask) {
+                        for (size_t j = 0; j < 8; ++j) {
+                            if (!validate_fp16(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#endif
+                for (; i < nb; ++i) {
+                    if (!validate_fp16(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_F32:
+            {
+                const float * f = (const float *) data;
+                size_t i = 0;
+#if defined(__AVX2__)
+                for (; i + 7 < nb; i += 8) {
+                    __m256i v = _mm256_loadu_si256((const __m256i *)(f + i));
+                    __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000));
+                    __m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000));
+                    int mask = _mm256_movemask_epi8(cmp);
+                    if (mask) {
+                        for (size_t j = 0; j < 8; ++j) {
+                            if (!validate_float(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#elif defined(__ARM_NEON)
+                for (; i + 3 < nb; i += 4) {
+                    uint32x4_t v = vld1q_u32((const uint32_t *)f + i);
+                    uint32x4_t vexp = vandq_u32(v, vdupq_n_u32(0x7f800000));
+                    uint32x4_t cmp = vceqq_u32(vexp, vdupq_n_u32(0x7f800000));
+                    uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(cmp, 8)), 0);
+                    if (mask) {
+                        for (size_t j = 0; j < 4; ++j) {
+                            if (!validate_float(f[i + j], i + j)) {
+                                return false;
+                            }
+                        }
+                        GGML_UNREACHABLE();
+                    }
+                }
+#endif
+                for (; i < nb; ++i) {
+                    if (!validate_float(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_F64:
+            {
+                const double * f = (const double *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    if (!validate_float(f[i], i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_Q4_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb);
+            } break;
+        case GGML_TYPE_Q4_1:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m);
+            } break;
+        case GGML_TYPE_Q5_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_0, data, nb);
+            } break;
+        case GGML_TYPE_Q5_1:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_1, data, nb, d, m);
+            } break;
+        case GGML_TYPE_Q8_0:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb);
+            } break;
+        case GGML_TYPE_Q2_K:
+            {
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin);
+            } break;
+        case GGML_TYPE_Q3_K:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_K, data, nb);
+            } break;
+        case GGML_TYPE_Q4_K:
+            {
+            #ifdef GGML_QKK_64
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d[0], d[1]);
+            #else
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin);
+            #endif
+            } break;
+        case GGML_TYPE_Q5_K:
+            {
+            #ifdef GGML_QKK_64
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_K, data, nb);
+            #else
+                VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin);
+            #endif
+            } break;
+        case GGML_TYPE_Q6_K:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_K, data, nb);
+            } break;
+        case GGML_TYPE_Q8_K:
+            {
+                const block_q8_K * q = (const block_q8_K *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    if (!validate_float(q[i].d, i)) {
+                        return false;
+                    }
+                }
+            } break;
+        case GGML_TYPE_IQ1_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ1_M:
+            {
+                const block_iq1_m * q = (const block_iq1_m *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                #if QK_K == 64
+                    if (!validate_fp16(q[i].d, i)) {
+                        return false;
+                    }
+                #else
+                    iq1m_scale_t scale;
+                    const uint16_t * sc = (const uint16_t *)q[i].scales;
+                    scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+                    if (!validate_fp16(scale.f16, i)) {
+                        return false;
+                    }
+                #endif
+                }
+            } break;
+        case GGML_TYPE_IQ2_XXS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xxs, data, nb);
+            } break;
+        case GGML_TYPE_IQ2_XS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xs, data, nb);
+            } break;
+        case GGML_TYPE_IQ2_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ3_XXS:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_xxs, data, nb);
+            } break;
+
+        case GGML_TYPE_IQ3_S:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb);
+            } break;
+        case GGML_TYPE_IQ4_XS:
+        #if QK_K != 64
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb);
+            } break;
+        #endif
+        // with QK_K == 64, iq4_xs is iq4_nl
+        case GGML_TYPE_IQ4_NL:
+            {
+                VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
+            } break;
+        case GGML_TYPE_I8:
+        case GGML_TYPE_I16:
+        case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+            // nothing to validate
+            break;
+        default:
+            {
+                fprintf(stderr, "%s: invalid type %d\n", __func__, type);
+                return false;
+            }
+    }
+
+    return true;
+}
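The `isinf_fp16`/`isnan_fp16` helpers added above classify IEEE 754 binary16 values straight from the bit pattern: an all-ones exponent field (mask 0x7c00) means infinity when the mantissa (mask 0x03ff) is zero, and NaN otherwise. The AVX2/NEON loops use the same exponent mask only as a fast prefilter and re-run the scalar check to pinpoint the offending element. A standalone sketch of the bit test, assuming (as the masks above imply) that the fp16 type is a `uint16_t` holding the raw bits:

```cpp
#include <cstdint>
#include <cstdio>

typedef uint16_t fp16_bits; // stand-in for ggml_fp16_t (raw IEEE binary16 bits)

static bool isinf_fp16(fp16_bits f) { return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0; }
static bool isnan_fp16(fp16_bits f) { return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0; }

int main() {
    const fp16_bits pos_inf = 0x7c00; // sign=0, exp=11111, mantissa=0
    const fp16_bits a_nan   = 0x7e00; // exp=11111, mantissa!=0
    const fp16_bits one     = 0x3c00; // 1.0 in binary16

    printf("0x7c00: inf=%d nan=%d\n", isinf_fp16(pos_inf), isnan_fp16(pos_inf)); // inf=1 nan=0
    printf("0x7e00: inf=%d nan=%d\n", isinf_fp16(a_nan),   isnan_fp16(a_nan));   // inf=0 nan=1
    printf("0x3c00: inf=%d nan=%d\n", isinf_fp16(one),     isnan_fp16(one));     // inf=0 nan=0
    return 0;
}
```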
ggml-sycl.cpp

@@ -13416,11 +13416,16 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
     version += std::to_string(prop.get_minor_version());
 
     device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");
+    std::string name = std::string(prop.get_name());
+    name = std::regex_replace(name, std::regex("\\(R\\)"), "");
+    name = std::regex_replace(name, std::regex("\\(TM\\)"), "");
 
-    fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
-            prop.get_name(), version.c_str(), prop.get_max_compute_units(),
+    auto global_mem_size = prop.get_global_mem_size()/1000000;
+
+    fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(),
+            name.c_str(), version.c_str(), prop.get_max_compute_units(),
             prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
-            prop.get_global_mem_size());
+            global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
 }
 
 void ggml_backend_sycl_print_sycl_devices() {
@@ -13428,9 +13433,10 @@ void ggml_backend_sycl_print_sycl_devices() {
     int device_count = dpct::dev_mgr::instance().device_count();
     std::map<std::string, size_t> DeviceNums;
     fprintf(stderr, "found %d SYCL devices:\n", device_count);
-    fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
-    fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
-    fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
+    fprintf(stderr, "| | | | |Max | |Max |Global | |\n");
+    fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n");
+    fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n");
+    fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n");
     for (int id = 0; id < device_count; ++id) {
         sycl::device device = dpct::dev_mgr::instance().get_device(id);
         sycl::backend backend = device.get_backend();
43 ggml.c
@@ -20959,7 +20959,7 @@ static void gguf_free_kv(struct gguf_kv * kv) {
 }
 
 struct gguf_context * gguf_init_empty(void) {
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
 
     memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
@@ -21004,7 +21004,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     bool ok = true;
 
-    struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
+    struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context));
 
     // read the header
     {
@@ -21041,9 +21041,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the kv pairs
     {
-        ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv));
+        const uint64_t n_kv = ctx->header.n_kv;
 
-        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
+        // header.n_kv will hold the actual value of pairs that were successfully read in the loop below
+        ctx->header.n_kv = 0;
+        ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv));
+
+        for (uint64_t i = 0; i < n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -21092,7 +21096,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                             return NULL;
                         }
 
-                        kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type));
+                        kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
 
                         ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
                     } break;
@@ -21106,7 +21110,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                             return NULL;
                         }
 
-                        kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str));
+                        kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str));
 
                         for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                             ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
@@ -21122,6 +21126,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             if (!ok) {
                 break;
             }
+
+            ctx->header.n_kv++;
         }
 
         if (!ok) {
@@ -21134,7 +21140,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // read the tensor infos
     {
-        ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
+        ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
 
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];
@@ -21155,8 +21161,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset);
             ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
 
+            // TODO: return an error instead of crashing with GGML_ASSERT
             gguf_tensor_info_sanitize(info);
 
+            // make sure there is no duplicated tensor names
+            for (uint64_t j = 0; j < i; ++j) {
+                if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
+                    fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
+                    ok = false;
+                }
+            }
+
             if (!ok) {
                 fprintf(stderr, "%s: failed to read tensor info\n", __func__);
                 fclose(file);
@@ -21325,7 +21340,7 @@ void gguf_free(struct gguf_context * ctx) {
         GGML_FREE(ctx->infos);
     }
 
-    GGML_ALIGNED_FREE(ctx);
+    GGML_FREE(ctx);
 }
 
 const char * gguf_type_name(enum gguf_type type) {
@@ -21636,7 +21651,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
     ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = type;
     ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type));
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
     memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
 }
 
@@ -21646,7 +21661,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
     ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
     ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str));
+    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
     for (int i = 0; i < n; i++) {
         struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
         str->n = strlen(data[i]);
@@ -21673,7 +21688,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
             case GGUF_TYPE_ARRAY:
                 {
                     if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *));
+                        const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
                         for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
                             data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                         }
@@ -21693,6 +21708,10 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
 void gguf_add_tensor(
         struct gguf_context * ctx,
         const struct ggml_tensor * tensor) {
+    if (gguf_find_tensor(ctx, tensor->name) != -1) {
+        GGML_ASSERT(false && "duplicated tensor name");
+    }
+
     const int idx = ctx->header.n_tensors;
     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
@@ -21761,7 +21780,7 @@ struct gguf_buf {
 
 static struct gguf_buf gguf_buf_init(size_t size) {
     struct gguf_buf buf = {
-        /*buf.data   =*/ size == 0 ? NULL : GGML_MALLOC(size),
+        /*buf.data   =*/ size == 0 ? NULL : GGML_CALLOC(1, size),
        /*buf.size   =*/ size,
        /*buf.offset =*/ 0,
     };
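A recurring change in the ggml.c hunks above is GGML_MALLOC to GGML_CALLOC. The point is the error path: `gguf_init_from_file` can fail partway through reading, after which `gguf_free` walks the kv and tensor-info arrays; with zero-initialized entries every unread pointer is NULL and safe to free, whereas malloc'd entries would hold garbage. A minimal sketch of the pattern with a hypothetical record type, not the real gguf structures:

```cpp
#include <cstdlib>
#include <cstring>

// hypothetical stand-in for a gguf_kv-like record
struct record {
    char * data; // NULL until successfully read
};

int main() {
    const size_t n = 16;

    // calloc: every rec[i].data starts as NULL, so cleanup is safe
    // even if we bail out after populating only a few entries
    record * rec = (record *) calloc(n, sizeof(record));
    if (!rec) return 1;

    for (size_t i = 0; i < n; ++i) {
        rec[i].data = (char *) malloc(8);
        if (!rec[i].data) break; // simulate a mid-loop failure
        memset(rec[i].data, 0, 8);
    }

    // cleanup path: free(NULL) is a no-op, so unread entries are harmless
    for (size_t i = 0; i < n; ++i) {
        free(rec[i].data);
    }
    free(rec);
    return 0;
}
```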
2 ggml.h
@@ -763,6 +763,8 @@ extern "C" {
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
+    GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes);
+
     // main
 
     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
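A sketch of how a caller might use the newly exported `ggml_validate_row_data` on a raw buffer; the F32 data and the injected NaN are made up for the example, and the program assumes ggml is linked:

```cpp
#include <cmath>
#include <cstdio>
#include "ggml.h"

int main() {
    float data[256];
    for (int i = 0; i < 256; ++i) {
        data[i] = 0.5f*i;
    }
    data[100] = NAN; // corrupt one element to trigger the check

    // validates nbytes/type_size elements; prints the offending block itself
    if (!ggml_validate_row_data(GGML_TYPE_F32, data, sizeof(data))) {
        fprintf(stderr, "buffer contains invalid values\n");
        return 1;
    }
    return 0;
}
```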
gguf-py/gguf/constants.py

@@ -72,6 +72,7 @@ class Keys:
 
     class Tokenizer:
         MODEL = "tokenizer.ggml.model"
+        PRE = "tokenizer.ggml.pre"
         LIST = "tokenizer.ggml.tokens"
         TOKEN_TYPE = "tokenizer.ggml.token_type"
         TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
@@ -940,6 +941,7 @@ KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
 
 # tokenization
 KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
+KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
 KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
 KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
 KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
gguf-py/gguf/gguf_reader.py

@@ -139,7 +139,12 @@ class GGUFReader:
 
     def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
         if field.name in self.fields:
-            raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
+            # TODO: add option to generate error on duplicate keys
+            # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
+
+            print(f'Warning: Duplicate key {field.name} at offset {field.offset}')
+            self.fields[field.name + '_{}'.format(field.offset)] = field
+        else:
             self.fields[field.name] = field
         return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts)
 
@@ -234,8 +239,14 @@ class GGUFReader:
 
     def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
         tensors = []
+        tensor_names = set() # keep track of name to prevent duplicated tensors
        for field in fields:
             _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts
+            # check if there's any tensor having same name already in the list
+            tensor_name = str(bytes(name_data), encoding = 'utf-8')
+            if tensor_name in tensor_names:
+                raise ValueError(f'Found duplicated tensor with name {tensor_name}')
+            tensor_names.add(tensor_name)
             ggml_type = GGMLQuantizationType(raw_dtype[0])
             n_elems = np.prod(dims)
             block_size, type_size = GGML_QUANT_SIZES[ggml_type]
@@ -267,7 +278,7 @@ class GGUFReader:
                 item_count = n_bytes
                 item_type = np.uint8
             tensors.append(ReaderTensor(
-                name = str(bytes(name_data), encoding = 'utf-8'),
+                name = tensor_name,
                 tensor_type = ggml_type,
                 shape = dims,
                 n_elements = n_elems,
gguf-py/gguf/gguf_writer.py

@@ -63,6 +63,7 @@ class GGUFWriter:
         self.kv_data_count = 0
         self.ti_data = bytearray()
         self.ti_data_count = 0
+        self.ti_names = set()
         self.use_temp_file = use_temp_file
         self.temp_file = None
         self.tensors = []
@@ -197,6 +198,10 @@ class GGUFWriter:
         if self.state is not WriterState.EMPTY:
             raise ValueError(f'Expected output file to be empty, got {self.state}')
 
+        if name in self.ti_names:
+            raise ValueError(f'Duplicated tensor name {name}')
+        self.ti_names.add(name)
+
         encoded_name = name.encode("utf8")
         self.ti_data += self._pack("Q", len(encoded_name))
         self.ti_data += encoded_name
@@ -422,6 +427,9 @@ class GGUFWriter:
     def add_tokenizer_model(self, model: str) -> None:
         self.add_string(Keys.Tokenizer.MODEL, model)
 
+    def add_tokenizer_pre(self, pre: str) -> None:
+        self.add_string(Keys.Tokenizer.PRE, pre)
+
     def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None:
         self.add_array(Keys.Tokenizer.LIST, tokens)
 
451 llama.cpp
@@ -75,6 +75,7 @@
 #include <forward_list>
 #include <fstream>
 #include <functional>
+#include <future>
 #include <initializer_list>
 #include <locale>
 #include <map>
@@ -315,6 +316,7 @@ enum llm_kv {
     LLM_KV_SSM_TIME_STEP_RANK,
 
     LLM_KV_TOKENIZER_MODEL,
+    LLM_KV_TOKENIZER_PRE,
     LLM_KV_TOKENIZER_LIST,
     LLM_KV_TOKENIZER_TOKEN_TYPE,
     LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT,
@@ -391,6 +393,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" },
 
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
+    { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" },
     { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" },
@@ -2115,6 +2118,7 @@ struct llama_vocab {
     };
 
     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data> id_to_token;
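The new `tokenizer.ggml.pre` metadata key can be inspected with the public gguf API. A sketch (the model path is a placeholder; `gguf_find_key` returns -1 for models written before this change, in which case the 'default' pre-tokenizer is assumed):

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file("model.gguf", params);
    if (!ctx) return 1;

    const int kid = gguf_find_key(ctx, "tokenizer.ggml.pre");
    if (kid == -1) {
        printf("no pre-tokenizer key: older model, 'default' is assumed\n");
    } else {
        printf("pre-tokenizer: %s\n", gguf_get_val_str(ctx, kid));
    }

    gguf_free(ctx);
    return 0;
}
```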
@@ -2890,6 +2894,7 @@ namespace GGUFMeta {
             case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
             case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
             case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
+            case LLAMA_KV_OVERRIDE_TYPE_STR: return "str";
         }
         return "unknown";
     }
@@ -2901,13 +2906,16 @@ namespace GGUFMeta {
                 __func__, override_type_to_str(ovrd->tag), ovrd->key);
             switch (ovrd->tag) {
                 case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
-                    LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
+                    LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false");
                 } break;
                 case LLAMA_KV_OVERRIDE_TYPE_INT: {
-                    LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
+                    LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64);
                 } break;
                 case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
-                    LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
+                    LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64);
+                } break;
+                case LLAMA_KV_OVERRIDE_TYPE_STR: {
+                    LLAMA_LOG_INFO("%s\n", ovrd->val_str);
                 } break;
                 default:
                     // Shouldn't be possible to end up here, but just in case...
@@ -2926,7 +2934,7 @@ namespace GGUFMeta {
         static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
         try_override(OT & target, const struct llama_model_kv_override * ovrd) {
             if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
-                target = ovrd->bool_value;
+                target = ovrd->val_bool;
                 return true;
             }
             return false;
@@ -2936,7 +2944,7 @@ namespace GGUFMeta {
         static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
         try_override(OT & target, const struct llama_model_kv_override * ovrd) {
             if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
-                target = ovrd->int_value;
+                target = ovrd->val_i64;
                 return true;
             }
             return false;
@@ -2946,7 +2954,7 @@ namespace GGUFMeta {
         static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
         try_override(T & target, const struct llama_model_kv_override * ovrd) {
             if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
-                target = ovrd->float_value;
+                target = ovrd->val_f64;
                 return true;
             }
             return false;
@@ -2955,12 +2963,11 @@ namespace GGUFMeta {
         template<typename OT>
         static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
         try_override(T & target, const struct llama_model_kv_override * ovrd) {
-            (void)target;
-            (void)ovrd;
-            if (!ovrd) { return false; }
-            // Currently, we should never end up here so it would be a bug if we do.
-            throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
-                ovrd ? ovrd->key : "NULL"));
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) {
+                target = ovrd->val_str;
+                return true;
+            }
+            return false;
         }
 
         static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
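With `LLAMA_KV_OVERRIDE_TYPE_STR` wired into `try_override`, string metadata can now be overridden through `llama_model_params::kv_overrides` like the numeric types. A hedged sketch of what a caller could look like; it assumes `val_str` is a fixed-size char array in `llama_model_kv_override`, that the override list is terminated by an entry with an empty key, and that the loader consults overrides for the key in question:

```cpp
#include <cstring>
#include "llama.h"

int main() {
    llama_model_kv_override overrides[2];
    memset(overrides, 0, sizeof(overrides));

    // force a specific pre-tokenizer by overriding the string kv
    std::strncpy(overrides[0].key, "tokenizer.ggml.pre", sizeof(overrides[0].key) - 1);
    overrides[0].tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    std::strncpy(overrides[0].val_str, "llama3", sizeof(overrides[0].val_str) - 1);

    overrides[1].key[0] = '\0'; // empty key terminates the list

    llama_model_params mparams = llama_model_default_params();
    mparams.kv_overrides = overrides;

    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path
    if (!model) return 1;
    llama_free_model(model);
    return 0;
}
```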
@@ -2993,6 +3000,7 @@ struct llama_model_loader {
     size_t n_bytes = 0;
 
     bool use_mmap = false;
+    bool check_tensors;
 
     llama_files files;
     llama_ftype ftype;
@@ -3026,7 +3034,7 @@ struct llama_model_loader {
     std::string arch_name;
     LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
 
-    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) {
+    llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
         int trace = 0;
         if (getenv("LLAMA_TRACE")) {
             trace = atoi(getenv("LLAMA_TRACE"));
@@ -3123,9 +3131,17 @@ struct llama_model_loader {
 
         fver = (enum llama_fver) gguf_get_version(meta);
 
+        std::set<std::string> tensor_names;
         for (auto & w : weights) {
             n_elements += ggml_nelements(w.tensor);
             n_bytes += ggml_nbytes(w.tensor);
+            // make sure there is no duplicated tensor names
+            const std::string name(w.tensor->name);
+            auto found = tensor_names.find(name);
+            if (found != tensor_names.end()) {
+                throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name));
+            }
+            tensor_names.insert(name);
         }
 
         LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
@@ -3231,6 +3247,7 @@ struct llama_model_loader {
         }
 
         this->use_mmap = use_mmap;
+        this->check_tensors = check_tensors;
     }
 
     ~llama_model_loader() {
@@ -3489,6 +3506,10 @@ struct llama_model_loader {
             file->seek(w.offs, SEEK_SET);
             file->read_raw(cur->data, ggml_nbytes(cur));
         }
+
+        if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) {
+            throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+        }
     }
 
     size_t size_done = 0;
@@ -3505,6 +3526,8 @@ struct llama_model_loader {
         GGML_ASSERT(size_data != 0 && "call init_mappings() first");
 
         std::vector<no_init<uint8_t>> read_buf;
+        std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
+
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3526,37 +3549,66 @@ struct llama_model_loader {
                 if (bufs_mmap.count(weight->idx)) {
                     buf_mmap = bufs_mmap.at(weight->idx);
                 }
+                uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+
+                if (check_tensors) {
+                    validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+                        return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+                    }));
+                }
+
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
-                    ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + weight->offs);
+                    ggml_backend_tensor_alloc(buf_mmap, cur, data);
                     if (lmlocks) {
                         const auto & lmlock = lmlocks->at(weight->idx);
-                        lmlock->grow_to(weight->offs + ggml_nbytes(cur));
+                        lmlock->grow_to(weight->offs + n_size);
                     }
 
                     auto & mmap_used = mmaps_used[weight->idx];
                     mmap_used.first = std::min(mmap_used.first, weight->offs);
                     mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
                 } else {
-                    ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + weight->offs, 0, n_size);
+                    ggml_backend_tensor_set(cur, data, 0, n_size);
                 }
             } else {
                 GGML_ASSERT(weight->idx < files.size());
                 const auto & file = files.at(weight->idx);
                 if (ggml_backend_buffer_is_host(cur->buffer)) {
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(cur->data, ggml_nbytes(cur));
+                    file->read_raw(cur->data, n_size);
+                    if (check_tensors) {
+                        validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] {
+                            return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size));
+                        }));
+                    }
                 } else {
-                    read_buf.resize(ggml_nbytes(cur));
+                    read_buf.resize(n_size);
                     file->seek(weight->offs, SEEK_SET);
-                    file->read_raw(read_buf.data(), ggml_nbytes(cur));
+                    file->read_raw(read_buf.data(), n_size);
                     ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size);
+                    if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) {
+                        throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur)));
+                    }
                 }
             }
 
             size_done += n_size;
         }
 
+        // check validation results
+        bool validation_failed = false;
+        for (auto & future : validation_result) {
+            auto result = future.get();
+            if (!result.second) {
+                LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first));
+                validation_failed = true;
+            }
+        }
+        if (validation_failed) {
+            throw std::runtime_error("found tensors with invalid data");
+        }
+
         // check if this is the last call and do final cleanup
         if (size_done >= size_data) {
             // unmap offloaded tensors and metadata
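The load_all_data hunk above hides the cost of tensor checking by pushing each tensor's validation onto a `std::async` task and only harvesting the futures after all reads have been issued. A self-contained sketch of that overlap pattern; the `validate` function here is a trivial NaN scan, not ggml's:

```cpp
#include <cstddef>
#include <cstdio>
#include <future>
#include <utility>
#include <vector>

static bool validate(const float * data, size_t n) {
    for (size_t i = 0; i < n; ++i) {
        if (data[i] != data[i]) return false; // NaN compares unequal to itself
    }
    return true;
}

int main() {
    std::vector<std::vector<float>> tensors(8, std::vector<float>(1 << 16, 1.0f));
    std::vector<std::future<std::pair<size_t, bool>>> results;

    for (size_t t = 0; t < tensors.size(); ++t) {
        const float * data = tensors[t].data();
        const size_t  n    = tensors[t].size();
        // kick off validation in the background ...
        results.emplace_back(std::async(std::launch::async, [t, data, n] {
            return std::make_pair(t, validate(data, n));
        }));
        // ... the loader would continue reading the next tensor here
    }

    // harvest all results at the end, like the patch does
    bool failed = false;
    for (auto & f : results) {
        auto r = f.get();
        if (!r.second) {
            fprintf(stderr, "tensor %zu has invalid data\n", r.first);
            failed = true;
        }
    }
    return failed ? 1 : 0;
}
```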
@@ -4173,11 +4225,13 @@ static void llm_load_vocab(
 
     // determine vocab type
     {
-        std::string tokenizer_name;
+        std::string tokenizer_model;
+        std::string tokenizer_pre;
 
-        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
+        ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
-        if (tokenizer_name == "no_vocab") {
+        if (tokenizer_model == "no_vocab") {
             vocab.type = LLAMA_VOCAB_TYPE_NONE;
 
             // default special tokens
@@ -4191,7 +4245,7 @@ static void llm_load_vocab(
             vocab.linefeed_id = -1;
 
             return;
-        } else if (tokenizer_name == "llama") {
+        } else if (tokenizer_model == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
 
             // default special tokens
@@ -4236,9 +4290,27 @@ static void llm_load_vocab(
             if (add_space_prefix_keyidx != -1) {
                 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
             } // The default value of add_space_prefix is true.
-        } else if (tokenizer_name == "gpt2") {
-            vocab.type = LLAMA_VOCAB_TYPE_BPE;
+        } else if (tokenizer_model == "bert") {
+            vocab.type = LLAMA_VOCAB_TYPE_WPM;
+
+            // default special tokens
+            vocab.special_bos_id  = -1;
+            vocab.special_eos_id  = -1;
+            vocab.special_unk_id  = 100;
+            vocab.special_sep_id  = 102;
+            vocab.special_pad_id  = 0;
+            vocab.special_cls_id  = 101;
+            vocab.special_mask_id = 103;
+            vocab.add_space_prefix = false;
+        } else {
+            if (tokenizer_model == "gpt2") {
+                vocab.type = LLAMA_VOCAB_TYPE_BPE;
+            } else {
+                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
+                LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
+                vocab.type = LLAMA_VOCAB_TYPE_SPM;
+                return;
+            }
             // read bpe merges and populate bpe ranks
             const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
             if (merges_keyidx == -1) {
@@ -4272,23 +4344,50 @@ static void llm_load_vocab(
             vocab.special_pad_id  = -1;
             vocab.special_cls_id  = -1;
             vocab.special_mask_id = -1;
-        } else if (tokenizer_name == "bert") {
-            vocab.type = LLAMA_VOCAB_TYPE_WPM;
-
-            // default special tokens
-            vocab.special_bos_id  = -1;
-            vocab.special_eos_id  = -1;
-            vocab.special_unk_id  = 100;
-            vocab.special_sep_id  = 102;
-            vocab.special_pad_id  = 0;
-            vocab.special_cls_id  = 101;
-            vocab.special_mask_id = 103;
-            vocab.add_space_prefix = false;
-        } else {
-            LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str());
-            LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-            vocab.type = LLAMA_VOCAB_TYPE_SPM;
+        }
+
+        // for now, only BPE models have pre-tokenizers
+        if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+            if (tokenizer_pre.empty()) {
+                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__);
+                LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__);
+                LLAMA_LOG_WARN("%s: ************************************ \n", __func__);
+                LLAMA_LOG_WARN("%s: \n", __func__);
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "default") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            } else if (
+                    tokenizer_pre == "llama3"   ||
+                    tokenizer_pre == "llama-v3" ||
+                    tokenizer_pre == "llama-bpe") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
+            } else if (
+                    tokenizer_pre == "deepseek-llm") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
+            } else if (
+                    tokenizer_pre == "deepseek-coder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
+            } else if (
+                    tokenizer_pre == "falcon") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
+            } else if (
+                    tokenizer_pre == "mpt") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT;
+            } else if (
+                    tokenizer_pre == "starcoder") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER;
+            } else if (
+                    tokenizer_pre == "gpt-2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else {
+                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
+            }
+        } else {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
         }
     }
 
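The pre-tokenizer selection above is a chain of string comparisons on the value of `tokenizer.ggml.pre`. For illustration only, the same name-to-enum mapping can be expressed as a table lookup; the enum names below are stand-ins, not llama.cpp's:

```cpp
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

enum pre_type { PRE_DEFAULT, PRE_LLAMA3, PRE_DEEPSEEK_LLM, PRE_DEEPSEEK_CODER,
                PRE_FALCON, PRE_MPT, PRE_STARCODER, PRE_GPT2 };

static pre_type pre_type_from_name(const std::string & name) {
    // same names as in the hunk above; aliases map to one enum value
    static const std::map<std::string, pre_type> table = {
        { "default",        PRE_DEFAULT        },
        { "llama3",         PRE_LLAMA3         },
        { "llama-v3",       PRE_LLAMA3         },
        { "llama-bpe",      PRE_LLAMA3         },
        { "deepseek-llm",   PRE_DEEPSEEK_LLM   },
        { "deepseek-coder", PRE_DEEPSEEK_CODER },
        { "falcon",         PRE_FALCON         },
        { "mpt",            PRE_MPT            },
        { "starcoder",      PRE_STARCODER      },
        { "gpt-2",          PRE_GPT2           },
    };
    auto it = table.find(name);
    if (it == table.end()) {
        throw std::runtime_error("unknown pre-tokenizer type: " + name);
    }
    return it->second;
}

int main() {
    printf("%d\n", pre_type_from_name("llama-bpe")); // prints the PRE_LLAMA3 value
    return 0;
}
```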
@@ -5983,7 +6082,7 @@ static bool llm_load_tensors(
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
+        llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
         model.hparams.vocab_only = params.vocab_only;
 
@@ -11506,6 +11605,10 @@ static int llama_decode_internal(
         }
     }
 
+    // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+    // overlap with device computation.
+    ggml_backend_sched_reset(lctx.sched);
+
     return 0;
 }
 
@@ -11857,7 +11960,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
         }
         case LLAMA_VOCAB_TYPE_BPE: {
             GGML_ASSERT(false);
-            return unicode_utf8_to_byte(token_data.text);
+            return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
         }
         case LLAMA_VOCAB_TYPE_WPM: {
             GGML_ASSERT(false);
@@ -12079,7 +12182,79 @@ struct llm_tokenizer_bpe {
 
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
-        auto word_collection = bpe_gpt2_preprocess(text);
+
+        std::vector<std::string> word_collection;
+        switch (vocab.type) {
+            case LLAMA_VOCAB_TYPE_BPE:
+                switch (vocab.type_pre) {
+                    case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
+                        word_collection = unicode_regex_split(text, {
+                            // original regex from tokenizer.json
+                            //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+
+                            // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+",
+                            "\\s?[！-／：-～‘-‟　-。]+",
+                            "\\s+$",
+                            "[一-龥ࠀ-一가-퟿]+",
+                            "\\p{N}+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER:
+                        word_collection = unicode_regex_split(text, {
+                            "[\r\n]",
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "[一-龥ࠀ-一가-퟿]+",
+                            "\\p{N}+",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_FALCON:
+                        word_collection = unicode_regex_split(text, {
+                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_MPT:
+                        // TODO: MPT pre-tokenization regexes are unknown
+                        //       the following are close, but not exact. run the following:
+                        //       ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf
+                        GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed");
+                        word_collection = unicode_regex_split(text, {
+                            "\\s?\\p{L}+",
+                            "\\s?\\p{P}+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
+                    case LLAMA_VOCAB_PRE_TYPE_STARCODER:
+                    case LLAMA_VOCAB_PRE_TYPE_GPT2:
+                        word_collection = unicode_regex_split(text, {
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                        });
+                        break;
+                    default:
+                        // default regex for BPE tokenization pre-processing
+                        word_collection = unicode_regex_split(text, {
+                            "[\\p{P}\\$\\+<=>\\^~\\|]+",
+                            "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
+                            "\\p{N}+",
+                            "[0-9][0-9][0-9]",
+                        });
+                        break;
+                }
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
 
         symbols_final.clear();
 
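The tokenize hunk above replaces the hand-written splitter with per-model regex lists fed to `unicode_regex_split`, which applies each pattern in turn and implements the `\p{L}`/`\p{N}` categories itself. A rough sketch of the underlying idea using `std::regex` on ASCII only; `std::regex` cannot handle Unicode property classes, so this is an illustration, not a substitute for the real function, and it applies a single pattern rather than a list:

```cpp
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// collect the non-overlapping matches of one pattern, left to right
static std::vector<std::string> regex_split(const std::string & text, const std::string & pattern) {
    std::vector<std::string> out;
    const std::regex re(pattern);
    for (auto it = std::sregex_iterator(text.begin(), text.end(), re);
         it != std::sregex_iterator(); ++it) {
        out.push_back(it->str());
    }
    return out;
}

int main() {
    // GPT-2-style pieces: contractions, words, numbers, other, whitespace
    const std::string pattern =
        "'s|'t|'re|'ve|'m|'ll|'d| ?[A-Za-z]+| ?[0-9]+| ?[^A-Za-z0-9 ]+| +";
    for (const auto & w : regex_split("I'll buy 3 apples, won't I?", pattern)) {
        printf("[%s]\n", w.c_str()); // [I] ['ll] [ buy] [ 3] [ apples] [,] ...
    }
    return 0;
}
```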
@ -12206,145 +12381,6 @@ private:
|
||||||
work_queue.push(bigram);
|
work_queue.push(bigram);
|
||||||
}
|
}
|
||||||
|
|
||||||
    std::vector<std::string> bpe_gpt2_preprocess(const std::string & text) {
        std::vector<std::string> bpe_words;
        std::vector<std::string> bpe_encoded_words;

        std::string token = "";
        // GPT2 system regex:  's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
        bool collecting_numeric = false;
        bool collecting_letter = false;
        bool collecting_special = false;
        bool collecting_whitespace_lookahead = false;
        bool collecting = false;

        std::vector<std::string> text_utf;
        text_utf.reserve(text.size());
        bpe_words.reserve(text.size());
        bpe_encoded_words.reserve(text.size());

        const auto cpts = unicode_cpts_from_utf8(text);
        for (size_t i = 0; i < cpts.size(); ++i)
            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));

        for (int i = 0; i < (int)text_utf.size(); i++) {
            const std::string & utf_char = text_utf[i];
            bool split_condition = false;
            int bytes_remain = text_utf.size() - i;
            // forward backward lookups
            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";

            // handling contractions
            if (!split_condition && bytes_remain >= 2) {
                // 's|'t|'m|'d
                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
                    split_condition = true;
                }
                if (split_condition) {
                    if (token.size()) {
                        bpe_words.emplace_back(token); // push previous content as token
                    }
                    token = utf_char + utf_char_next;
                    bpe_words.emplace_back(token);
                    token = "";
                    i++;
                    continue;
                }
            }
            if (!split_condition && bytes_remain >= 3) {
                // 're|'ve|'ll
                if (utf_char == "\'" && (
                    (utf_char_next == "r" && utf_char_next_next == "e") ||
                    (utf_char_next == "v" && utf_char_next_next == "e") ||
                    (utf_char_next == "l" && utf_char_next_next == "l"))
                    ) {
                    split_condition = true;
                }
                if (split_condition) {
                    // current token + next token can be defined
                    if (token.size()) {
                        bpe_words.emplace_back(token); // push previous content as token
                    }
                    token = utf_char + utf_char_next + utf_char_next_next;
                    bpe_words.emplace_back(token); // the contraction
                    token = "";
                    i += 2;
                    continue;
                }
            }

            if (!split_condition && !collecting) {
                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
                    collecting_letter = true;
                    collecting = true;
                }
                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                    collecting_numeric = true;
                    collecting = true;
                }
                else if (
                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
                    (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
                    ) {
                    collecting_special = true;
                    collecting = true;
                }
                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
                    collecting_whitespace_lookahead = true;
                    collecting = true;
                }
                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
                    split_condition = true;
                }
            }
            else if (!split_condition && collecting) {
                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
                    split_condition = true;
                }
                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
                    split_condition = true;
                }
                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
                    split_condition = true;
                }
                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
                    split_condition = true;
                }
            }

            if (utf_char_next == "") {
                split_condition = true; // final
                token += utf_char;
            }

            if (split_condition) {
                if (token.size()) {
                    bpe_words.emplace_back(token);
                }
                token = utf_char;
                collecting = false;
                collecting_letter = false;
                collecting_numeric = false;
                collecting_special = false;
                collecting_whitespace_lookahead = false;
            }
            else {
                token += utf_char;
            }
        }

        for (std::string & word : bpe_words) {
            std::string encoded_token = "";
            for (char & c : word) {
                encoded_token += unicode_byte_to_utf8(c);
            }
            bpe_encoded_words.emplace_back(encoded_token);
        }

        return bpe_encoded_words;
    }
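For reference, a minimal, self-contained sketch of the splitting rules the function above hand-rolls — ASCII-only, with a hypothetical helper name, and simplified in that it does not fold a leading space into the following word the way the real ` ?\p{L}+` pattern does:

// split_ascii_gpt2 is a made-up illustration, not llama.cpp code:
// contractions, letter runs, and digit runs become separate pieces;
// every other character is a piece of its own.
#include <cctype>
#include <iostream>
#include <string>
#include <vector>

static std::vector<std::string> split_ascii_gpt2(const std::string & text) {
    static const std::vector<std::string> contractions = {"'s", "'t", "'re", "'ve", "'m", "'ll", "'d"};
    std::vector<std::string> pieces;
    size_t i = 0;
    while (i < text.size()) {
        bool matched = false;
        for (const auto & c : contractions) {
            if (text.compare(i, c.size(), c) == 0) {
                pieces.push_back(c); // contraction becomes its own piece
                i += c.size();
                matched = true;
                break;
            }
        }
        if (matched) continue;
        const size_t start = i;
        if (std::isalpha((unsigned char) text[i])) {
            while (i < text.size() && std::isalpha((unsigned char) text[i])) i++; // letter run
        } else if (std::isdigit((unsigned char) text[i])) {
            while (i < text.size() && std::isdigit((unsigned char) text[i])) i++; // digit run
        } else {
            i++; // whitespace/punctuation: one piece per character in this sketch
        }
        pieces.push_back(text.substr(start, i - start));
    }
    return pieces;
}

int main() {
    for (const auto & p : split_ascii_gpt2("I've 42 cats, really!")) {
        std::cout << "[" << p << "]";
    }
    std::cout << "\n"; // [I]['ve][ ][42][ ][cats][,][ ][really][!]
}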
    const llama_vocab & vocab;

    std::vector<llm_symbol> symbols;
@@ -12664,7 +12700,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
             } break;
         case LLAMA_VOCAB_TYPE_BPE:
             {
-                if (add_special && vocab.special_add_bos == 1) {
+                if (add_special && vocab.special_add_bos != 0) {
                     GGML_ASSERT(vocab.special_bos_id != -1);
                     output.push_back(vocab.special_bos_id);
                 }
@@ -14436,14 +14472,20 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
 }

 static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+    if (nthread < 2) {
+        // single-thread
+        size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
+        if (!ggml_validate_row_data(new_type, new_data, new_size)) {
+            throw std::runtime_error("quantized data validation failed");
+        }
+        return new_size;
+    }
+
     std::mutex mutex;
     int64_t counter = 0;
     size_t new_size = 0;
-    if (nthread < 2) {
-        // single-thread
-        return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
-    }
-    auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size,
+    bool valid = true;
+    auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size,
             nrows, n_per_row, imatrix]() {
         const int64_t nrows_per_chunk = chunk_size / n_per_row;
         size_t local_size = 0;
@@ -14458,7 +14500,17 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
             }
             lock.unlock();
             const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk);
-            local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+            size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix);
+            local_size += this_size;
+
+            // validate the quantized data
+            const size_t row_size = ggml_row_size(new_type, n_per_row);
+            void * this_data = (char *) new_data + first_row * row_size;
+            if (!ggml_validate_row_data(new_type, this_data, this_size)) {
+                std::unique_lock<std::mutex> lock(mutex);
+                valid = false;
+                break;
+            }
         }
     };
     for (int it = 0; it < nthread - 1; ++it) {
@@ -14467,6 +14519,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     compute();
     for (auto & w : workers) { w.join(); }
     workers.clear();
+    if (!valid) {
+        throw std::runtime_error("quantized data validation failed");
+    }
     return new_size;
 }
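The three hunks above share one pattern: quantize a chunk, validate the freshly written rows, and report failure to the caller as an exception only after every worker has joined. A self-contained sketch of that pattern (stand-in names, atomics instead of the mutex used above — not the llama.cpp code):

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <stdexcept>
#include <thread>
#include <vector>

// stand-in for ggml_quantize_chunk + ggml_validate_row_data
static bool process_chunk(int64_t first_row, int64_t n_rows) {
    return first_row >= 0 && n_rows > 0;
}

static void process_all(int64_t nrows, int64_t rows_per_chunk, int nthread) {
    std::atomic<int64_t> counter{0};
    std::atomic<bool>    valid{true};
    auto compute = [&]() {
        while (valid.load()) {
            const int64_t first_row = counter.fetch_add(rows_per_chunk); // grab the next chunk
            if (first_row >= nrows) break;
            const int64_t this_nrow = std::min(nrows - first_row, rows_per_chunk);
            if (!process_chunk(first_row, this_nrow)) {
                valid.store(false); // remember the failure, keep the thread joinable
                break;
            }
        }
    };
    std::vector<std::thread> workers;
    for (int it = 0; it < nthread - 1; ++it) workers.emplace_back(compute);
    compute(); // the calling thread works too, as in the diff above
    for (auto & w : workers) w.join();
    if (!valid.load()) {
        throw std::runtime_error("quantized data validation failed");
    }
}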
@@ -14529,7 +14584,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
-    llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
+    llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides);
     ml.init_mappings(false); // no prefetching

     llama_model model;
@@ -14567,11 +14622,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         for (auto & o : overrides) {
             if (o.key[0] == 0) break;
             if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
-                gguf_set_val_f32(ctx_out, o.key, o.float_value);
+                gguf_set_val_f32(ctx_out, o.key, o.val_f64);
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
-                gguf_set_val_i32(ctx_out, o.key, o.int_value);
+                gguf_set_val_i32(ctx_out, o.key, o.val_i64);
             } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
-                gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+                gguf_set_val_bool(ctx_out, o.key, o.val_bool);
+            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
+                gguf_set_val_str(ctx_out, o.key, o.val_str);
             } else {
                 LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
             }
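A hedged sketch of the caller side of this loop — field and enum names as introduced in the llama.h hunks further below, values invented for illustration; the zero-length key on the last entry is the terminator that `if (o.key[0] == 0) break;` relies on:

#include <cstdio>
#include <vector>
#include "llama.h" // assumed to be the llama.h from this diff

static std::vector<llama_model_kv_override> make_overrides() {
    std::vector<llama_model_kv_override> kv(3);

    std::snprintf(kv[0].key, sizeof(kv[0].key), "%s", "general.name");
    kv[0].tag = LLAMA_KV_OVERRIDE_TYPE_STR; // new in this change
    std::snprintf(kv[0].val_str, sizeof(kv[0].val_str), "%s", "my-model");

    std::snprintf(kv[1].key, sizeof(kv[1].key), "%s", "general.quantization_version");
    kv[1].tag     = LLAMA_KV_OVERRIDE_TYPE_INT;
    kv[1].val_i64 = 2;

    kv[2].key[0] = 0; // terminator entry
    return kv;
}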
@@ -14890,7 +14947,7 @@ static int llama_apply_lora_from_file_internal(
     std::unique_ptr<llama_model_loader> ml;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr));
         ml->init_mappings(/*prefetch*/ false); // no prefetching
     }

@@ -15149,6 +15206,7 @@ struct llama_model_params llama_model_default_params() {
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
+        /*.check_tensors =*/ false,
     };

 #ifdef GGML_USE_METAL
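A usage sketch for the new default: llama_load_model_from_file is the existing loader entry point, and the wrapper name here is made up.

#include "llama.h"

static llama_model * load_checked(const char * path) {
    llama_model_params mparams = llama_model_default_params();
    mparams.check_tensors = true; // opt in to tensor data validation (defaults to false above)
    return llama_load_model_from_file(path, mparams);
}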
@@ -17823,6 +17881,11 @@ const char * llama_print_system_info(void) {
     s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
     s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
     s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | ";
+#ifdef GGML_USE_LLAMAFILE
+    s += "LLAMAFILE = 1 | ";
+#else
+    s += "LLAMAFILE = 0 | ";
+#endif

     return s.c_str();
 }
25 llama.h
@@ -69,6 +69,18 @@ extern "C" {
         LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
     };

+    // pre-tokenization types
+    enum llama_vocab_pre_type {
+        LLAMA_VOCAB_PRE_TYPE_DEFAULT        = 0,
+        LLAMA_VOCAB_PRE_TYPE_LLAMA3         = 1,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM   = 2,
+        LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+        LLAMA_VOCAB_PRE_TYPE_FALCON         = 4,
+        LLAMA_VOCAB_PRE_TYPE_MPT            = 5,
+        LLAMA_VOCAB_PRE_TYPE_STARCODER      = 6,
+        LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
+    };
+
     // note: these values should be synchronized with ggml_rope
     // TODO: maybe move this enum to ggml.h (ggml_rope_type)
     enum llama_rope_type {
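A sketch of how a loader might pick one of these values from a pre-tokenizer name stored in the model's GGUF metadata — the string spellings here are assumptions for illustration, not part of this diff:

#include <string>
#include "llama.h"

static llama_vocab_pre_type pre_type_from_name(const std::string & name) {
    if (name == "llama3")         return LLAMA_VOCAB_PRE_TYPE_LLAMA3;
    if (name == "deepseek-llm")   return LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM;
    if (name == "deepseek-coder") return LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
    if (name == "falcon")         return LLAMA_VOCAB_PRE_TYPE_FALCON;
    if (name == "mpt")            return LLAMA_VOCAB_PRE_TYPE_MPT;
    if (name == "starcoder")      return LLAMA_VOCAB_PRE_TYPE_STARCODER;
    if (name == "gpt-2")          return LLAMA_VOCAB_PRE_TYPE_GPT2;
    return LLAMA_VOCAB_PRE_TYPE_DEFAULT; // unknown names fall back to the default rules
}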
@@ -195,15 +207,19 @@ extern "C" {
         LLAMA_KV_OVERRIDE_TYPE_INT,
         LLAMA_KV_OVERRIDE_TYPE_FLOAT,
         LLAMA_KV_OVERRIDE_TYPE_BOOL,
+        LLAMA_KV_OVERRIDE_TYPE_STR,
     };

     struct llama_model_kv_override {
-        char key[128];
         enum llama_model_kv_override_type tag;

+        char key[128];
+
         union {
-            int64_t int_value;
-            double float_value;
-            bool bool_value;
+            int64_t val_i64;
+            double  val_f64;
+            bool    val_bool;
+            char    val_str[128];
         };
     };

@@ -235,6 +251,7 @@ extern "C" {
         bool vocab_only;    // only load the vocabulary, no weights
         bool use_mmap;      // use mmap if possible
         bool use_mlock;     // force system to keep model in RAM
+        bool check_tensors; // validate model tensor data
     };

     struct llama_context_params {
BIN models/ggml-vocab-bert-bge.gguf (new file)
Binary file not shown.

102 models/ggml-vocab-bert-bge.gguf.inp (new file)
@@ -0,0 +1,102 @@

__ggml_vocab_test__
 
__ggml_vocab_test__
  
__ggml_vocab_test__
   
__ggml_vocab_test__
	
__ggml_vocab_test__


__ggml_vocab_test__



__ggml_vocab_test__




__ggml_vocab_test__
	

__ggml_vocab_test__
Hello world
__ggml_vocab_test__
 Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
 Hello World
__ggml_vocab_test__
 Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
 Hello, world!
__ggml_vocab_test__
 this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
 Hello
__ggml_vocab_test__
  Hello
__ggml_vocab_test__
   Hello
__ggml_vocab_test__
    Hello
__ggml_vocab_test__
    Hello
    Hello
__ggml_vocab_test__
 (
__ggml_vocab_test__

 =
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__











🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__
41 models/ggml-vocab-bert-bge.gguf.out (new file)
@@ -0,0 +1,41 @@









7592 2088
7592 2088
7592 2088
7592 2088
7592 2088 999
7592 1010 2088 999
7592 1010 2088 999
2023 2003 100 1012 18133 2361
1059 2692 18139 1021 8525 28418 2243 16233 20952 6979
1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325
100
100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 1006 2069 7861 29147 2072 2008 2038 2049 2219 19204 1007
7592
7592
7592
7592
7592
7592 7592
1006
1027
1005 3690
7592 1010 1061 1005 2035 999 2129 2024 2017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995
1017
3943
21211
21211 2509
21211 22394
21211 22394 2509
21211 22394 22394
21211 22394 22394 2509
21211 22394 22394 22394
100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 100 1017 3943 21211 21211 2509 21211 22394 21211 22394 2509 21211 22394 22394 21211 22394 22394 2509 1017 1012 1017 1017 1012 1012 1017 1017 1012 1012 1012 1017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995 1011 1011 1011 1011 1011 1011 1027 1027 1027 1027 1027 1027 1027 1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325 1005 1005 1005 1005 1005 1005 1036 1036 1036 1036 1036 1036 1036 1000 1000 1000 1000 1012 1012 1012 1012 1012 1012 999 999 999 999 999 999 1029 1029 1029 1029 1029 1029 1045 1005 2310 2042 1005 2409 2002 1005 1055 2045 1010 1005 2128 2017 2469 1029 1005 1049 2025 2469 1045 1005 2222 2191 2009 1010 1005 1040 2017 2066 2070 5572 1029 2057 1005 2310 1037 1005 2222
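The .inp/.out pairs added in this commit share one layout: test inputs separated by a `__ggml_vocab_test__` marker line, and the N-th line of the matching .out file holding the space-separated token ids expected for the N-th input. A hedged sketch of a reader for that layout (not the actual test harness):

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> read_inputs(const std::string & path) {
    std::ifstream f(path);
    std::stringstream ss;
    ss << f.rdbuf(); // slurp the whole file; inputs may contain raw newlines
    const std::string all   = ss.str();
    const std::string delim = "__ggml_vocab_test__";
    std::vector<std::string> inputs;
    size_t pos = 0;
    for (size_t found; (found = all.find(delim, pos)) != std::string::npos; ) {
        std::string s = all.substr(pos, found - pos);
        if (!s.empty() && s.back() == '\n') s.pop_back(); // newline before the marker
        inputs.push_back(s);
        pos = found + delim.size();
        if (pos < all.size() && all[pos] == '\n') pos++;  // newline after the marker
    }
    return inputs;
}

int main() {
    const auto inputs = read_inputs("models/ggml-vocab-bert-bge.gguf.inp");
    std::cout << "loaded " << inputs.size() << " test inputs\n"; // expected: 41
}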
BIN models/ggml-vocab-deepseek-coder.gguf (new file)
Binary file not shown.

102 models/ggml-vocab-deepseek-coder.gguf.inp (new file)
@@ -0,0 +1,102 @@
(102 added lines: test inputs identical to models/ggml-vocab-bert-bge.gguf.inp above)
41 models/ggml-vocab-deepseek-coder.gguf.out (new file)
@@ -0,0 +1,41 @@

207
243
315
184
185
185 185
185 185 185
184 185
17535 1835
414 9489 1835
17535 5414
414 9489 5414
414 9489 5414 0
17535 11 1835 0
414 9489 11 1835 0
437 317 12394 99 234 13 14789
86 15 19 23 207 22 83 3963 27659 26078 3934 14072
1593 6478 616 2251 14994
155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 155 239 210 155 239 236 155 239 214 155 240 210 155 239 218
10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 334 5950 992 78 12896 344 638 891 1372 10736 8
17535
414 9489
207 414 9489
243 414 9489
315 414 9489
315 414 9489 185 315 414 9489
334
185 405
6 2895
17535 11 320 6 435 0 1717 417 340 12394 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239
18
18 18
18 18 18
18 18 18 18
18 18 18 18 18
18 18 18 18 18 18
18 18 18 18 18 18 18
18 18 18 18 18 18 18 18
18 18 18 18 18 18 18 18 18
185 207 185 185 207 185 185 185 207 12405 459 22758 185 243 185 315 185 251 185 730 185 10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 12394 99 234 10047 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 524 18 207 18 1202 18 207 155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 10047 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239 18155 374 17194 28 2861 6478 616 2251 14994 31269 4191 6 4686 4686 10252 3358 3358 3409 524 15330 3023 15031 5668 303 6 312 798 651 83 839 362 6 82 741 11 651 1369 340 2037 30 651 44 441 2037 303 6 642 1098 359 11 651 35 340 833 738 10860 30 998 6 10709 245 6 75 43
BIN models/ggml-vocab-deepseek-llm.gguf (new file)
Binary file not shown.

102 models/ggml-vocab-deepseek-llm.gguf.inp (new file)
@@ -0,0 +1,102 @@
(102 added lines: test inputs identical to models/ggml-vocab-bert-bge.gguf.inp above)
41 models/ggml-vocab-deepseek-llm.gguf.out (new file)
@@ -0,0 +1,41 @@

207
243
300
184
185
185 185
185 185 185
184 185
17464 1843
37727 1843
17464 5427
37727 5427
37727 5427 0
17464 11 1843 0
37727 11 1843 0
437 317 12356 99 234 13 14743
86 15 19 23 207 22 83 3970 27519 26016 3944 14025
1603 6476 620 91754
71374 209 71374 114 71374 228 155 240 220 71374 224 155 240 211 71374 231 71374 115 71374 240 155 240 210 71374 240 71374 95 71374 114 71374 214 71374 210 71374 236 71374 214 155 240 210 71374 218
10044 95300 334 8754 8 33701 114 350 222 10044 221 104 46713 334 34732 996 24250 262 80923 8 207 37103 214 334 5956 89213 344 643 895 1377 10728 8
17464
37727
207 37727
243 37727
300 37727
300 37727 185 300 37727
334
185 403
6 2906
17464 11 320 6 436 0 1724 418 340 33701 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239
18
18 18
18 18 18
18 18 18 18
18 18 18 18 18
18 18 18 18 18 18
18 18 18 18 18 18 18
18 18 18 18 18 18 18 18
18 18 18 18 18 18 18 18 18
185 207 185 185 207 185 185 185 207 11969 486 22504 185 243 185 300 185 251 185 663 185 10044 95300 334 8754 8 33701 114 350 222 10044 221 104 46713 334 34732 996 24250 262 80923 8 207 37103 214 12356 99 234 10044 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 526 18 207 18 1204 18 207 71374 209 71374 114 71374 228 155 240 220 71374 224 155 240 211 71374 231 71374 115 71374 240 155 240 210 71374 240 71374 95 71374 114 71374 214 71899 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239 78827 55170 76659 620 91754 31116 36804 4885 4885 10897 4390 4390 41047 15278 3033 14986 5675 304 6 313 803 655 33326 362 6 82 745 11 655 1374 340 2049 30 655 44 441 2049 304 6 647 1099 359 11 655 35 340 837 742 10842 30 1003 6 10709 245 6 75 43
Binary file not shown. (models/ggml-vocab-falcon.gguf, new file)

102 models/ggml-vocab-falcon.gguf.inp (new file)
@@ -0,0 +1,102 @@
(102 added lines: test inputs identical to models/ggml-vocab-bert-bge.gguf.inp above)
41 models/ggml-vocab-falcon.gguf.out (new file)
@@ -0,0 +1,41 @@

204
258
466
192
193
1001
11331
19125
9856 1079
23090 1079
9856 2889
23090 2889
23090 2889 12
9856 23 1079 12
23090 23 1079 12
414 304 3346 111 231 25 29247
98 55866 204 34 16682 7149 36190 6869 11481
150 133 6207 151 215 150 134 5052 133 6279 5052 223 151 216 49679 123 53110 47043 7795
38154 206 38154 126 38154 225 167 237 217 38154 221 167 237 208 38154 228 38154 127 38154 237 167 237 207 38154 237 38154 107 38154 126 38154 211 38154 207 38154 233 38154 211 167 237 207 38154 215
2571 232 206 204 19 11003 20 8196 126 283 219 48778 116 13392 204 19 51831 732 63209 1741 7955 522 20 22438 211 204 19 7927 53360 325 504 701 946 10930 20
9856
23090
204 23090
258 23090
466 23090
466 23090 742 23090
204 19
1212 40
18 4932
9856 23 291 18 436 12 1265 362 299 8196 207 204 42 50087 123 2727 20300 32022 133 234 17419 30137 28 7858 181 133 236
30
3138
22287
22287 30
22287 3138
22287 22287
22287 22287 30
22287 22287 3138
22287 22287 22287
1212 4824 1001 1212 192 204 663 49453 2069 742 561 1501 193 2571 232 206 204 19 11003 20 8196 126 283 219 48778 116 13392 204 19 51831 732 63209 1741 7955 522 20 22438 211 3346 111 231 2571 111 231 204 30 204 3138 204 22287 204 22287 30 204 22287 3138 204 22287 22287 204 22287 22287 30 204 22287 22287 3138 204 30 25 30 204 30 513 30 204 30 951 30 27171 236 206 38154 126 38154 225 167 237 217 38154 221 167 237 208 38154 228 38154 127 38154 237 167 237 207 38154 237 38154 107 38154 126 38154 211 20589 207 204 42 50087 123 2727 20300 32022 133 234 17419 30137 28 7858 181 133 236 204 37057 2228 10666 5052 133 6207 151 215 150 134 5052 133 6279 5052 223 151 216 49679 123 53110 47043 7795 204 7544 7544 7544 8543 8543 17593 3513 3513 12844 51520 17664 4247 295 18 298 650 204 18 95 693 332 18 94 629 23 204 18 1553 299 1310 42 204 18 56 416 1310 295 18 567 717 334 23 204 18 47 299 606 596 6696 42 703 18 16139 241 18 87 55
BIN models/ggml-vocab-gpt-2.gguf (new file)
Binary file not shown.

102 models/ggml-vocab-gpt-2.gguf.inp (new file)
@@ -0,0 +1,102 @@
(102 added lines: test inputs identical to models/ggml-vocab-bert-bge.gguf.inp above)
41 models/ggml-vocab-gpt-2.gguf.out (new file)
@@ -0,0 +1,41 @@

220
220 220
220 220 220
197
198
628
628 198
197 198
15496 995
18435 995
15496 2159
18435 2159
18435 2159 0
15496 11 995 0
18435 11 995 0
428 318 12520 99 247 13 20322
86 47202 767 28047 45961 288 82 7568 13415
22177 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849
157 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 157 252 223 157 252 249 157 252 227 157 253 223 157 252 231
8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 357 8807 44805 326 468 663 898 11241 8
15496
18435
220 18435
220 220 18435
220 220 220 18435
220 220 220 18435 198 220 220 220 18435
357
198 796
6 6980
15496 11 331 6 439 0 1374 389 345 30325 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252
18
2091
20370
24840
2091 20370
24840 2091
24840 20370
24840 24840
24840 2091 20370
198 220 628 220 628 198 220 197 220 197 197 220 197 198 220 220 198 220 220 220 198 220 220 220 220 198 220 220 220 220 220 198 8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 12520 99 247 8582 99 247 513 4747 23460 513 20370 23460 2091 23460 20370 23460 24840 23460 2091 20370 513 13 18 513 492 18 513 986 18 28053 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 47249 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252 40103 1421 18604 12466 121 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849 705 39115 6 33153 15506 63 15931 15931 16317 13896 3228 9805 3548 314 1053 587 705 44040 339 338 612 11 705 2200 345 1654 30 705 44 407 1654 314 1183 787 340 11 705 35 345 588 617 8887 30 775 6 26979 257 6 75 43
BIN models/ggml-vocab-llama-bpe.gguf (new file)
Binary file not shown.

102 models/ggml-vocab-llama-bpe.gguf.inp (new file)
@@ -0,0 +1,102 @@
(102 added lines: test inputs identical to models/ggml-vocab-bert-bge.gguf.inp above)
41 models/ggml-vocab-llama-bpe.gguf.out (new file)
@@ -0,0 +1,41 @@

220
256
262
197
198
271
1432
1602
9906 1917
22691 1917
9906 4435
22691 4435
22691 4435 0
9906 11 1917 0
22691 11 1917 0
420 374 11410 99 247 13 11055
86 23904 220 22 83 2005 42908 11729 3013 17156
79862 102118 13373 64571 34694 3114 112203 80112
21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 21549 223 21549 249 21549 227 45358 223 21549 231
9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 320 3323 43465 430 706 1202 1866 4037 8
9906
22691
220 22691
256 22691
262 22691
262 22691 198 262 22691
320
198 284
6 11639
9906 11 379 65948 0 2650 527 499 27623 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909
18
1644
8765
8765 18
8765 1644
8765 8765
8765 8765 18
8765 8765 1644
8765 8765 8765
198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43
Binary file not shown. (models/ggml-vocab-llama-spm.gguf, new file)

102 models/ggml-vocab-llama-spm.gguf.inp (new file)
@@ -0,0 +1,102 @@
(102 added lines: test inputs identical to models/ggml-vocab-bert-bge.gguf.inp above)
41 models/ggml-vocab-llama-spm.gguf.out (new file)
@@ -0,0 +1,41 @@

259
1678
268
29871 12
29871 13
29871 13 13
29871 13 13 13
29871 12 13
15043 3186
29871 15043 3186
15043 2787
29871 15043 2787
29871 15043 2787 29991
15043 29892 3186 29991
29871 15043 29892 3186 29991
29871 445 338 29871 243 162 169 156 29889 8223
281 29900 29946 29947 29871 29955 9161 13535 18031 2176 6905
1538 4851 665 1386 29713 1305
29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 228 161 132 228 161 158 228 161 136 228 162 132 228 161 140
29871 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 313 6194 953 29877 2397 393 756 967 1914 5993 29897
15043
29871 15043
259 15043
1678 15043
268 15043
268 15043 13 1678 15043
29871 313
29871 13 353
525 3152
15043 29892 343 29915 497 29991 1128 526 366 29871 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739
29871 29941
29871 29941 29941
29871 29941 29941 29941
29871 29941 29941 29941 29941
29871 29941 29941 29941 29941 29941
29871 29941 29941 29941 29941 29941 29941
29871 29941 29941 29941 29941 29941 29941 29941
29871 29941 29941 29941 29941 29941 29941 29941 29941
29871 29941 29941 29941 29941 29941 29941 29941 29941 29941
29871 13 29871 13 13 29871 13 13 13 29871 12 29871 12 12 29871 12 13 259 13 1678 13 268 13 418 13 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 29871 243 162 169 156 243 162 169 156 29871 29941 29871 29941 29941 29871 29941 29941 29941 29871 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29941 29871 29941 29889 29941 29871 29941 636 29941 29871 29941 856 29941 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739 448 23648 2751 25512 1538 4851 665 1386 29713 1305 14550 4907 11120 16159 16159 16159 15945 15945 3045 636 6824 6824 6824 8773 8773 8773 306 29915 345 1063 525 29873 1025 540 29915 29879 727 29892 525 1525 366 1854 29973 525 29924 451 1854 306 29915 645 1207 372 29892 525 29928 366 763 777 23429 29973 1334 29915 29963 29872 263 29915 29880 29931
Binary file not shown. (models/ggml-vocab-mpt.gguf, new file)

102 models/ggml-vocab-mpt.gguf.inp (new file)
@@ -0,0 +1,102 @@
(102 added lines: test inputs identical to models/ggml-vocab-bert-bge.gguf.inp above)
41 models/ggml-vocab-mpt.gguf.out (new file)
@@ -0,0 +1,41 @@

209
50276
50275
186
187
535
2756
186 187
12092 1533
24387 1533
12092 3645
24387 3645
24387 3645 2
12092 13 1533 2
24387 13 1533 2
436 310 22692 101 236 15 14161
88 27244 818 16853 16392 20505 4989 11917
32520 11514 1068 8713 38177 13396 3415 9925 12559 10453 1389
18081 211 18081 116 18081 230 39936 222 18081 226 39936 213 18081 233 18081 117 18081 242 39936 212 18081 242 18081 97 18081 116 18081 216 18081 212 18081 238 18081 216 39936 212 18081 220
14931 237 211 313 6320 10 49042 116 325 224 14931 223 106 171 118 226 313 34263 802 13511 261 32147 456 10 3384 239 216 313 7483 802 80 8020 326 556 697 1211 10669 10
12092
24387
50276 12092
50275 12092
50274 12092
50274 12092 187 50274 12092
313
187 426
8 8685
12092 13 340 8 455 2 1359 403 368 49042 212 3736 15367 41197 13610 19934 41869 21275 1012 1047 18795 40120 20422 241
20
1610
20084
26409
1610 20084
26409 1610
26409 20084
26409 26409
26409 1610 20084
586 1744 33525 186 209 623 28910 187 50276 187 50275 187 50274 187 50273 187 14931 237 211 313 6320 10 49042 116 325 224 14931 223 106 171 118 226 313 34263 802 13511 261 32147 456 10 3384 239 216 22692 101 236 14931 101 236 495 5922 30057 495 20084 495 26409 30057 20084 495 26409 1610 495 26409 20084 495 15 20 495 537 20 495 1051 20 209 18081 211 18081 116 18081 230 39936 222 18081 226 39936 213 18081 233 18081 117 18081 242 39936 212 18081 242 18081 97 18081 116 18081 216 14931 235 212 3736 15367 41197 13610 19934 41869 21275 1012 1047 18795 40120 20422 241 16081 6877 12880 11514 1068 8713 38177 13396 3415 9925 12559 10453 1389 42011 35033 34842 11202 9739 9739 33021 18963 4672 25561 8220 309 1849 644 686 42618 344 434 627 13 686 1848 368 2119 32 686 46 417 2119 309 1833 1056 352 13 686 37 368 751 690 10331 32 844 8 31516 247 8 77 45
BIN models/ggml-vocab-phi-3.gguf (new file)
Binary file not shown.

102 models/ggml-vocab-phi-3.gguf.inp (new file)
@@ -0,0 +1,102 @@
(102 added lines: test inputs identical to models/ggml-vocab-bert-bge.gguf.inp above)
41 models/ggml-vocab-phi-3.gguf.out Normal file
@ -0,0 +1,41 @@

259
1678
268
29871 12
29871 13
29871 13 13
29871 13 13 13
29871 12 13
15043 3186
29871 15043 3186
15043 2787
29871 15043 2787
29871 15043 2787 29991
15043 29892 3186 29991
29871 15043 29892 3186 29991
29871 445 338 29871 243 162 169 156 29889 8223
281 29900 29946 29947 29871 29955 9161 13535 18031 2176 6905
1538 4851 665 1386 29713 1305
29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 228 161 132 228 161 158 228 161 136 228 162 132 228 161 140
29871 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 313 6194 953 29877 2397 393 756 967 1914 5993 29897
15043
29871 15043
259 15043
1678 15043
268 15043
268 15043 13 1678 15043
29871 313
29871 13 353
525 3152
15043 29892 343 29915 497 29991 1128 526 366 29871 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739
29871 29941
29871 29941 29941
29871 29941 29941 29941
29871 29941 29941 29941 29941
29871 29941 29941 29941 29941 29941
29871 29941 29941 29941 29941 29941 29941
29871 29941 29941 29941 29941 29941 29941 29941
29871 29941 29941 29941 29941 29941 29941 29941 29941
29871 29941 29941 29941 29941 29941 29941 29941 29941 29941
29871 13 29871 13 13 29871 13 13 13 29871 12 29871 12 12 29871 12 13 259 13 1678 13 268 13 418 13 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 29871 243 162 169 156 243 162 169 156 29871 29941 29871 29941 29941 29871 29941 29941 29941 29871 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29941 29871 29941 29889 29941 29871 29941 636 29941 29871 29941 856 29941 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739 448 23648 2751 25512 1538 4851 665 1386 29713 1305 14550 4907 11120 16159 16159 16159 15945 15945 3045 636 6824 6824 6824 8773 8773 8773 306 29915 345 1063 525 29873 1025 540 29915 29879 727 29892 525 1525 366 1854 29973 525 29924 451 1854 306 29915 645 1207 372 29892 525 29928 366 763 777 23429 29973 1334 29915 29963 29872 263 29915 29880 29931
BIN models/ggml-vocab-starcoder.gguf Normal file
Binary file not shown.
102 models/ggml-vocab-starcoder.gguf.inp Normal file
@ -0,0 +1,102 @@
(same 102-line tokenizer test corpus as models/ggml-vocab-phi-3.gguf.inp above)
41 models/ggml-vocab-starcoder.gguf.out Normal file
@ -0,0 +1,41 @@

244
280
283
221
222
499
3067
15767
8302 5810
12009 5810
8302 10914
12009 10914
12009 10914 38
8302 49 5810 38
12009 49 5810 38
477 458 5954 137 271 51 3779
124 53 57 61 244 60 121 1726 12568 10240 1519 8290
39916 8389 1059 9504 40216 13858 2073 8983 12571 1539 10721
14566 246 14566 152 14566 265 30428 257 14566 261 30428 248 14566 268 14566 153 14566 277 30428 247 14566 277 14566 133 14566 152 14566 251 14566 247 14566 273 14566 251 30428 247 14566 255
3822 272 246 327 4434 46 18445 152 46030 45022 142 13878 327 12585 19884 33773 40920 751 46 41839 327 2605 22716 708 1421 2840 4387 2421 46
8302
12009
244 12009
280 12009
283 12009
283 12009 303 12009
327
222 299
44 34719
8302 49 553 44 483 38 4998 904 863 18445 247 1037 4995 13379 2924 9515 17823 54 56 54 57 54 58 54 11904 47892
56
56 56
56 56 56
56 56 56 56
56 56 56 56 56
56 56 56 56 56 56
56 56 56 56 56 56 56
56 56 56 56 56 56 56 56
56 56 56 56 56 56 56 56 56
353 736 8886 221 10883 4238 16101 28540 222 3822 272 246 327 4434 46 18445 152 46030 45022 142 13878 327 12585 19884 33773 40920 751 46 41839 5954 137 271 3822 137 271 244 56 244 56 56 244 56 56 56 244 56 56 56 56 244 56 56 56 56 56 244 56 56 56 56 56 56 244 56 56 56 56 56 56 56 244 56 56 56 56 56 56 56 56 244 56 51 56 244 56 516 56 244 56 1198 56 244 14566 246 14566 152 14566 265 30428 257 14566 261 30428 248 14566 268 14566 153 14566 277 30428 247 14566 277 14566 133 14566 152 14566 251 36570 247 1037 4995 13379 2924 9515 17823 54 56 54 57 54 58 54 11904 47892 20895 16625 13047 8389 1059 9504 40216 13858 2073 8983 12571 1539 10721 5918 9643 13298 932 31723 31330 9221 3226 35426 10400 457 4783 2602 349 121 1477 957 1200 2038 49 349 632 863 3673 68 349 82 666 3673 457 4650 1949 580 49 349 73 863 2144 1649 35941 68 2726 44 7728 331 44 113 81
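The .inp/.out pairs above drive the new unified tokenizer test (see tests/test-tokenizer-0.cpp further down in this diff): the .inp file holds the prompts separated by a __ggml_vocab_test__ marker line, and the .out file holds one line of expected token IDs per prompt. A minimal standalone sketch of the splitting step, assuming only the separator convention visible above (not the project's actual code):

// Sketch only: split an .inp corpus into individual prompts on the
// "\n__ggml_vocab_test__\n" separator used by the files above.
#include <cstdio>
#include <string>
#include <vector>

static std::vector<std::string> split_tests(const std::string & raw) {
    static const std::string sep = "\n__ggml_vocab_test__\n";
    std::vector<std::string> prompts;
    size_t pos = 0;
    while (pos < raw.size()) {
        const size_t next = raw.find(sep, pos);
        if (next == std::string::npos) {
            prompts.push_back(raw.substr(pos));
            break;
        }
        prompts.push_back(raw.substr(pos, next - pos));
        pos = next + sep.size();
    }
    return prompts;
}

int main() {
    const std::string inp = "Hello world\n__ggml_vocab_test__\n Hello world\n__ggml_vocab_test__\n";
    for (const auto & prompt : split_tests(inp)) {
        printf("prompt: '%s'\n", prompt.c_str()); // prints the two prompts
    }
    return 0;
}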
requirements.txt
@ -7,6 +7,7 @@
 -r ./requirements/requirements-convert.txt

 -r ./requirements/requirements-convert-hf-to-gguf.txt
+-r ./requirements/requirements-convert-hf-to-gguf-update.txt
 -r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
 -r ./requirements/requirements-convert-lora-to-ggml.txt
 -r ./requirements/requirements-convert-persimmon-to-gguf.txt
3 requirements/requirements-convert-hf-to-gguf-update.txt Normal file
@ -0,0 +1,3 @@
+-r ./requirements-convert.txt
+torch~=2.1.1
+einops~=0.7.0
scripts/check-requirements.sh
@ -168,6 +168,11 @@ fi

 check_convert_script convert.py
 for py in convert-*.py; do
+    # skip convert-hf-to-gguf-update.py
+    # TODO: the check is failing for some reason:
+    #       https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920
+    [[ $py == convert-hf-to-gguf-update.py ]] && continue
+
     check_convert_script "$py"
 done
170 sgemm.cpp
@ -50,7 +50,6 @@
 #pragma GCC diagnostic ignored "-Wignored-attributes"

 #include "sgemm.h"
-#include <algorithm>
 #include "ggml-impl.h"
 #include "ggml-quants.h"

@ -243,23 +242,23 @@ template <> inline __m512 load(const ggml_fp16_t *p) {
 template <int KN, typename D, typename V, typename TA, typename TB, typename TC>
 class tinyBLAS {
   public:
-    tinyBLAS(int k,
-             const TA *A, int lda,
-             const TB *B, int ldb,
-             TC *C, int ldc,
+    tinyBLAS(int64_t k,
+             const TA *A, int64_t lda,
+             const TB *B, int64_t ldb,
+             TC *C, int64_t ldc,
              int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(int m, int n, int task) {
+    void matmul(int64_t m, int64_t n, int task) {
         if (task == GGML_TASK_TYPE_COMPUTE)
             mnpack(0, m, 0, n);
     }

   private:
-    NOINLINE void mnpack(int m0, int m, int n0, int n) {
-        int mc, nc, mp, np;
-        switch ((std::min(m - m0, 5) << 4) | std::min(n - n0, 5)) {
+    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t mc, nc, mp, np;
+        switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) {
 #if VECTOR_REGISTERS == 32
         case 0x55:
             mc = 5;

@ -409,27 +408,27 @@ class tinyBLAS {
     }

     template <int RM, int RN>
-    NOINLINE void gemm(int m0, int m, int n0, int n) {
-        int ytiles = (m - m0) / RM;
-        int xtiles = (n - n0) / RN;
-        int tiles = xtiles * ytiles;
-        int duty = (tiles + nth - 1) / nth;
-        int start = duty * ith;
-        int end = start + duty;
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
         if (end > tiles)
             end = tiles;
-        for (int job = start; job < end; ++job) {
-            int ii = m0 + job / xtiles * RM;
-            int jj = n0 + job % xtiles * RN;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
             D Cv[RN][RM] = {};
-            for (int l = 0; l < k; l += KN)
-                for (int j = 0; j < RN; ++j)
-                    for (int i = 0; i < RM; ++i)
+            for (int64_t l = 0; l < k; l += KN)
+                for (int64_t j = 0; j < RN; ++j)
+                    for (int64_t i = 0; i < RM; ++i)
                         Cv[j][i] = madd(load<V>(A + lda * (ii + i) + l),
                                         load<V>(B + ldb * (jj + j) + l),
                                         Cv[j][i]);
-            for (int j = 0; j < RN; ++j)
-                for (int i = 0; i < RM; ++i)
+            for (int64_t j = 0; j < RN; ++j)
+                for (int64_t i = 0; i < RM; ++i)
                     C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
         }
     }

@ -437,10 +436,10 @@ class tinyBLAS {
     const TA *const A;
     const TB *const B;
     TC *const C;
-    const int k;
-    const int lda;
-    const int ldb;
-    const int ldc;
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
     const int ith;
     const int nth;
 };

@ -452,23 +451,23 @@ class tinyBLAS {
 template <typename TA>
 class tinyBLAS_Q0_ARM {
   public:
-    tinyBLAS_Q0_ARM(int k,
-                    const TA *A, int lda,
-                    const block_q8_0 *B, int ldb,
-                    float *C, int ldc,
+    tinyBLAS_Q0_ARM(int64_t k,
+                    const TA *A, int64_t lda,
+                    const block_q8_0 *B, int64_t ldb,
+                    float *C, int64_t ldc,
                     int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(int m, int n, int task) {
+    void matmul(int64_t m, int64_t n, int task) {
         if (task == GGML_TASK_TYPE_COMPUTE)
             mnpack(0, m, 0, n);
     }

   private:
-    NOINLINE void mnpack(int m0, int m, int n0, int n) {
-        int mc, nc, mp, np;
-        switch ((std::min(m - m0, 3) << 4) | std::min(n - n0, 3)) {
+    NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t mc, nc, mp, np;
+        switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) {
         case 0x33:
             mc = 3;
             nc = 3;

@ -524,22 +523,22 @@ class tinyBLAS_Q0_ARM {
     }

     template <int RM, int RN>
-    NOINLINE void gemm(int m0, int m, int n0, int n) {
-        int ytiles = (m - m0) / RM;
-        int xtiles = (n - n0) / RN;
-        int tiles = xtiles * ytiles;
-        int duty = (tiles + nth - 1) / nth;
-        int start = duty * ith;
-        int end = start + duty;
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
         if (end > tiles)
             end = tiles;
-        for (int job = start; job < end; ++job) {
-            int ii = m0 + job / xtiles * RM;
-            int jj = n0 + job % xtiles * RN;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
             float32x4_t Cv[RN][RM] = {};
-            for (int l = 0; l < k; ++l)
-                for (int j = 0; j < RN; ++j)
-                    for (int i = 0; i < RM; ++i)
+            for (int64_t l = 0; l < k; ++l)
+                for (int64_t j = 0; j < RN; ++j)
+                    for (int64_t i = 0; i < RM; ++i)
                         Cv[j][i] = vmlaq_n_f32(Cv[j][i],
                                                vcvtq_f32_s32(vdotq_s32(
                                                    vdotq_s32(vdupq_n_s32(0),

@ -549,8 +548,8 @@ class tinyBLAS_Q0_ARM {
                                                        load_hi(B + ldb * (jj + j) + l))),
                                                unhalf(A[lda * (ii + i) + l].d) *
                                                unhalf(B[ldb * (jj + j) + l].d));
-            for (int j = 0; j < RN; ++j)
-                for (int i = 0; i < RM; ++i)
+            for (int64_t j = 0; j < RN; ++j)
+                for (int64_t i = 0; i < RM; ++i)
                     C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
         }
     }

@ -577,10 +576,10 @@ class tinyBLAS_Q0_ARM {
     const TA *const A;
     const block_q8_0 *const B;
     float *const C;
-    const int k;
-    const int lda;
-    const int ldb;
-    const int ldc;
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
     const int ith;
     const int nth;
 };

@ -590,23 +589,23 @@ class tinyBLAS_Q0_ARM {
 template <typename TA, typename TB, typename TC>
 class tinyBLAS_Q0_AVX2 {
   public:
-    tinyBLAS_Q0_AVX2(int k,
-                     const TA *A, int lda,
-                     const TB *B, int ldb,
-                     TC *C, int ldc,
+    tinyBLAS_Q0_AVX2(int64_t k,
+                     const TA *A, int64_t lda,
+                     const TB *B, int64_t ldb,
+                     TC *C, int64_t ldc,
                      int ith, int nth)
         : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) {
     }

-    void matmul(int m, int n, int task) {
+    void matmul(int64_t m, int64_t n, int task) {
         if (task == GGML_TASK_TYPE_COMPUTE)
             mnpack(0, m, 0, n);
     }

   private:
-    void mnpack(int m0, int m, int n0, int n) {
-        int mc, nc, mp, np;
-        switch ((std::min(m - m0, 4) << 4) | std::min(n - n0, 4)) {
+    void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t mc, nc, mp, np;
+        switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) {
 #if VECTOR_REGISTERS == 32
         case 0x44:
             mc = 4;

@ -714,22 +713,22 @@ class tinyBLAS_Q0_AVX2 {
     }

     template <int RM, int RN>
-    NOINLINE void gemm(int m0, int m, int n0, int n) {
-        int ytiles = (m - m0) / RM;
-        int xtiles = (n - n0) / RN;
-        int tiles = xtiles * ytiles;
-        int duty = (tiles + nth - 1) / nth;
-        int start = duty * ith;
-        int end = start + duty;
+    NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) {
+        int64_t ytiles = (m - m0) / RM;
+        int64_t xtiles = (n - n0) / RN;
+        int64_t tiles = xtiles * ytiles;
+        int64_t duty = (tiles + nth - 1) / nth;
+        int64_t start = duty * ith;
+        int64_t end = start + duty;
         if (end > tiles)
             end = tiles;
-        for (int job = start; job < end; ++job) {
-            int ii = m0 + job / xtiles * RM;
-            int jj = n0 + job % xtiles * RN;
+        for (int64_t job = start; job < end; ++job) {
+            int64_t ii = m0 + job / xtiles * RM;
+            int64_t jj = n0 + job % xtiles * RN;
             __m256 Cv[RN][RM] = {};
-            for (int l = 0; l < k; ++l)
-                for (int j = 0; j < RN; ++j)
-                    for (int i = 0; i < RM; ++i)
+            for (int64_t l = 0; l < k; ++l)
+                for (int64_t j = 0; j < RN; ++j)
+                    for (int64_t i = 0; i < RM; ++i)
                         Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
                                                        unhalf(B[ldb * (jj + j) + l].d)),
                                         updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),

@ -737,8 +736,8 @@ class tinyBLAS_Q0_AVX2 {
                                               _mm256_sign_epi8(load(B + ldb * (jj + j) + l),
                                                                load(A + lda * (ii + i) + l))),
                                         Cv[j][i]);
-            for (int j = 0; j < RN; ++j)
-                for (int i = 0; i < RM; ++i)
+            for (int64_t j = 0; j < RN; ++j)
+                for (int64_t i = 0; i < RM; ++i)
                     C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
         }
     }

@ -771,10 +770,10 @@ class tinyBLAS_Q0_AVX2 {
     const TA *const A;
     const TB *const B;
     TC *const C;
-    const int k;
-    const int lda;
-    const int ldb;
-    const int ldc;
+    const int64_t k;
+    const int64_t lda;
+    const int64_t ldb;
+    const int64_t ldc;
     const int ith;
     const int nth;
 };

@ -813,8 +812,8 @@ class tinyBLAS_Q0_AVX2 {
  * @param Ctype is GGML data type of `C`
  * @return true if this function was able to service the matmul request
  */
-bool llamafile_sgemm(int m, int n, int k, const void *A, int lda, const void *B, int ldb, void *C,
-                     int ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {
+bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C,
+                     int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) {

     assert(m >= 0);
     assert(n >= 0);

@ -824,9 +823,6 @@ bool llamafile_sgemm(int m, int n, int k, const void *A, int lda, const void *B,
     assert(ldc >= m);
     assert(nth > 0);
     assert(ith < nth);
-    assert(1ll * lda * m <= 0x7fffffff);
-    assert(1ll * ldb * n <= 0x7fffffff);
-    assert(1ll * ldc * n <= 0x7fffffff);

     if (Ctype != GGML_TYPE_F32)
         return false;
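Note the switch from std::min to a MIN macro alongside the int64_t widening: std::min deduces a single template type, so std::min(m - m0, 5) stops compiling once m and m0 are int64_t while 5 is int, which is presumably also why the <algorithm> include is dropped. A small standalone illustration, with the macro definition assumed here for self-containment (ggml defines an equivalent MIN):

#include <cstdint>

// Assumed for this sketch; mirrors the usual two-argument MIN macro.
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main() {
    const int64_t m = 7, m0 = 0;
    // std::min(m - m0, 5);          // error: conflicting deduced types (int64_t vs int)
    const int64_t mc = MIN(m - m0, 5); // fine: usual arithmetic conversions promote 5
    return (int) mc;
}

The 3ll literal in MIN(n - n0, 3ll) above is harmless for the same reason: the macro only needs the operands to be comparable, not identically typed.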
6 sgemm.h
@ -1,11 +1,13 @@
 #pragma once
+#include <stdint.h>
 #include <stdbool.h>
 #ifdef __cplusplus
 extern "C" {
 #endif

-bool llamafile_sgemm(int, int, int, const void *, int, const void *, int,
-                     void *, int, int, int, int, int, int, int);
+bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t,
+                     const void *, int64_t, void *, int64_t, int, int,
+                     int, int, int, int);

 #ifdef __cplusplus
 }
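The motivation for widening every dimension and stride to int64_t is visible in the assertions deleted from sgemm.cpp above: with 32-bit int parameters, any matrix for which lda * m exceeds 0x7fffffff could not be indexed safely, so llamafile_sgemm had to reject such shapes outright. A standalone sketch (not from the patch) of the wraparound the change avoids:

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t lda = 100000, m = 100000;
    const int64_t idx64 = lda * m;         // 10000000000, computed exactly
    const int32_t idx32 = (int32_t) idx64; // truncated to 32 bits: garbage offset
    printf("int64_t index: %lld\n", (long long) idx64);
    printf("int32_t index: %d (wrapped)\n", idx32);
    return 0;
}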
tests/CMakeLists.txt
@ -1,10 +1,40 @@
+function(llama_test target)
+    include(CMakeParseArguments)
+    set(options)
+    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
+    set(multiValueArgs ARGS)
+    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if (NOT DEFINED LLAMA_TEST_LABEL)
+        set(LLAMA_TEST_LABEL "main")
+    endif()
+    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
+        set(LLAMA_TEST_WORKING_DIRECTORY .)
+    endif()
+    if (DEFINED LLAMA_TEST_NAME)
+        set(TEST_NAME ${LLAMA_TEST_NAME})
+    else()
+        set(TEST_NAME ${target})
+    endif()
+
+    set(TEST_TARGET ${target})
+
+    add_test(
+        NAME ${TEST_NAME}
+        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
+        COMMAND $<TARGET_FILE:${TEST_TARGET}>
+        ${LLAMA_TEST_ARGS})
+
+    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
+endfunction()
+
 # Builds and runs a test source file.
 # Optional args:
 # - NAME: name of the executable & test target (defaults to the source file name without extension)
 # - LABEL: label for the test (defaults to main)
 # - ARGS: arguments to pass to the test executable
 # - WORKING_DIRECTORY
-function(llama_test source)
+function(llama_target_and_test source)
     include(CMakeParseArguments)
     set(options)
     set(oneValueArgs NAME LABEL WORKING_DIRECTORY)

@ -35,41 +65,67 @@ function(llama_test source)
     set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
 endfunction()

-# llama_test(test-double-float.cpp) # SLOW
-llama_test(test-quantize-fns.cpp)
-llama_test(test-quantize-perf.cpp)
-llama_test(test-sampling.cpp)
-llama_test(test-chat-template.cpp)
-
-llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-
-llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
-
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-stablelm-3b-4e1t ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
-#llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
-
-llama_test(test-grammar-parser.cpp)
-llama_test(test-llama-grammar.cpp)
-llama_test(test-grammar-integration.cpp)
-llama_test(test-grad0.cpp)
-# llama_test(test-opt.cpp) # SLOW
-llama_test(test-backend-ops.cpp)
-
-llama_test(test-rope.cpp)
-
-llama_test(test-model-load-cancel.cpp LABEL "model")
-llama_test(test-autorelease.cpp LABEL "model")
-
-llama_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+# build test-tokenizer-0 target once and add many tests
+add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
+target_link_libraries(test-tokenizer-0 PRIVATE common)
+install(TARGETS test-tokenizer-0 RUNTIME)
+
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
+# TODO: enable when fixed
+#llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
+
+# build test-tokenizer-1-bpe target once and add many tests
+add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
+target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
+install(TARGETS test-tokenizer-1-bpe RUNTIME)
+
+# TODO: disabled due to slowness
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf)
+
+# build test-tokenizer-1-spm target once and add many tests
+add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
+target_link_libraries(test-tokenizer-1-spm PRIVATE common)
+install(TARGETS test-tokenizer-1-spm RUNTIME)
+
+llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
+#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
+# llama_target_and_test(test-double-float.cpp) # SLOW
+llama_target_and_test(test-quantize-fns.cpp)
+llama_target_and_test(test-quantize-perf.cpp)
+llama_target_and_test(test-sampling.cpp)
+llama_target_and_test(test-chat-template.cpp)
+
+llama_target_and_test(test-grammar-parser.cpp)
+llama_target_and_test(test-llama-grammar.cpp)
+llama_target_and_test(test-grammar-integration.cpp)
+llama_target_and_test(test-grad0.cpp)
+# llama_target_and_test(test-opt.cpp) # SLOW
+llama_target_and_test(test-backend-ops.cpp)
+
+llama_target_and_test(test-rope.cpp)
+
+llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
+llama_target_and_test(test-autorelease.cpp LABEL "model")
+
+llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
 target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)

 # dummy executable - not installed
tests/test-tokenizer-0-bpe.py
@ -1,4 +1,11 @@
 # tests with BPE tokenizer
+#
+# sample usage:
+#
+#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
+#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/
+#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/
+#

 import argparse

@ -20,6 +27,8 @@ tests = [
         " ",
         "\t",
         "\n",
+        "\n\n",
+        "\n\n\n",
         "\t\n",
         "Hello world",
         " Hello world",

@ -39,8 +48,19 @@ tests = [
         "   Hello",
         "    Hello",
         "    Hello\n    Hello",
+        " (",
         "\n =",
         "' era",
+        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
+        "3",
+        "33",
+        "333",
+        "3333",
+        "33333",
+        "333333",
+        "3333333",
+        "33333333",
+        "333333333",
     ]

 for text in tests:

@ -76,7 +96,22 @@ if fname_tok:
     # write to file
     with open(fname_out, 'w', encoding='utf-8') as f:
         for x in res:
-            f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
+            # LLaMA v3 for some reason strips the space for these tokens (and others)
+            # if x == 662:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 1174:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 2564:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 758:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 949:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # elif x == 5354:
+            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
+            # else:
+            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
+            f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
     print('len(res): ', len(res))
     print('len(lines): ', len(lines))
     print('results written to: ', fname_out)
tests/test-tokenizer-0-falcon.cpp (deleted)
@ -1,187 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "console.h"
-
-#include <cstdio>
-#include <string>
-#include <map>
-#include <vector>
-#include <fstream>
-
-// generate using test-tokenizer-0-falcon.py
-static const std::map<std::string, std::vector<llama_token>> & k_tests() {
-    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { "", { }, },
-        { " ", { 204, }, },
-        { "  ", { 258, }, },
-        { "   ", { 466, }, },
-        { "\t", { 192, }, },
-        { "\n", { 193, }, },
-        { "\t\n", { 19125, }, },
-        { "Hello world", { 9856, 1079, }, },
-        { " Hello world", { 23090, 1079, }, },
-        { "Hello World", { 9856, 2889, }, },
-        { " Hello World", { 23090, 2889, }, },
-        { " Hello World!", { 23090, 2889, 12, }, },
-        { "Hello, world!", { 9856, 23, 1079, 12, }, },
-        { " Hello, world!", { 23090, 23, 1079, 12, }, },
-        { " this is 🦙.cpp", { 414, 304, 3346, 111, 231, 25, 29247, }, },
-        { "w048 7tuijk dsdfhu", { 98, 55866, 204, 34, 16682, 7149, 36190, 6869, 11481, }, },
-        { "нещо на Български", { 150, 133, 6207, 151, 215, 150, 134, 5052, 133, 6279, 5052, 223, 151, 216, 49679, 123, 53110, 47043, 7795, }, },
-        { "កាន់តែពិសេសអាចខលចេញ", { 38154, 206, 38154, 126, 38154, 225, 167, 237, 217, 38154, 221, 167, 237, 208, 38154, 228, 38154, 127, 38154, 237, 167, 237, 207, 38154, 237, 38154, 107, 38154, 126, 38154, 211, 38154, 207, 38154, 233, 38154, 211, 167, 237, 207, 38154, 215, }, },
-        { "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 2571, 232, 206, 204, 19, 11003, 20, 8196, 126, 283, 219, 48778, 116, 13392, 204, 19, 51831, 732, 63209, 1741, 7955, 522, 20, 22438, 211, 204, 19, 7927, 53360, 325, 504, 701, 946, 10930, 20, }, },
-        { "Hello", { 9856, }, },
-        { " Hello", { 23090, }, },
-        { "  Hello", { 204, 23090, }, },
-        { "   Hello", { 258, 23090, }, },
-        { "    Hello", { 466, 23090, }, },
-        { "    Hello\n    Hello", { 466, 23090, 742, 23090, }, },
-        { "\n =", { 1212, 40, }, },
-        { "' era", { 18, 4932, }, },
-    };
-
-    return _k_tests;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    std::string fname_text;
-    if (argc > 2) {
-        fname_text = argv[2];
-    }
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
-        fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__);
-        llama_free_model(model);
-        llama_free(ctx);
-        return 2;
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    bool success = true;
-
-    for (const auto & test_kv : k_tests()) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, false);
-
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
-        printf("tok: ");
-        for (const auto & tok : res) {
-            printf("%d ", tok);
-        }
-        printf("\n");
-
-        bool correct = res.size() == test_kv.second.size();
-
-        for (int i = 0; i < (int) res.size() && correct; ++i) {
-            if (test_kv.second[i] != res[i]) {
-                correct = false;
-            }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize_bpe(ctx, res).c_str(),
-                llama_detokenize_bpe(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-
-            success = false;
-        }
-    }
-
-    if (!fname_text.empty()) {
-        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
-
-        std::string text;
-        {
-            std::ifstream ifs(fname_text);
-            if (!ifs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
-                return 1;
-            }
-            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
-        }
-
-        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, false);
-
-        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
-
-        {
-            const std::string fname_out = fname_text + ".tokcpp";
-
-            std::ofstream ofs(fname_out);
-            if (!ofs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
-                return 1;
-            }
-
-            for (const auto & tok : res) {
-                ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector<int>{tok}) << "'" << std::endl;
-            }
-        }
-
-        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
-    }
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return success ? 0 : 3;
-}
tests/test-tokenizer-0-llama.cpp (deleted)
@ -1,190 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "console.h"
-
-#include <cstdio>
-#include <string>
-#include <map>
-#include <vector>
-#include <fstream>
-
-// generate using test-tokenizer-0-llama.py
-static const std::map<std::string, std::vector<llama_token>> & k_tests() {
-    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-        { "", { }, },
-        { " ", { 259, }, },
-        { "  ", { 1678, }, },
-        { "   ", { 268, }, },
-        { "\t", { 29871, 12, }, },
-        { "\n", { 29871, 13, }, },
-        { "\t\n", { 29871, 12, 13, }, },
-        { "Hello world", { 15043, 3186, }, },
-        { " Hello world", { 29871, 15043, 3186, }, },
-        { "Hello World", { 15043, 2787, }, },
-        { " Hello World", { 29871, 15043, 2787, }, },
-        { " Hello World!", { 29871, 15043, 2787, 29991, }, },
-        { "Hello, world!", { 15043, 29892, 3186, 29991, }, },
-        { " Hello, world!", { 29871, 15043, 29892, 3186, 29991, }, },
-        { " this is 🦙.cpp", { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
-        { "w048 7tuijk dsdfhu", { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
-        { "нещо на Български", { 1538, 4851, 665, 1386, 29713, 1305, }, },
-        { "កាន់តែពិសេសអាចខលចេញ", { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, },
-        { "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
-        { "Hello", { 15043, }, },
-        { " Hello", { 29871, 15043, }, },
-        { "  Hello", { 259, 15043, }, },
-        { "   Hello", { 1678, 15043, }, },
-        { "    Hello", { 268, 15043, }, },
-        { "    Hello\n    Hello", { 268, 15043, 13, 1678, 15043, }, },
-        { " (", { 29871, 313, }, },
-    };
-
-    return _k_tests;
-}
-
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    std::string fname_text;
-    if (argc > 2) {
-        fname_text = argv[2];
-    }
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) {
-        fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__);
-        llama_free_model(model);
-        llama_free(ctx);
-        return 2;
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    bool success = true;
-
-    for (const auto & test_kv : k_tests()) {
-        const std::vector<llama_token> res_bos   = llama_tokenize(ctx, test_kv.first, true);
-        const std::vector<llama_token> res_nobos = llama_tokenize(ctx, test_kv.first, false);
-
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str());
-        printf("tok: ");
-        for (const auto & tok : res_bos) {
-            printf("%d ", tok);
-        }
-        printf("\n");
-
-        bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1;
-
-        for (int i = 0; i < (int) res_nobos.size() && correct; ++i) {
-            if (test_kv.second[i] != res_bos[i + 1]) {
-                correct = false;
-            }
-            if (test_kv.second[i] != res_nobos[i]) {
-                correct = false;
-            }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize_spm(ctx, res_nobos).c_str(),
-                llama_detokenize_spm(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res_nobos) {
-                fprintf(stderr, "%6d, ", t);
-            }
-            fprintf(stderr, "\n");
-
-            success = false;
-        }
-    }
-
-    if (!fname_text.empty()) {
-        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
-
-        std::string text;
-        {
-            std::ifstream ifs(fname_text);
-            if (!ifs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
-                return 1;
-            }
-            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
-        }
-
-        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-
-        const std::vector<llama_token> res = llama_tokenize(ctx, text, true);
-
-        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
-
-        {
-            const std::string fname_out = fname_text + ".tokcpp";
-
-            std::ofstream ofs(fname_out);
-            if (!ofs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
-                return 1;
-            }
-
-            for (const auto & tok : res) {
-                ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
-            }
-        }
-
-        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
-    }
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return success ? 0 : 3;
-}
tests/test-tokenizer-0-spm.py
@ -1,4 +1,11 @@
 # tests with SPM tokenizer
+#
+# sample usage:
+#
+#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/Llama-2-7b-hf/
+#   python3 tests/test-tokenizer-0-spm.py ~/Data/huggingface/CodeLlama-34b-Instruct-hf/
+#
+

 import argparse

@ -20,6 +27,8 @@ tests = [
         " ",
         "\t",
         "\n",
+        "\n\n",
+        "\n\n\n",
         "\t\n",
         "Hello world",
         " Hello world",

@ -39,6 +48,19 @@ tests = [
         "   Hello",
         "    Hello",
         "    Hello\n    Hello",
+        " (",
+        "\n =",
+        "' era",
+        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
+        "3",
+        "33",
+        "333",
+        "3333",
+        "33333",
+        "333333",
+        "3333333",
+        "33333333",
+        "333333333",
     ]
271 tests/test-tokenizer-0.cpp Normal file
@ -0,0 +1,271 @@
+#include "llama.h"
+#include "common.h"
+#include "console.h"
+
+#include <cstdio>
+#include <string>
+#include <map>
+#include <vector>
+#include <fstream>
+
+//static const std::map<std::string, std::vector<llama_token>> & k_tests() {
+//    static std::map<std::string, std::vector<llama_token>> _k_tests = {
+//        { "", { }, },
+//        { " ", { 220, }, },
+//        { "  ", { 256, }, },
+//        { "   ", { 262, }, },
+//        { "\t", { 197, }, },
+//        { "\n", { 198, }, },
+//        { "\n\n", { 271, }, },
+//        { "\n\n\n", { 1432, }, },
+//        { "\t\n", { 1602, }, },
+//        { "Hello world", { 9906, 1917, }, },
+//        { " Hello world", { 22691, 1917, }, },
+//        { "Hello World", { 9906, 4435, }, },
+//        { " Hello World", { 22691, 4435, }, },
+//        { " Hello World!", { 22691, 4435, 0, }, },
+//        { "Hello, world!", { 9906, 11, 1917, 0, }, },
+//        { " Hello, world!", { 22691, 11, 1917, 0, }, },
+//        { " this is 🦙.cpp", { 420, 374, 11410, 99, 247, 13, 11055, }, },
+//        { "w048 7tuijk dsdfhu", { 86, 23904, 220, 22, 83, 2005, 42908, 11729, 3013, 17156, }, },
+//        { "нещо на Български", { 79862, 102118, 13373, 64571, 34694, 3114, 112203, 80112, }, },
+//        { "កាន់តែពិសេសអាចខលចេញ", { 21549, 222, 98629, 241, 45358, 233, 21549, 237, 45358, 224, 21549, 244, 21549, 115, 21549, 253, 45358, 223, 21549, 253, 21549, 95, 98629, 227, 21549, 223, 21549, 249, 21549, 227, 45358, 223, 21549, 231, }, },
+//        { "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 9468, 248, 222, 320, 8416, 8, 27623, 114, 102470, 9468, 234, 104, 31643, 320, 36773, 100166, 98634, 8, 26602, 227, 320, 3323, 43465, 430, 706, 1202, 1866, 4037, 8, }, },
+//        { "Hello", { 9906, }, },
+//        { " Hello", { 22691, }, },
+//        { "  Hello", { 220, 22691, }, },
+//        { "   Hello", { 256, 22691, }, },
+//        { "    Hello", { 262, 22691, }, },
+//        { "    Hello\n    Hello", { 262, 22691, 198, 262, 22691, }, },
+//        { " (", { 320, }, },
+//        { "\n =", { 198, 284, }, },
+//        { "' era", { 6, 11639, }, },
+//        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", { 9906, 11, 379, 65948, 0, 2650, 527, 499, 27623, 223, 949, 37046, 101067, 19000, 23182, 102301, 9263, 18136, 16, 36827, 21909, }, },
+//        { "3", { 18, }, },
+//        { "33", { 1644, }, },
+//        { "333", { 8765, }, },
+//        { "3333", { 8765, 18, }, },
+//        { "33333", { 8765, 1644, }, },
+//        { "333333", { 8765, 8765, }, },
+//        { "3333333", { 8765, 8765, 18, }, },
+//        { "33333333", { 8765, 8765, 1644, }, },
+//        { "333333333", { 8765, 8765, 8765, }, },
+//    };
+//
+//    return _k_tests;
+//}
+
+static std::map<std::string, std::vector<llama_token>> read_tests(const std::string & fname_inp, const std::string & fname_out) {
+    std::map<std::string, std::vector<llama_token>> tests;
+
+    std::ifstream ifs_inp(fname_inp);
+    if (!ifs_inp) {
+        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str());
+        return tests;
+    }
+
+    std::string sraw((std::istreambuf_iterator<char>(ifs_inp)), std::istreambuf_iterator<char>());
+
+    std::ifstream ifs_out(fname_out);
+    if (!ifs_out) {
+        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
+        return tests;
+    }
+
+    std::vector<std::string> sout;
+    for (std::string line; std::getline(ifs_out, line);) {
+        sout.push_back(line);
+    }
+
+    const std::string sep = "\n__ggml_vocab_test__\n";
+
+    std::vector<std::string> sinp;
+
+    size_t pos = 0;
+    while (pos < sraw.size()) {
+        const size_t next = sraw.find(sep, pos);
+        if (next == std::string::npos) {
+            sinp.push_back(sraw.substr(pos));
+            break;
+        }
+        sinp.push_back(sraw.substr(pos, next - pos));
+        pos = next + sep.size();
+    }
+
+    if (sinp.size() != sout.size()) {
+        fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__);
+        return tests;
+    }
+
+    for (size_t i = 0; i < sinp.size(); ++i) {
+        const std::string & s = sinp[i];
+        const std::string & o = string_strip(sout[i]);
+
+        std::vector<llama_token> toks;
+
+        size_t pos = 0;
+        while (pos < o.size()) {
+            size_t next = o.find(' ', pos);
+            if (next == std::string::npos) {
+                next = o.size();
+            }
+            const std::string stok = o.substr(pos, next - pos);
+            toks.push_back(std::stoi(stok));
+            pos = next + 1;
+        }
+
+        tests[s] = toks;
+    }
+
+    return tests;
+}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
if (argc < 2) {
|
||||||
|
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string fname = argv[1];
|
||||||
|
|
||||||
|
const std::string fname_inp = fname + ".inp";
|
||||||
|
const std::string fname_out = fname + ".out";
|
||||||
|
|
||||||
|
std::string fname_text;
|
||||||
|
if (argc > 2) {
|
||||||
|
fname_text = argv[2];
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
|
||||||
|
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
|
||||||
|
llama_backend_init();
|
||||||
|
|
||||||
|
// load the vocab
|
||||||
|
{
|
||||||
|
auto mparams = llama_model_default_params();
|
||||||
|
|
||||||
|
mparams.vocab_only = true;
|
||||||
|
|
||||||
|
model = llama_load_model_from_file(fname.c_str(), mparams);
|
||||||
|
|
||||||
|
if (model == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto cparams = llama_context_default_params();
|
||||||
|
|
||||||
|
ctx = llama_new_context_with_model(model, cparams);
|
||||||
|
|
||||||
|
if (ctx == NULL) {
|
||||||
|
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
|
||||||
|
llama_free_model(model);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef _WIN32
|
||||||
|
// We need this for unicode console support
|
||||||
|
console::init(false, false);
|
||||||
|
atexit([]() { console::cleanup(); });
|
||||||
|
#endif
|
||||||
|
|
||||||
|
bool success = true;
|
||||||
|
|
||||||
|
const auto k_tests = read_tests(fname_inp, fname_out);
|
||||||
|
|
||||||
|
if (k_tests.empty()) {
|
||||||
|
fprintf(stderr, "%s : error: no tests found\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const bool add_special = false;
|
||||||
|
|
||||||
|
for (const auto & test_kv : k_tests) {
|
||||||
|
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special);
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
printf("src: '%s'\n", test_kv.first.c_str());
|
||||||
|
printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
|
||||||
|
printf("tok: ");
|
||||||
|
for (const auto & tok : res) {
|
||||||
|
printf("%d ", tok);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
bool correct = res.size() == test_kv.second.size();
|
||||||
|
for (int i = 0; i < (int) res.size() && correct; ++i) {
|
||||||
|
if (test_kv.second[i] != res[i]) {
|
||||||
|
correct = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!correct) {
|
||||||
|
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
|
||||||
|
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
|
||||||
|
llama_detokenize_bpe(ctx, res).c_str(),
|
||||||
|
llama_detokenize_bpe(ctx, test_kv.second).c_str());
|
||||||
|
fprintf(stderr, "%s : expected tokens: ", __func__);
|
||||||
|
for (const auto & t : test_kv.second) {
|
||||||
|
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "%s : got tokens: ", __func__);
|
||||||
|
for (const auto & t : res) {
|
||||||
|
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
|
||||||
|
}
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
|
||||||
|
success = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!fname_text.empty()) {
|
||||||
|
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
|
||||||
|
|
||||||
|
std::string text;
|
||||||
|
{
|
||||||
|
std::ifstream ifs(fname_text);
|
||||||
|
if (!ifs) {
|
||||||
|
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
|
||||||
|
|
||||||
|
const std::vector<llama_token> res = llama_tokenize(ctx, text, add_special);
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
|
||||||
|
|
||||||
|
{
|
||||||
|
const std::string fname_out = fname_text + ".tokcpp";
|
||||||
|
|
||||||
|
std::ofstream ofs(fname_out);
|
||||||
|
if (!ofs) {
|
||||||
|
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const auto & tok : res) {
|
||||||
|
ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_free_model(model);
|
||||||
|
llama_free(ctx);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
printf("Tests %s\n", success ? "passed" : "failed");
|
||||||
|
|
||||||
|
return success ? 0 : 3;
|
||||||
|
}
|
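For orientation (this note is editorial, not part of the patch): read_tests() above implies a simple on-disk format for the vocab test pairs. A hypothetical miniature example, with token ids borrowed from the commented-out k_tests table above; the file name is illustrative only:

// ggml-vocab-example.gguf.inp — test strings joined by the separator:
//
//   Hello world
//   __ggml_vocab_test__
//    Hello world
//
// ggml-vocab-example.gguf.out — one line of space-separated token ids per string:
//
//   9906 1917
//   22691 1917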
@ -12,7 +12,7 @@
 #include <thread>
 #include <vector>

-int main(int argc, char **argv) {
+int main(int argc, char ** argv) {
     if (argc < 2) {
         fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
         return 1;
@ -1,4 +1,4 @@
 #include "unicode-data.h"

 #include <cstdint>
 #include <map>
487
unicode.cpp
@ -5,11 +5,14 @@
 #include <cstddef>
 #include <cstdint>
 #include <map>
+#include <regex>
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
 #include <utility>
 #include <vector>
+#include <locale>
+#include <codecvt>

 static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
     std::string result;
@ -53,23 +56,22 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
         offset += 4;
         return result;
     }
-    throw std::invalid_argument("invalid string");
+    throw std::invalid_argument("failed to convert utf8 to codepoint");
 }

-static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
-    std::vector<uint16_t> result;
-    if (/* 0x0000 <= cp && */ cp <= 0xffff) {
-        result.emplace_back(cp);
-    }
-    else if (0x10000 <= cp && cp <= 0x10ffff) {
-        result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
-        result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
-    }
-    else {
-        throw std::invalid_argument("invalid cpt");
-    }
-    return result;
-}
+//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
+//    std::vector<uint16_t> result;
+//    if (/* 0x0000 <= cp && */ cp <= 0xffff) {
+//        result.emplace_back(cp);
+//        return result;
+//    }
+//    if (0x10000 <= cp && cp <= 0x10ffff) {
+//        result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
+//        result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
+//        return result;
+//    }
+//    throw std::invalid_argument("failed to convert codepoint to utf16");
+//}

 //static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) {
 //    std::vector<uint16_t> result;
@ -80,28 +82,28 @@ static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
 //    return result;
 //}

-static uint32_t cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
-    assert(offset < utf16.size());
-    if (((utf16[0] >> 10) << 10) != 0xd800) {
-        auto result = utf16[offset + 0];
-        offset += 1;
-        return result;
-    }
-
-    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
-        throw std::invalid_argument("invalid character");
-    }
-
-    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
-    offset += 2;
-    return result;
-}
+//static uint32_t unicode_cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
+//    assert(offset < utf16.size());
+//    if (((utf16[0] >> 10) << 10) != 0xd800) {
+//        auto result = utf16[offset + 0];
+//        offset += 1;
+//        return result;
+//    }
+//
+//    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
+//        throw std::invalid_argument("invalid character");
+//    }
+//
+//    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
+//    offset += 2;
+//    return result;
+//}

 //static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) {
 //    std::vector<uint32_t> result;
 //    size_t offset = 0;
 //    while (offset < utf16.size()) {
-//        result.push_back(cpt_from_utf16(utf16, offset));
+//        result.push_back(unicode_cpt_from_utf16(utf16, offset));
 //    }
 //    return result;
 //}
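Editorial note: the UTF-16 helpers survive only as comments now, but the surrogate-pair arithmetic they contain is the standard one and is easy to sanity-check. Worked example for U+1F600 (😀), following the commented formulas above:

// cp = 0x1F600, cp - 0x10000 = 0xF600
// high surrogate: 0xd800 | (0xF600 >> 10)    = 0xd800 | 0x3d  = 0xd83d
// low  surrogate: 0xdc00 | (0xF600 & 0x03ff) = 0xdc00 | 0x200 = 0xde00
// so U+1F600 encodes to the UTF-16 pair 0xd83d 0xde00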
@ -194,34 +196,277 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
     return map;
 }

+static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
+    return conv.from_bytes(s);
+}
+
+static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
+    std::vector<std::string> bpe_encoded_words;
+    for (const auto & word : bpe_words) {
+        std::string text_utf;
+        auto utf_word = unicode_cpts_from_utf8(word);
+        for (size_t i = 0; i < utf_word.size(); ++i) {
+            text_utf += unicode_cpt_to_utf8(utf_word[i]);
+        }
+
+        std::string encoded_token;
+        for (char & c : text_utf) {
+            encoded_token += unicode_byte_to_utf8(c);
+        }
+        bpe_encoded_words.emplace_back(encoded_token);
+    }
+    return bpe_encoded_words;
+}
+
+// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
+static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets; // store the offset of each word
+    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
+
+    size_t start = 0;
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+
+    for (auto offset : offsets) {
+        std::string token;
+
+        bool collecting_numeric = false;
+        bool collecting_letter = false;
+        bool collecting_special = false;
+        bool collecting_whitespace_lookahead = false;
+        bool collecting = false;
+
+        std::vector<std::string> text_utf;
+        text_utf.reserve(offset);
+
+        for (size_t i = start; i < start + offset; ++i) {
+            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
+        }
+
+        for (int i = 0; i < (int)text_utf.size(); i++) {
+            const std::string & utf_char = text_utf[i];
+            bool split_condition = false;
+            int bytes_remain = text_utf.size() - i;
+
+            // forward backward lookups
+            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
+            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
+
+            // handling contractions
+            if (!split_condition && bytes_remain >= 2) {
+                // 's|'t|'m|'d
+                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    if (token.size()) {
+                        bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                    }
+                    token = utf_char + utf_char_next;
+                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                    token = "";
+                    i++;
+                    continue;
+                }
+            }
+            if (!split_condition && bytes_remain >= 3) {
+                // 're|'ve|'ll
+                if (utf_char == "\'" && (
+                    (utf_char_next == "r" && utf_char_next_next == "e") ||
+                    (utf_char_next == "v" && utf_char_next_next == "e") ||
+                    (utf_char_next == "l" && utf_char_next_next == "l"))
+                    ) {
+                    split_condition = true;
+                }
+                if (split_condition) {
+                    // current token + next token can be defined
+                    if (token.size()) {
+                        bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                    }
+                    token = utf_char;
+                    token += utf_char_next;
+                    token += utf_char_next_next;
+
+                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                    token = "";
+                    i += 2;
+                    continue;
+                }
+            }
+
+            if (!split_condition && !collecting) {
+                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
+                    collecting_letter = true;
+                    collecting = true;
+                }
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
+                    collecting_numeric = true;
+                    collecting = true;
+                }
+                else if (
+                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
+                    (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
+                    ) {
+                    collecting_special = true;
+                    collecting = true;
+                }
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
+                    collecting_whitespace_lookahead = true;
+                    collecting = true;
+                }
+                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
+                    split_condition = true;
+                }
+            }
+            else if (!split_condition && collecting) {
+                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
+                    split_condition = true;
+                }
+                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) {
+                    split_condition = true;
+                }
+                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
+                    split_condition = true;
+                }
+                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) {
+                    split_condition = true;
+                }
+            }
+
+            if (utf_char_next == "") {
+                split_condition = true; // final
+                token += utf_char;
+            }
+
+            if (split_condition) {
+                if (token.size()) {
+                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
+                }
+                token = utf_char;
+                collecting = false;
+                collecting_letter = false;
+                collecting_numeric = false;
+                collecting_special = false;
+                collecting_whitespace_lookahead = false;
+            }
+            else {
+                token += utf_char;
+            }
+        }
+
+        start += offset;
+    }
+
+    return bpe_offsets;
+}
+
+// use std::wregex to split the text
+static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
+    std::wregex expr(regex_expr);
+    std::vector<size_t> bpe_offsets; // store the offset of each word
+    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
+    size_t start = 0;
+    for (auto offset : offsets) {
+        std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr);
+        std::wcregex_iterator end;
+
+        int64_t start_idx = 0;
+        while (it != end) {
+            std::wcmatch match = *it;
+            if (match.position() > start_idx) {
+                bpe_offsets.emplace_back(match.position() - start_idx);
+            }
+            bpe_offsets.emplace_back(match.length());
+            start_idx = match.position() + match.length();
+            ++it;
+        }
+
+        if (start_idx < (int64_t) offset) {
+            bpe_offsets.emplace_back(offset - start_idx);
+        }
+        start += offset;
+    }
+
+    return bpe_offsets;
+}
+
+// use std::regex to split the text
+static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
+    std::regex expr(regex_expr);
+    std::vector<size_t> bpe_offsets; // store the offset of each word
+    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
+    size_t start = 0;
+    for (auto offset : offsets) {
+        std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr);
+        std::cregex_iterator end;
+
+        int64_t start_idx = 0;
+        while (it != end) {
+            std::cmatch match = *it;
+            if (match.position() > start_idx) {
+                bpe_offsets.emplace_back(match.position() - start_idx);
+            }
+            bpe_offsets.emplace_back(match.length());
+            start_idx = match.position() + match.length();
+            ++it;
+        }
+
+        if (start_idx < (int64_t) offset) {
+            bpe_offsets.emplace_back(offset - start_idx);
+        }
+        start += offset;
+    }
+
+    return bpe_offsets;
+}
+
+static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets;
+
+    (void)(text);
+    (void)(regex_expr);
+    (void)(offsets);
+    // TODO: this implementation is actually wrong, uncomment and run:
+    //       make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
+    //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
+    //    bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
+    //}
+
+    return bpe_offsets;
+}
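Editorial note: the offsets convention shared by all the splitters above is worth spelling out. Each pass receives the lengths of the segments produced by the previous pass and subdivides each segment independently, so several regexes compose without re-scanning already-matched text. A minimal sketch of that composition, assuming only the standard library; `refine` and the sample text are illustrative names, not part of unicode.cpp:

#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// split each existing segment of `text` at matches of `expr`, returning the
// refined list of segment lengths (same scheme as unicode_regex_split_stl)
static std::vector<size_t> refine(const std::string & text, const std::regex & expr,
                                  const std::vector<size_t> & offsets) {
    std::vector<size_t> res;
    size_t start = 0;
    for (auto offset : offsets) {
        std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr);
        std::cregex_iterator end;
        int64_t start_idx = 0;
        while (it != end) {
            if (it->position() > start_idx) {
                res.push_back(it->position() - start_idx); // gap before the match
            }
            res.push_back(it->length());                   // the match itself
            start_idx = it->position() + it->length();
            ++it;
        }
        if (start_idx < (int64_t) offset) {
            res.push_back(offset - start_idx);             // tail after the last match
        }
        start += offset;
    }
    return res;
}

int main() {
    const std::string text = "abc 123 def";
    std::vector<size_t> offsets = { text.size() };          // start with one segment
    offsets = refine(text, std::regex("[0-9]+"), offsets);  // split out digit runs
    size_t start = 0;
    for (auto off : offsets) {
        printf("'%s'\n", text.substr(start, off).c_str());  // 'abc ', '123', ' def'
        start += off;
    }
    return 0;
}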
 //
 // interface
 //

 std::string unicode_cpt_to_utf8(uint32_t cp) {
     std::string result;

     if (/* 0x00 <= cp && */ cp <= 0x7f) {
         result.push_back(cp);
+        return result;
     }
-    else if (0x80 <= cp && cp <= 0x7ff) {
+    if (0x80 <= cp && cp <= 0x7ff) {
         result.push_back(0xc0 | ((cp >> 6) & 0x1f));
         result.push_back(0x80 | (cp & 0x3f));
+        return result;
     }
-    else if (0x800 <= cp && cp <= 0xffff) {
+    if (0x800 <= cp && cp <= 0xffff) {
         result.push_back(0xe0 | ((cp >> 12) & 0x0f));
         result.push_back(0x80 | ((cp >> 6) & 0x3f));
         result.push_back(0x80 | (cp & 0x3f));
+        return result;
     }
-    else if (0x10000 <= cp && cp <= 0x10ffff) {
+    if (0x10000 <= cp && cp <= 0x10ffff) {
         result.push_back(0xf0 | ((cp >> 18) & 0x07));
         result.push_back(0x80 | ((cp >> 12) & 0x3f));
         result.push_back(0x80 | ((cp >> 6) & 0x3f));
         result.push_back(0x80 | (cp & 0x3f));
+        return result;
     }
-    else {
-        throw std::invalid_argument("invalid codepoint");
-    }
-    return result;
+
+    throw std::invalid_argument("invalid codepoint");
 }

 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
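Editorial note: a quick trace of the restructured unicode_cpt_to_utf8 above, for U+00E9 ('é'), which takes the two-byte branch:

// cp = 0xE9 falls in [0x80, 0x7ff]:
//   byte 0: 0xc0 | ((0xE9 >> 6) & 0x1f) = 0xc0 | 0x03 = 0xc3
//   byte 1: 0x80 | ( 0xE9       & 0x3f) = 0x80 | 0x29 = 0xa9
// so unicode_cpt_to_utf8(0xE9) returns "\xC3\xA9", the UTF-8 encoding of 'é'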
@ -275,3 +520,167 @@ char32_t unicode_tolower(char32_t cp) {
     auto it = unicode_map_lowercase.find(cp);
     return it == unicode_map_lowercase.end() ? cp : it->second;
 }
+
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
+    // unicode categories
+    static const std::map<std::string, int> k_ucat_enum = {
+        { "\\p{N}", CODEPOINT_TYPE_DIGIT },
+        { "\\p{L}", CODEPOINT_TYPE_LETTER },
+        { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION },
+    };
+
+    static const std::map<int, int> k_ucat_cpt = {
+        { CODEPOINT_TYPE_DIGIT,       0xD1 },
+        { CODEPOINT_TYPE_LETTER,      0xD2 },
+        { CODEPOINT_TYPE_PUNCTUATION, 0xD3 },
+    };
+
+    static const std::map<int, std::string> k_ucat_map = {
+        { CODEPOINT_TYPE_DIGIT,       "\x30-\x39" },          // 0-9
+        { CODEPOINT_TYPE_LETTER,      "\x41-\x5A\x61-\x7A" }, // A-Za-z
+        { CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
+    };
+
+    // compute collapsed codepoints only if needed by at least one regex
+    bool need_collapse = false;
+    for (auto & regex_expr : regex_exprs) {
+        // search for unicode categories
+        for (const auto & ucat : k_ucat_enum) {
+            if (std::string::npos != regex_expr.find(ucat.first)) {
+                need_collapse = true;
+                break;
+            }
+        }
+    }
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+
+    // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
+    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
+    std::string text_collapsed;
+    if (need_collapse) {
+        // collapse all unicode categories
+        text_collapsed.resize(cpts.size());
+
+        for (size_t i = 0; i < cpts.size(); ++i) {
+            // keep single-byte codepoints as is
+            if (cpts[i] < 128) {
+                text_collapsed[i] = cpts[i];
+                continue;
+            }
+
+            const int cpt_type = unicode_cpt_type(cpts[i]);
+
+            if (k_ucat_cpt.find(cpt_type) != k_ucat_cpt.end()) {
+                text_collapsed[i] = k_ucat_cpt.at(cpt_type);
+            } else {
+                text_collapsed[i] = (char) 0xD0; // fallback
+            }
+        }
+    }
+
+    std::vector<size_t> bpe_offsets = { cpts.size() };
+
+    for (auto & regex_expr : regex_exprs) {
+        // first, see if we have an efficient custom regex implementation
+        auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
+
+        if (!tmp.empty()) {
+            bpe_offsets = std::move(tmp);
+            continue;
+        }
+
+        // fallback to general-purpose std::regex / std::wregex
+        try {
+            // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
+            // with the corresponding collapsed representation
+            bool use_collapsed = false;
+            for (auto & ucat : k_ucat_enum) {
+                if (std::string::npos != regex_expr.find(ucat.first)) {
+                    use_collapsed = true;
+                    break;
+                }
+            }
+
+            if (use_collapsed) {
+                // sanity-check that the original regex does not contain any non-ASCII characters
+                const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
+                for (size_t i = 0; i < cpts_regex.size(); ++i) {
+                    if (cpts_regex[i] >= 128) {
+                        throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
+                    }
+                }
+
+                // generate a collapsed representation of the regex
+                std::string regex_expr_collapsed;
+
+                // track if we are inside [], because nested [] are not allowed
+                bool inside = false;
+                for (size_t i = 0; i < regex_expr.size(); ++i) {
+                    if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) {
+                        regex_expr_collapsed += '[';
+                        inside = true;
+                        continue;
+                    }
+
+                    if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') {
+                        regex_expr_collapsed += ']';
+                        inside = false;
+                        continue;
+                    }
+
+                    if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
+                        regex_expr[i + 1] == 'p' &&
+                        regex_expr[i + 2] == '{' &&
+                        regex_expr[i + 4] == '}') {
+                        const std::string pat = regex_expr.substr(i, 5);
+                        if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
+                            if (!inside) {
+                                regex_expr_collapsed += '[';
+                            }
+                            regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
+                            regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
+                            if (!inside) {
+                                regex_expr_collapsed += ']';
+                            }
+                            i += 4;
+                            continue;
+                        }
+                    }
+
+                    regex_expr_collapsed += regex_expr[i];
+                }
+
+                //printf("text_collapsed: %s\n", text_collapsed.c_str());
+                //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str());
+                bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
+            } else {
+                // no unicode category used, we can use std::wregex directly
+                const std::wstring wtext = unicode_wstring_from_utf8(text);
+                const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
+
+                //printf("text: %s\n", text.c_str());
+                //printf("regex_expr: %s\n", regex_expr.c_str());
+                bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
+            }
+        } catch (std::regex_error & e) {
+            fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());
+            fprintf(stderr, "Regex error: %s\n", e.what());
+            throw std::runtime_error("Failed to process regex");
+        }
+    }
+
+    std::vector<std::string> bpe_words;
+    bpe_words.reserve(bpe_offsets.size()); // reserve memory for the approximate size
+
+    size_t start = 0;
+    for (size_t & offset : bpe_offsets) {
+        bpe_words.emplace_back();
+        for (size_t i = start; i < start + offset; ++i) {
+            bpe_words.back() += unicode_cpt_to_utf8(cpts[i]);
+        }
+        start += offset;
+    }
+
+    return unicode_byte_encoding_process(bpe_words);
+}
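Editorial note: the collapsing scheme is the heart of unicode_regex_split, so a concrete (hypothetical) trace may help. The byte values come from the k_ucat_cpt / k_ucat_map tables above:

// input text:     "héllo 123"
//   'h','l','l','o',' ','1','2','3' are codepoints < 128 -> copied as-is
//   'é' is a letter codepoint >= 128                     -> collapsed to 0xD2
// text_collapsed: "h\xD2llo 123"
//
// a regex such as " ?\p{L}+| ?\p{N}+" is rewritten category-by-category into
// [<collapse byte><ASCII range>]:
//   \p{L} -> [\xD2A-Za-z]        \p{N} -> [\xD10-9]
// the rewritten regex then matches "h\xD2llo" and " 123" in the collapsed
// text, and the resulting offsets are mapped back onto the original codepoints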
@ -24,5 +24,6 @@ int unicode_cpt_type(const std::string & utf8);
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);

+// simple tolower that only implements one-to-one mapping, not one-to-many
 char32_t unicode_tolower(char32_t cp);

+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
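Editorial note: to see the new entry point end to end, a minimal usage sketch (assuming the declarations above live in unicode.h, as the file layout suggests; the regex is the GPT-2 pretokenizer pattern quoted in unicode.cpp, and this main() is illustrative only):

#include <cstdio>
#include <string>
#include <vector>

#include "unicode.h"

int main() {
    // GPT-2 style pretokenization: contractions, letter runs, number runs,
    // other symbols, and whitespace
    const std::vector<std::string> regex_exprs = {
        "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
    };

    for (const auto & piece : unicode_regex_split("Hello, y'all! 123", regex_exprs)) {
        // pieces come back byte-encoded (see unicode_byte_encoding_process),
        // so a leading space prints as its byte-encoded stand-in rather than
        // a literal space
        printf("'%s'\n", piece.c_str());
    }
    return 0;
}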