From 4810ab1aa10709cf396dd2bd370ede9b3cac77c8 Mon Sep 17 00:00:00 2001
From: Wenjing Yu
Date: Fri, 26 Jul 2024 16:38:13 -0700
Subject: [PATCH] remove tests

---
 CMakeLists.txt                        |    6 -
 tests/.gitignore                      |    4 -
 tests/CMakeLists.txt                  |  137 --
 tests/get-model.cpp                   |   21 -
 tests/get-model.h                     |    2 -
 tests/run-json-schema-to-grammar.mjs  |   10 -
 tests/test-autorelease.cpp            |   24 -
 tests/test-backend-ops.cpp            | 2557 -------------------------
 tests/test-c.c                        |    7 -
 tests/test-chat-template.cpp          |  177 --
 tests/test-double-float.cpp           |   57 -
 tests/test-grad0.cpp                  | 1566 ---------------
 tests/test-grammar-integration.cpp    | 1325 -------------
 tests/test-grammar-parser.cpp         |  515 -----
 tests/test-json-schema-to-grammar.cpp | 1273 ------------
 tests/test-llama-grammar.cpp          |  408 ----
 tests/test-model-load-cancel.cpp      |   27 -
 tests/test-opt.cpp                    |  181 --
 tests/test-quantize-fns.cpp           |  185 --
 tests/test-quantize-perf.cpp          |  363 ----
 tests/test-rope.cpp                   |  220 ---
 tests/test-sampling.cpp               |  301 ---
 tests/test-tokenizer-0.cpp            |  292 ---
 tests/test-tokenizer-0.py             |   46 -
 tests/test-tokenizer-0.sh             |   41 -
 tests/test-tokenizer-1-bpe.cpp        |  152 --
 tests/test-tokenizer-1-spm.cpp        |  122 --
 tests/test-tokenizer-random.py        |  566 ------
 28 files changed, 10585 deletions(-)
 delete mode 100644 tests/.gitignore
 delete mode 100644 tests/CMakeLists.txt
 delete mode 100644 tests/get-model.cpp
 delete mode 100644 tests/get-model.h
 delete mode 100644 tests/run-json-schema-to-grammar.mjs
 delete mode 100644 tests/test-autorelease.cpp
 delete mode 100644 tests/test-backend-ops.cpp
 delete mode 100644 tests/test-c.c
 delete mode 100644 tests/test-chat-template.cpp
 delete mode 100644 tests/test-double-float.cpp
 delete mode 100644 tests/test-grad0.cpp
 delete mode 100644 tests/test-grammar-integration.cpp
 delete mode 100644 tests/test-grammar-parser.cpp
 delete mode 100755 tests/test-json-schema-to-grammar.cpp
 delete mode 100644 tests/test-llama-grammar.cpp
 delete mode 100644 tests/test-model-load-cancel.cpp
 delete mode 100644 tests/test-opt.cpp
 delete mode 100644 tests/test-quantize-fns.cpp
 delete mode 100644 tests/test-quantize-perf.cpp
 delete mode 100644 tests/test-rope.cpp
 delete mode 100644 tests/test-sampling.cpp
 delete mode 100644 tests/test-tokenizer-0.cpp
 delete mode 100644 tests/test-tokenizer-0.py
 delete mode 100755 tests/test-tokenizer-0.sh
 delete mode 100644 tests/test-tokenizer-1-bpe.cpp
 delete mode 100644 tests/test-tokenizer-1-spm.cpp
 delete mode 100644 tests/test-tokenizer-random.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 793709122..b951c58e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,7 +63,6 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 
 # extra artifacts
-option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 
@@ -189,11 +188,6 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
 
 add_subdirectory(common)
 
-if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    include(CTest)
-    add_subdirectory(tests)
-endif ()
-
 if (LLAMA_BUILD_EXAMPLES)
     add_subdirectory(examples)
     add_subdirectory(pocs)
diff --git a/tests/.gitignore b/tests/.gitignore
deleted file mode 100644
index 620a48ee4..000000000
--- a/tests/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-*
-!*.*
-*.o
-ggml-common.h
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
deleted file mode 100644
index 0207e3a59..000000000
--- a/tests/CMakeLists.txt
+++ /dev/null
@@ -1,137 +0,0 @@
-function(llama_test target)
-    include(CMakeParseArguments)
-    set(options)
-    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
-    set(multiValueArgs ARGS)
-    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    if (NOT DEFINED LLAMA_TEST_LABEL)
-        set(LLAMA_TEST_LABEL "main")
-    endif()
-    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
-        set(LLAMA_TEST_WORKING_DIRECTORY .)
-    endif()
-    if (DEFINED LLAMA_TEST_NAME)
-        set(TEST_NAME ${LLAMA_TEST_NAME})
-    else()
-        set(TEST_NAME ${target})
-    endif()
-
-    set(TEST_TARGET ${target})
-
-    add_test(
-        NAME ${TEST_NAME}
-        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
-        COMMAND $<TARGET_FILE:${TEST_TARGET}>
-        ${LLAMA_TEST_ARGS})
-
-    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
-endfunction()
-
-# Builds and runs a test source file.
-# Optional args:
-# - NAME: name of the executable & test target (defaults to the source file name without extension)
-# - LABEL: label for the test (defaults to main)
-# - ARGS: arguments to pass to the test executable
-# - WORKING_DIRECTORY
-function(llama_target_and_test source)
-    include(CMakeParseArguments)
-    set(options)
-    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
-    set(multiValueArgs ARGS)
-    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-
-    if (NOT DEFINED LLAMA_TEST_LABEL)
-        set(LLAMA_TEST_LABEL "main")
-    endif()
-    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
-        set(LLAMA_TEST_WORKING_DIRECTORY .)
-    endif()
-    if (DEFINED LLAMA_TEST_NAME)
-        set(TEST_TARGET ${LLAMA_TEST_NAME})
-    else()
-        get_filename_component(TEST_TARGET ${source} NAME_WE)
-    endif()
-
-    add_executable(${TEST_TARGET} ${source} get-model.cpp)
-    install(TARGETS ${TEST_TARGET} RUNTIME)
-    target_link_libraries(${TEST_TARGET} PRIVATE common)
-    add_test(
-        NAME ${TEST_TARGET}
-        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
-        COMMAND $<TARGET_FILE:${TEST_TARGET}>
-        ${LLAMA_TEST_ARGS})
-
-    set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
-endfunction()
-
-# build test-tokenizer-0 target once and add many tests
-add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
-target_link_libraries(test-tokenizer-0 PRIVATE common)
-install(TARGETS test-tokenizer-0 RUNTIME)
-
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
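
Note: the contract these helpers set up is simple: llama_test registers an already-built executable with CTest and forwards ARGS on the command line, so each vocab file above becomes one test invocation, and pass/fail is just the process exit code. A minimal sketch of a binary satisfying that contract (the run_checks body is hypothetical, not from this patch):

    #include <cstdio>

    // hypothetical check; the real tokenizer tests compare token ids
    // produced for each vocab against expected sequences
    static bool run_checks(const char * vocab_path) {
        fprintf(stdout, "testing vocab: %s\n", vocab_path);
        return true;
    }

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <vocab.gguf>\n", argv[0]);
            return 1; // non-zero exit marks the CTest case as failed
        }
        return run_checks(argv[1]) ? 0 : 1;
    }
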
-
-# build test-tokenizer-1-bpe target once and add many tests
-add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
-target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
-install(TARGETS test-tokenizer-1-bpe RUNTIME)
-
-# TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-
-# build test-tokenizer-1-spm target once and add many tests
-add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
-target_link_libraries(test-tokenizer-1-spm PRIVATE common)
-install(TARGETS test-tokenizer-1-spm RUNTIME)
-
-llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
-#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
-
-# llama_target_and_test(test-double-float.cpp) # SLOW
-llama_target_and_test(test-quantize-fns.cpp)
-llama_target_and_test(test-quantize-perf.cpp)
-llama_target_and_test(test-sampling.cpp)
-llama_target_and_test(test-chat-template.cpp)
-
-llama_target_and_test(test-grammar-parser.cpp)
-llama_target_and_test(test-llama-grammar.cpp)
-llama_target_and_test(test-grammar-integration.cpp)
-llama_target_and_test(test-grad0.cpp)
-# llama_target_and_test(test-opt.cpp) # SLOW
-llama_target_and_test(test-backend-ops.cpp)
-
-llama_target_and_test(test-rope.cpp)
-
-llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
-llama_target_and_test(test-autorelease.cpp LABEL "model")
-
-# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
-if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
-    llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
-    target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
-endif()
-
-# dummy executable - not installed
-get_filename_component(TEST_TARGET test-c.c NAME_WE)
-add_executable(${TEST_TARGET} test-c.c)
-target_link_libraries(${TEST_TARGET} PRIVATE llama)
diff --git a/tests/get-model.cpp b/tests/get-model.cpp
deleted file mode 100644
index 4edb685f0..000000000
--- a/tests/get-model.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-
-#include "get-model.h"
-
-char * get_model_or_exit(int argc, char *argv[]) {
-    char * model_path;
-    if (argc > 1) {
-        model_path = argv[1];
-
-    } else {
-        model_path = getenv("LLAMACPP_TEST_MODELFILE");
-        if (!model_path || strlen(model_path) == 0) {
-            fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test. Set LLAMACPP_TEST_MODELFILE=<gguf model path> to silence this warning and run this test.\n\033[0m");
-            exit(EXIT_SUCCESS);
-        }
-    }
-
-    return model_path;
-}
diff --git a/tests/get-model.h b/tests/get-model.h
deleted file mode 100644
index 81a3a0fef..000000000
--- a/tests/get-model.h
+++ /dev/null
@@ -1,2 +0,0 @@
-#pragma once
-char * get_model_or_exit(int, char*[]);
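
Note: get_model_or_exit gives every model-dependent test the same escape hatch: argv[1] wins, otherwise the LLAMACPP_TEST_MODELFILE environment variable, otherwise exit with success so unconfigured CI stays green. A hypothetical caller, assuming only the llama.h entry points already used elsewhere in this patch:

    #include <cstdio>
    #include "llama.h"
    #include "get-model.h"

    int main(int argc, char ** argv) {
        char * model_path = get_model_or_exit(argc, argv); // exits(EXIT_SUCCESS) if no model is configured

        llama_backend_init();
        llama_model * model = llama_load_model_from_file(model_path, llama_model_default_params());
        if (model == nullptr) {
            fprintf(stderr, "failed to load: %s\n", model_path);
            return 1;
        }
        // ... exercise the model here (hypothetical) ...
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }
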
diff --git a/tests/run-json-schema-to-grammar.mjs b/tests/run-json-schema-to-grammar.mjs
deleted file mode 100644
index 71bf62ed3..000000000
--- a/tests/run-json-schema-to-grammar.mjs
+++ /dev/null
@@ -1,10 +0,0 @@
-import { readFileSync } from "fs"
-import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs"
-
-const [, , file] = process.argv
-const url = `file://${file}`
-let schema = JSON.parse(readFileSync(file, "utf8"));
-const converter = new SchemaConverter({})
-schema = await converter.resolveRefs(schema, url)
-converter.visit(schema, '')
-console.log(converter.formatGrammar())
diff --git a/tests/test-autorelease.cpp b/tests/test-autorelease.cpp
deleted file mode 100644
index 57fa00011..000000000
--- a/tests/test-autorelease.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-// ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763
-
-#include <cstdio>
-#include <string>
-#include <thread>
-
-#include "llama.h"
-#include "get-model.h"
-
-// This creates a new context inside a pthread and then tries to exit cleanly.
-int main(int argc, char ** argv) {
-    auto * model_path = get_model_or_exit(argc, argv);
-
-    std::thread([&model_path]() {
-        llama_backend_init();
-        auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
-        auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
-        llama_free(ctx);
-        llama_free_model(model);
-        llama_backend_free();
-    }).join();
-
-    return 0;
-}
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
deleted file mode 100644
index 2c03c60d4..000000000
--- a/tests/test-backend-ops.cpp
+++ /dev/null
@@ -1,2557 +0,0 @@
-#include <ggml.h>
-#include <ggml-alloc.h>
-#include <ggml-backend.h>
-
-#include <algorithm>
-#include <array>
-#include <cfloat>
-#include <cstring>
-#include <cinttypes>
-#include <memory>
-#include <random>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string>
-#include <thread>
-#include <vector>
-
-
-static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
-    // static RNG initialization (revisit if n_threads stops being constant)
-    static const size_t n_threads = std::thread::hardware_concurrency();
-    static std::vector<std::default_random_engine> generators = []() {
-        std::random_device rd;
-        std::vector<std::default_random_engine> vec;
-        vec.reserve(n_threads);
-        //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
-        for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
-        return vec;
-    }();
-
-    size_t size = ggml_nelements(tensor);
-    std::vector<float> data(size);
-
-    auto init_thread = [&](size_t ith, size_t start, size_t end) {
-        std::uniform_real_distribution<float> distribution(min, max);
-        for (size_t i = start; i < end; i++) {
-            data[i] = distribution(generators[ith]);
-        }
-    };
-
-    std::vector<std::thread> threads;
-    threads.reserve(n_threads);
-    for (size_t i = 0; i < n_threads; i++) {
-        size_t start = i*size/n_threads;
-        size_t end = (i+1)*size/n_threads;
-        threads.emplace_back(init_thread, i, start, end);
-    }
-    for (auto & t : threads) {
-        t.join();
-    }
-
-#if 0
-    const char * val_str = getenv("GGML_TEST_EPS");
-    float val = 1e-9f;
-    if (val_str != nullptr) {
-        val = std::stof(val_str);
-        printf("GGML_TEST_EPS=%e\n", val);
-    }
-
-    // test quantization with very small values that may result in nan scales due to division by zero
-    if (ggml_is_quantized(tensor->type)) {
-        for (int i = 0; i < 256; i++) {
-            data[i] = val;
-        }
-    }
-#endif
-
-    if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
-        ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
-    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
-        GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
-        std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
-        std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
-        const float * im = imatrix.data();
-        if (!ggml_quantize_requires_imatrix(tensor->type)) {
-            // when the imatrix is optional, we want to test both quantization with and without imatrix
-            // use one of the random numbers to decide
-            if (data[0] > 0.5f*(min + max)) {
-                im = nullptr;
-            }
-        }
-
-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im);
-        GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size()));
-        // TODO: other cases
-        //#pragma omp parallel for
-        //for (int i = 0; i < tensor->ne[1]; i++) {
-        //    ggml_quantize_chunk(tensor->type, data.data(), dataq.data(),
-        //        i * tensor->ne[0], 1, tensor->ne[0], im);
-        //}
-
-        ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
-    } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
-        // This is going to create some weird integers though.
-        ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-static std::vector<float> tensor_to_float(const ggml_tensor * t) {
-    std::vector<float> tv;
-    tv.reserve(ggml_nelements(t));
-
-    std::vector<uint8_t> buf(ggml_nbytes(t));
-    ggml_backend_tensor_get(t, buf.data(), 0, ggml_nbytes(t));
-
-    ggml_type_traits_t tt = ggml_internal_get_type_traits(t->type);
-    size_t bs = ggml_blck_size(t->type);
-    std::vector<float> vq(ggml_blck_size(t->type));
-    bool quantized = ggml_is_quantized(t->type);
-
-    // access elements by index to avoid gaps in views
-    for (int64_t i3 = 0; i3 < t->ne[3]; i3++) {
-        for (int64_t i2 = 0; i2 < t->ne[2]; i2++) {
-            for (int64_t i1 = 0; i1 < t->ne[1]; i1++) {
-                for (int64_t i0 = 0; i0 < t->ne[0]; i0 += bs) {
-                    size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
-                    if (t->type == GGML_TYPE_F16) {
-                        tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
-                    } else if (t->type == GGML_TYPE_BF16) {
-                        tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
-                    } else if (t->type == GGML_TYPE_F32) {
-                        tv.push_back(*(float *) &buf[i]);
-                    } else if (t->type == GGML_TYPE_I32) {
-                        tv.push_back((float)*(int32_t *) &buf[i]);
-                    } else if (t->type == GGML_TYPE_I16) {
-                        tv.push_back((float)*(int16_t *) &buf[i]);
-                    } else if (t->type == GGML_TYPE_I8) {
-                        tv.push_back((float)*(int8_t *) &buf[i]);
-                    } else if (quantized) {
-                        tt.to_float(&buf[i], vq.data(), bs);
-                        tv.insert(tv.end(), vq.begin(), vq.end());
-                    } else {
-                        GGML_ASSERT(false);
-                    }
-                }
-            }
-        }
-    }
-
-    return tv;
-}
-
-/*
-static double cosine_similarity(const float * v1, const float * v2, size_t n) {
-    double dot = 0.0;
-    double mag1 = 0.0;
-    double mag2 = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return -1.0f;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        dot  += v1[i]*v2[i];
-        mag1 += v1[i]*v1[i];
-        mag2 += v2[i]*v2[i];
-    }
-
-    return dot/sqrt(mag1*mag2);
-}
-
-static float distance(const float * v1, const float * v2, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v1[i]) || std::isnan(v2[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v1[i]) && std::isinf(v2[i])) {
-            continue;
-        }
-        d += (v1[i] - v2[i])*(v1[i] - v2[i]);
-    }
-
-    return sqrt(d);
-}
-
-static float vec_len(const float * v, size_t n) {
-    double d = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        if (std::isnan(v[i])) {
-            return INFINITY;
-        }
-        if (std::isinf(v[i])) {
-            continue;
-        }
-        d += v[i]*v[i];
-    }
-
-    return sqrt(d);
-}
-*/
-
-// normalized mean squared error = mse(a, b) / mse(a, 0)
-static double nmse(const float * a, const float * b, size_t n) {
-    double mse_a_b = 0.0;
-    double mse_a_0 = 0.0;
-
-    for (size_t i = 0; i < n; i++) {
-        float a_i = a[i];
-        float b_i = b[i];
-
-        mse_a_b += (a_i - b_i) * (a_i - b_i);
-        mse_a_0 += a_i * a_i;
-    }
-
-    return mse_a_b / mse_a_0;
-}
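
Note: the max_nmse_err() thresholds used throughout the harness are easier to read with a worked example of this metric. A standalone check mirroring nmse() above (values illustrative):

    #include <cstdio>

    // normalized MSE, as defined above: mse(a, b) / mse(a, 0)
    static double nmse_ref(const float * a, const float * b, int n) {
        double num = 0.0, den = 0.0;
        for (int i = 0; i < n; i++) {
            num += (a[i] - b[i]) * (a[i] - b[i]);
            den += a[i] * a[i];
        }
        return num / den;
    }

    int main() {
        const float a[] = {1.0f, 2.0f, 3.0f};
        const float b[] = {1.0f, 2.0f, 3.001f};
        // (0.001^2) / 14 ~= 7.1e-8: just under the default 1e-7 threshold
        printf("nmse = %.9g\n", nmse_ref(a, b, 3));
    }
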
"]"; - return s; -} - -//static std::string var_to_str(ggml_unary_op unary_op) { -// return ggml_unary_op_name(unary_op); -//} - -static std::string var_to_str(ggml_type type) { - return ggml_type_name(type); -} - -static std::string var_to_str(ggml_op_pool pool) { - switch (pool) { - case GGML_OP_POOL_AVG: return "avg"; - case GGML_OP_POOL_MAX: return "max"; - default: return std::to_string(pool); - } -} - -#define VARS_TO_STR1(a) VAR_TO_STR(a) -#define VARS_TO_STR2(a, b) VAR_TO_STR(a) + "," + VAR_TO_STR(b) -#define VARS_TO_STR3(a, b, c) VAR_TO_STR(a) + "," + VARS_TO_STR2(b, c) -#define VARS_TO_STR4(a, b, c, d) VAR_TO_STR(a) + "," + VARS_TO_STR3(b, c, d) -#define VARS_TO_STR5(a, b, c, d, e) VAR_TO_STR(a) + "," + VARS_TO_STR4(b, c, d, e) -#define VARS_TO_STR6(a, b, c, d, e, f) VAR_TO_STR(a) + "," + VARS_TO_STR5(b, c, d, e, f) -#define VARS_TO_STR7(a, b, c, d, e, f, g) VAR_TO_STR(a) + "," + VARS_TO_STR6(b, c, d, e, f, g) -#define VARS_TO_STR8(a, b, c, d, e, f, g, h) VAR_TO_STR(a) + "," + VARS_TO_STR7(b, c, d, e, f, g, h) -#define VARS_TO_STR9(a, b, c, d, e, f, g, h, i) VAR_TO_STR(a) + "," + VARS_TO_STR8(b, c, d, e, f, g, h, i) -#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j) -#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k) -#define VARS_TO_STR12(a, b, c, d, e, f, g, h, i, j, k, l) VAR_TO_STR(a) + "," + VARS_TO_STR11(b, c, d, e, f, g, h, i, j, k, l) - -#ifdef GGML_USE_SYCL -static bool inline _isinf(float f) { - return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000; -} -#else -static bool inline _isinf(float f) { return std::isinf(f); } -#endif - -// accept FLT_MAX as infinity -static bool isinf_or_max(float f) { - return _isinf(f) || f == FLT_MAX || f == -FLT_MAX; -} - -static bool ggml_is_view_op(enum ggml_op op) { - return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE; -} - -enum test_mode { - MODE_TEST, - MODE_PERF, -}; - -struct test_case { - virtual ~test_case() {} - - virtual std::string op_desc(ggml_tensor * t) { - return ggml_op_desc(t); - } - - virtual std::string vars() { - return ""; - } - - virtual ggml_tensor * build_graph(ggml_context * ctx) = 0; - - virtual double max_nmse_err() { - return 1e-7; - } - - virtual void initialize_tensors(ggml_context * ctx) { - for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - init_tensor_uniform(t); - } - } - - virtual size_t op_size(ggml_tensor * t) { - size_t size = ggml_nbytes(t); - // add source tensors - for (int i = 0; i < GGML_MAX_SRC; i++) { - if (t->src[i] != NULL) { - size += ggml_nbytes(t->src[i]); - } - } - return size; - } - - ggml_cgraph * gf = nullptr; - - static const int sentinel_size = 1024; - - test_mode mode; - - std::vector sentinels; - - void add_sentinel(ggml_context * ctx) { - if (mode == MODE_PERF) { - return; - } - ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size); - ggml_format_name(sentinel, "sent_%zu", sentinels.size()); - sentinels.push_back(sentinel); - } - - // hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend - - ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) { - ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne); - add_sentinel(ctx); - return t; - } - - ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) { - 
-
-struct test_case {
-    virtual ~test_case() {}
-
-    virtual std::string op_desc(ggml_tensor * t) {
-        return ggml_op_desc(t);
-    }
-
-    virtual std::string vars() {
-        return "";
-    }
-
-    virtual ggml_tensor * build_graph(ggml_context * ctx) = 0;
-
-    virtual double max_nmse_err() {
-        return 1e-7;
-    }
-
-    virtual void initialize_tensors(ggml_context * ctx) {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
-            init_tensor_uniform(t);
-        }
-    }
-
-    virtual size_t op_size(ggml_tensor * t) {
-        size_t size = ggml_nbytes(t);
-        // add source tensors
-        for (int i = 0; i < GGML_MAX_SRC; i++) {
-            if (t->src[i] != NULL) {
-                size += ggml_nbytes(t->src[i]);
-            }
-        }
-        return size;
-    }
-
-    ggml_cgraph * gf = nullptr;
-
-    static const int sentinel_size = 1024;
-
-    test_mode mode;
-
-    std::vector<ggml_tensor *> sentinels;
-
-    void add_sentinel(ggml_context * ctx) {
-        if (mode == MODE_PERF) {
-            return;
-        }
-        ggml_tensor * sentinel = ::ggml_new_tensor_1d(ctx, GGML_TYPE_F32, sentinel_size);
-        ggml_format_name(sentinel, "sent_%zu", sentinels.size());
-        sentinels.push_back(sentinel);
-    }
-
-    // hijack ggml_new_tensor to add sentinels after each tensor to check for overflows in the backend
-
-    ggml_tensor * ggml_new_tensor(ggml_context * ctx, ggml_type type, int n_dims, const int64_t * ne) {
-        ggml_tensor * t = ::ggml_new_tensor(ctx, type, n_dims, ne);
-        add_sentinel(ctx);
-        return t;
-    }
-
-    ggml_tensor * ggml_new_tensor_1d(ggml_context * ctx, ggml_type type, int64_t ne0) {
-        ggml_tensor * t = ::ggml_new_tensor_1d(ctx, type, ne0);
-        add_sentinel(ctx);
-        return t;
-    }
-
-    ggml_tensor * ggml_new_tensor_2d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1) {
-        ggml_tensor * t = ::ggml_new_tensor_2d(ctx, type, ne0, ne1);
-        add_sentinel(ctx);
-        return t;
-    }
-
-    ggml_tensor * ggml_new_tensor_3d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2) {
-        ggml_tensor * t = ::ggml_new_tensor_3d(ctx, type, ne0, ne1, ne2);
-        add_sentinel(ctx);
-        return t;
-    }
-
-    ggml_tensor * ggml_new_tensor_4d(ggml_context * ctx, ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
-        ggml_tensor * t = ::ggml_new_tensor_4d(ctx, type, ne0, ne1, ne2, ne3);
-        add_sentinel(ctx);
-        return t;
-    }
-
-    bool eval(ggml_backend_t backend1, ggml_backend_t backend2, const char * op_name) {
-        mode = MODE_TEST;
-
-        ggml_init_params params = {
-            /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
-            /* .mem_base = */ NULL,
-            /* .no_alloc = */ true,
-        };
-        ggml_context * ctx = ggml_init(params);
-
-        gf = ggml_new_graph(ctx);
-
-        // pre-graph sentinel
-        add_sentinel(ctx);
-
-        ggml_tensor * out = build_graph(ctx);
-
-        if (op_name != nullptr && op_desc(out) != op_name) {
-            //printf("  %s: skipping\n", op_desc(out).c_str());
-            ggml_free(ctx);
-            return true;
-        }
-
-        printf("  %s(%s): ", op_desc(out).c_str(), vars().c_str());
-        fflush(stdout);
-
-        // check if the backends support the ops
-        bool supported = true;
-        for (ggml_backend_t backend : {backend1, backend2}) {
-            for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-                if (!ggml_backend_supports_op(backend, t)) {
-                    printf("not supported [%s] ", ggml_backend_name(backend));
-                    supported = false;
-                    break;
-                }
-            }
-        }
-        if (!supported) {
-            printf("\n");
-            ggml_free(ctx);
-            return true;
-        }
-
-        // post-graph sentinel
-        add_sentinel(ctx);
-
-        // allocate
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1);
-        if (buf == NULL) {
-            printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1));
-            ggml_free(ctx);
-            return false;
-        }
-
-        // build graph
-        ggml_build_forward_expand(gf, out);
-
-        // add sentinels as graph nodes so that they are checked in the callback
-        for (ggml_tensor * sentinel : sentinels) {
-            gf->nodes[gf->n_nodes++] = sentinel;
-        }
-
-        // randomize tensors
-        initialize_tensors(ctx);
-
-        // compare
-        struct callback_userdata {
-            bool   ok;
-            double max_err;
-            ggml_backend_t backend1;
-            ggml_backend_t backend2;
-        };
-
-        callback_userdata ud {
-            true,
-            max_nmse_err(),
-            backend1,
-            backend2
-        };
-
-        auto callback = [](int index, ggml_tensor * t1, ggml_tensor * t2, void * user_data) -> bool {
-            callback_userdata * ud = (callback_userdata *) user_data;
-            const char * bn1 = ggml_backend_name(ud->backend1);
-            const char * bn2 = ggml_backend_name(ud->backend2);
-
-            if (t1->op == GGML_OP_NONE) {
-                // sentinels must be unchanged
-                std::vector<uint8_t> t1_data(ggml_nbytes(t1));
-                std::vector<uint8_t> t2_data(ggml_nbytes(t2));
-                ggml_backend_tensor_get(t1, t1_data.data(), 0, ggml_nbytes(t1));
-                ggml_backend_tensor_get(t2, t2_data.data(), 0, ggml_nbytes(t2));
-
-                if (memcmp(t1_data.data(), t2_data.data(), ggml_nbytes(t1)) != 0) {
-                    printf("sentinel mismatch: %s ", t1->name);
-                    ud->ok = false;
-                    return true;
-                }
-            }
-
-            std::vector<float> f1 = tensor_to_float(t1);
-            std::vector<float> f2 = tensor_to_float(t2);
-
-            for (size_t i = 0; i < f1.size(); i++) {
-                // check for nans
-                if (std::isnan(f1[i]) || std::isnan(f2[i])) {
-                    printf("[%s] NaN at index %zu (%s=%f %s=%f) ", ggml_op_desc(t1), i, bn1, f1[i], bn2, f2[i]);
-                    ud->ok = false;
-                    return true;
-                }
-                // check for infs: both must be inf of the same sign, or both must be finite
-                if (isinf_or_max(f1[i]) || isinf_or_max(f2[i])) {
-                    if (isinf_or_max(f1[i]) && isinf_or_max(f2[i])) {
-                        if (std::signbit(f1[i]) != std::signbit(f2[i])) {
-                            printf("[%s] inf sign mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
-                            ud->ok = false;
-                            return true;
-                        }
-                    } else {
-                        printf("[%s] inf mismatch: %s=%f %s=%f ", ggml_op_desc(t1), bn1, f1[i], bn2, f2[i]);
-                        ud->ok = false;
-                        return true;
-                    }
-                }
-            }
-
-            double err = nmse(f1.data(), f2.data(), f1.size());
-            if (err > ud->max_err) {
-                printf("[%s] NMSE = %.9f > %.9f ", ggml_op_desc(t1), err, ud->max_err);
-                //for (int i = 0; i < (int) f1.size(); i++) {
-                //    printf("%5d %9.6f %9.6f, diff = %9.6f\n", i, f1[i], f2[i], f1[i] - f2[i]);
-                //}
-                //printf("\n");
-                //exit(1);
-                ud->ok = false;
-            }
-            return true;
-
-            GGML_UNUSED(index);
-        };
-
-        const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
-
-        if (!cmp_ok) {
-            printf("compare failed ");
-        }
-
-        ggml_backend_buffer_free(buf);
-
-        ggml_free(ctx);
-
-        if (ud.ok && cmp_ok) {
-            printf("\033[1;32mOK\033[0m\n");
-            return true;
-        }
-
-        printf("\033[1;31mFAIL\033[0m\n");
-        return false;
-    }
-
-    bool eval_perf(ggml_backend_t backend, const char * op_name) {
-        mode = MODE_PERF;
-
-        static const size_t graph_nodes = 8192;
-
-        ggml_init_params params = {
-            /* .mem_size = */ ggml_tensor_overhead()*128 + ggml_graph_overhead_custom(graph_nodes, false),
-            /* .mem_base = */ NULL,
-            /* .no_alloc = */ true,
-        };
-        ggml_context * ctx = ggml_init(params);
-
-        ggml_tensor * out = build_graph(ctx);
-
-        if (op_name != nullptr && op_desc(out) != op_name) {
-            //printf("  %s: skipping\n", op_desc(out).c_str());
-            ggml_free(ctx);
-            return true;
-        }
-
-        int len = printf("  %s(%s): ", op_desc(out).c_str(), vars().c_str());
-        fflush(stdout);
-
-        // check if backends support op
-        if (!ggml_backend_supports_op(backend, out)) {
-            printf("not supported\n");
-            ggml_free(ctx);
-            return true;
-        }
-
-        // align while also leaving some margin for variations in parameters
-        int align = 20;
-        int last = (len + align - 1) / align * align;
-        if (last - len < 5) {
-            last += align;
-        }
-        last = std::max(last, 60);
-        printf("%*s", last - len, "");
-
-        // allocate
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);
-        if (buf == NULL) {
-            printf("failed to allocate tensors\n");
-            ggml_free(ctx);
-            return false;
-        }
-
-        // randomize tensors
-        initialize_tensors(ctx);
-
-        // build graph
-        ggml_cgraph * gf = ggml_new_graph_custom(ctx, graph_nodes, false);
-        ggml_build_forward_expand(gf, out);
-
-        // warmup run
-        ggml_backend_graph_compute(backend, gf);
-
-        // duplicate the op
-        size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
-        for (int i = 1; i < n_runs; i++) {
-            gf->nodes[gf->n_nodes++] = out;
-        }
-
-        // calculate memory
-        size_t mem = n_runs * op_size(out);
-        auto tensor_op_size = [](ggml_tensor * t) {
-            size_t size = ggml_nbytes(t);
-            // add source tensors
-            for (int i = 0; i < GGML_MAX_SRC; i++) {
-                if (t->src[i] != NULL) {
-                    size += ggml_nbytes(t->src[i]);
-                }
-            }
-            return size;
-        };
-        for (int i = 0; i < gf->n_nodes; i++) {
-            if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
-                continue;
-            }
-            mem += tensor_op_size(gf->nodes[i]);
-        }
-
-        // run
-        ggml_backend_synchronize(backend);
-
-        int64_t start_time = ggml_time_us();
-        ggml_backend_graph_compute(backend, gf);
-        ggml_backend_synchronize(backend);
-        int64_t end_time = ggml_time_us();
-        double time_us = end_time - start_time;
-
-        printf(" %5d runs - %8.2f us/run - %8zu kB/run - \033[1;34m%7.2f GB/s\033[0m\n",
-            n_runs,
-            time_us / n_runs,
-            op_size(out) / 1024,
-            mem / (time_us/1e6) / 1024.0 / 1024.0 / 1024.0);
-
-        ggml_backend_buffer_free(buf);
-
-        ggml_free(ctx);
-
-        return true;
-    }
-};
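
Note: the n_runs arithmetic in eval_perf() is easy to sanity-check by hand: the output node is duplicated until roughly target_size bytes of traffic are queued, capped by the free graph slots, and the reported bandwidth is total bytes over elapsed time. With hypothetical numbers:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // GPU case from above: 1ULL << 35 bytes = 32 GiB of traffic budget
        const uint64_t target_size = 1ULL << 35;
        const uint64_t op_size     = 64ull * 1024 * 1024; // hypothetical 64 MiB per op
        const uint64_t graph_slots = 8192 - 1;            // free node slots in the graph

        uint64_t n_runs = std::min(graph_slots, target_size / op_size) + 1;
        printf("n_runs = %llu\n", (unsigned long long) n_runs); // 512 + 1 = 513

        // reported bandwidth = total bytes touched / elapsed seconds
        double seconds = 0.25;                            // hypothetical elapsed time
        double gb_s = (double)(n_runs * op_size) / seconds / (1024.0*1024.0*1024.0);
        printf("%.2f GB/s\n", gb_s);                      // ~128 GB/s in this example
    }
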
-
-// GGML_OP_UNARY
-struct test_unary : public test_case {
-    const ggml_unary_op op;
-    const ggml_type type;
-    const std::array<int64_t, 4> ne_a;
-    int v; // view (1 : non-contiguous a)
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne_a, v);
-    }
-
-    test_unary(ggml_unary_op op,
-            ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_a = {128, 10, 10, 10},
-            int v = 0)
-        : op(op), type(type), ne_a(ne_a), v(v) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a;
-        if (v & 1) {
-            auto ne = ne_a; ne[0] *= 3;
-            a = ggml_new_tensor(ctx, type, 4, ne.data());
-            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
-        } else {
-            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
-        }
-        ggml_tensor * out = ggml_unary(ctx, a, op);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            // test extended range of values to check for NaNs in GELU
-            init_tensor_uniform(t, -150.f, 150.f);
-        }
-    }
-};
-
-// GGML_OP_GET_ROWS
-struct test_get_rows : public test_case {
-    const ggml_type type;
-    const int n; // cols
-    const int m; // rows
-    const int r; // rows to get
-    const int b; // batch size
-    const bool v; // view (non-contiguous src1)
-
-    std::string vars() override {
-        return VARS_TO_STR6(type, n, m, r, b, v);
-    }
-
-    test_get_rows(ggml_type type = GGML_TYPE_F32, int n = 10, int m = 5, int r = 3, int b = 1, bool v = false)
-        : type(type), n(n), m(m), r(r), b(b), v(v) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * in = ggml_new_tensor_3d(ctx, type, n, m, b);
-        ggml_tensor * rows = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, r, b);
-        if (v) {
-            rows = ggml_view_2d(ctx, rows, r/2, b, rows->nb[1], 0);
-        }
-        ggml_tensor * out = ggml_get_rows(ctx, in, rows);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->type == GGML_TYPE_I32) {
-                if (ggml_is_view_op(t->op)) { continue; }
-                // rows
-                std::vector<int> data(r*b);
-                for (int i = 0; i < r*b; i++) {
-                    data[i] = rand() % m;
-                }
-                ggml_backend_tensor_set(t, data.data(), 0, r * b * sizeof(int));
-            } else {
-                init_tensor_uniform(t);
-            }
-        }
-    }
-};
-
-// GGML_OP_REPEAT
-struct test_repeat : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const std::array<int, 4> nr;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, nr);
-    }
-
-    size_t op_size(ggml_tensor * t) override {
-        return ggml_nbytes(t) * 2;
-    }
-
-    test_repeat(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            std::array<int, 4> nr = {2, 2, 2, 2})
-        : type(type), ne(ne), nr(nr) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * target = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
-        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_repeat(ctx, src, target);
-        return out;
-    }
-};
-
-// GGML_OP_DUP
-struct test_dup : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const std::array<int64_t, 4> permute;
-    bool _use_permute;
-
-    std::string vars() override {
-        std::string v = VARS_TO_STR2(type, ne);
-        if (_use_permute) v += "," + VAR_TO_STR(permute);
-        return v;
-    }
-
-    test_dup(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 20, 1},
-            std::array<int64_t, 4> permute = {0, 0, 0, 0})
-        : type(type), ne(ne), permute(permute),
-            _use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-        if (_use_permute) {
-            src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
-        }
-        ggml_tensor * out = ggml_dup(ctx, src);
-        return out;
-    }
-};
-
-// GGML_OP_CPY
-struct test_cpy : public test_case {
-    const ggml_type type_src;
-    const ggml_type type_dst;
-    const std::array<int64_t, 4> ne;
-    const std::array<int64_t, 4> permute;
-    bool _src_use_permute;
-
-    std::string vars() override {
-        return VARS_TO_STR4(type_src, type_dst, ne, permute);
-    }
-
-    double max_nmse_err() override {
-        return 1e-6;
-    }
-
-    size_t op_size(ggml_tensor * t) override {
-        return ggml_nbytes(t) + ggml_nbytes(t->src[0]);
-    }
-
-    test_cpy(ggml_type type_src = GGML_TYPE_F32, ggml_type type_dst = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            std::array<int64_t, 4> permute = {0, 0, 0, 0},
-            bool _dst_use_permute = false)
-        : type_src(type_src), type_dst(type_dst), ne(ne), permute(permute),
-            _src_use_permute(permute[0] + permute[1] + permute[2] + permute[3] > 0) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne.data());
-        if (_src_use_permute) {
-            src = ggml_permute(ctx, src, permute[0], permute[1], permute[2], permute[3]);
-        }
-        ggml_tensor* dst = ggml_new_tensor(ctx, type_dst, 4, src->ne);
-        ggml_tensor * out = ggml_cpy(ctx, src, dst);
-        return out;
-    }
-};
-
-// GGML_OP_CONT
-struct test_cont : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_cont(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
-        src = ggml_transpose(ctx, src);
-        ggml_tensor * out = ggml_cont(ctx, src);
-
-        return out;
-    }
-};
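
Note: every case above follows the same recipe: fields hold the parameters, vars() serializes them into the log line, build_graph() emits a single-op graph, and optional overrides adjust tolerances or initialization. A hypothetical new case (illustrative only, test_neg is not part of the original file) would be about this much code:

    // Hypothetical case showing the test_case recipe, reusing the file's
    // helpers (VARS_TO_STR2, test_case) and an existing ggml unary op.
    struct test_neg : public test_case {
        const ggml_type type;
        const std::array<int64_t, 4> ne;

        std::string vars() override {
            return VARS_TO_STR2(type, ne); // printed as "type=f32,ne=[10,10,10,10]"
        }

        test_neg(ggml_type type = GGML_TYPE_F32,
                std::array<int64_t, 4> ne = {10, 10, 10, 10})
            : type(type), ne(ne) {}

        ggml_tensor * build_graph(ggml_context * ctx) override {
            ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
            return ggml_unary(ctx, a, GGML_UNARY_OP_NEG);
        }
    };
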
-
-// GGML_OP_ADD
-// GGML_OP_MUL
-// GGML_OP_DIV
-struct test_bin_bcast : public test_case {
-    using op_t = ggml_tensor * (*) (ggml_context *, ggml_tensor *, ggml_tensor *);
-    op_t op;
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const std::array<int, 4> nr;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, nr);
-    }
-
-    size_t op_size(ggml_tensor * t) override {
-        return ggml_nbytes(t) * 3;
-    }
-
-    test_bin_bcast(op_t op, ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 1, 1},
-            std::array<int, 4> nr = {1, 2, 1, 1})
-        : op(op), type(type), ne(ne), nr(nr) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor_4d(ctx, type, ne[0]*nr[0], ne[1]*nr[1], ne[2]*nr[2], ne[3]*nr[3]);
-        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = op(ctx, a, b);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (op == ggml_div) {
-                // avoid division by zero
-                init_tensor_uniform(t, 1.0f, 2.0f);
-            } else {
-                init_tensor_uniform(t);
-            }
-        }
-    }
-};
-
-// GGML_OP_SCALE
-struct test_scale : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    float scale;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, scale);
-    }
-
-    test_scale(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            float scale = 2.0f)
-        : type(type), ne(ne), scale(scale) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_scale(ctx, a, scale);
-        return out;
-    }
-};
-
-// GGML_OP_NORM
-struct test_norm : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    float eps;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, eps);
-    }
-
-    test_norm(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {64, 10, 10, 10},
-            float eps = 1e-6f)
-        : type(type), ne(ne), eps(eps) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_norm(ctx, a, eps);
-        return out;
-    }
-};
-
-// GGML_OP_RMS_NORM
-struct test_rms_norm : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    float eps;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, eps);
-    }
-
-    test_rms_norm(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {64, 10, 10, 10},
-            float eps = 1e-6f)
-        : type(type), ne(ne), eps(eps) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_rms_norm(ctx, a, eps);
-        return out;
-    }
-};
-
-// GGML_OP_MUL_MAT
-struct test_mul_mat : public test_case {
-    const ggml_type type_a;
-    const ggml_type type_b;
-    const int64_t m;
-    const int64_t n;
-    const int64_t k;
-    const std::array<int64_t, 2> bs; // dims 3 and 4
-    const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
-
-    std::string vars() override {
-        return VARS_TO_STR7(type_a, type_b, m, n, k, bs, nr);
-    }
-
-    double max_nmse_err() override {
-        return 5e-4;
-    }
-
-    size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[0]) * n * nr[0] * nr[1];
-        size_t b = ggml_nbytes(t->src[1]) * m;
-        size_t c = ggml_nbytes(t);
-        return a + b + c;
-
-        GGML_UNUSED(t);
-    }
-
-    test_mul_mat(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
-            int64_t m = 32, int64_t n = 32, int64_t k = 32,
-            std::array<int64_t, 2> bs = {10, 10},
-            std::array<int64_t, 2> nr = {2, 2})
-        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        // C^T = A * B^T: (k, m) * (k, n) => (m, n)
-        ggml_tensor * a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
-        ggml_tensor * b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
-        ggml_tensor * out = ggml_mul_mat(ctx, a, b);
-        return out;
-    }
-};
-
-// GGML_OP_MUL_MAT_ID
-struct test_mul_mat_id : public test_case {
-    const ggml_type type_a;
-    const ggml_type type_b;
-    const int n_mats;
-    const int n_used;
-    const bool b; // broadcast b matrix
-    const int64_t m;
-    const int64_t n;
-    const int64_t k;
-
-    std::string vars() override {
-        return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k);
-    }
-
-    double max_nmse_err() override {
-        return 5e-4;
-    }
-
-    size_t op_size(ggml_tensor * t) override {
-        size_t a = ggml_nbytes(t->src[2]) * n;
-        size_t b = ggml_nbytes(t->src[1]) * m;
-        size_t c = ggml_nbytes(t);
-        return a + b + c;
-
-        GGML_UNUSED(t);
-    }
-
-    test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32,
-            int n_mats = 8, int n_used = 2, bool b = false,
-            int64_t m = 32, int64_t n = 32, int64_t k = 32)
-        : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b),
-            m(m), n(n), k(k) {
-        GGML_ASSERT(n_used <= n_mats);
-    }
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        // C^T = A * B^T: (k, m) * (k, n) => (m, n)
-        ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats);
-        ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n);
-        if (n_used != n_mats) {
-            ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0);
-        }
-        ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n);
-        ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        std::random_device rd;
-        std::default_random_engine rng(rd());
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->type == GGML_TYPE_I32) {
-                if (ggml_is_view_op(t->op)) { continue; }
-                // ids
-                for (int64_t r = 0; r < ggml_nrows(t); r++) {
-                    std::vector<int32_t> data(t->ne[0]);
-                    for (int i = 0; i < t->ne[0]; i++) {
-                        data[i] = i % n_mats;
-                    }
-                    std::shuffle(data.begin(), data.end(), rng);
-                    ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t));
-                }
-            } else {
-                init_tensor_uniform(t);
-            }
-        }
-    }
-};
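
Note: the shape convention in the two matmul cases above trips people up: ne[0] is the contiguous dimension, so a tensor with ne = [k, m] is m rows of length k, and ggml_mul_mat contracts dimension 0 of both operands, hence the C^T = A * B^T comment. Illustrative bookkeeping only:

    #include <cstdio>

    int main() {
        // ggml-style shapes: ne[0] is the row length (contiguous dimension)
        const long k = 32, m = 4, n = 8;  // illustrative sizes
        const long ne_a[2] = {k, m};      // A: m rows of length k
        const long ne_b[2] = {k, n};      // B: n rows of length k
        // contracting dimension 0 of both operands leaves ne = [m, n]
        const long ne_c[2] = {ne_a[1], ne_b[1]};
        printf("C: ne = [%ld, %ld]\n", ne_c[0], ne_c[1]); // [4, 8]
    }
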
-
-// GGML_OP_SQR
-struct test_sqr : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_sqr(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_sqr(ctx, a);
-        return out;
-    }
-};
-
-// GGML_OP_SQRT
-struct test_sqrt : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_sqrt(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_sqrt(ctx, a);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        // fill with positive values
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            init_tensor_uniform(t, 0.0f, 100.0f);
-        }
-    }
-};
-
-// GGML_OP_CLAMP
-struct test_clamp : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    float min;
-    float max;
-
-    std::string vars() override {
-        return VARS_TO_STR4(type, ne, min, max);
-    }
-
-    test_clamp(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            float min = -0.5f, float max = 0.5f)
-        : type(type), ne(ne), min(min), max(max) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_clamp(ctx, a, min, max);
-        return out;
-    }
-};
-
-// GGML_OP_DIAG_MASK_INF
-struct test_diag_mask_inf : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const int n_past;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, n_past);
-    }
-
-    test_diag_mask_inf(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            int n_past = 5)
-        : type(type), ne(ne), n_past(n_past) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_diag_mask_inf(ctx, a, n_past);
-        return out;
-    }
-};
-
-// GGML_OP_SOFT_MAX
-struct test_soft_max : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const bool mask;
-    const float scale;
-    const float max_bias;
-
-    std::string vars() override {
-        return VARS_TO_STR5(type, ne, mask, scale, max_bias);
-    }
-
-    // the 1024 test with bias occasionally fails:
-    // SOFT_MAX(type=f32,ne=[1024,16,1,1],mask=1,scale=1.000000,max_bias=8.000000): [SOFT_MAX] NMSE = 0.000000103 > 0.000000100 FAIL
-    virtual double max_nmse_err() override {
-        return 1e-6;
-    }
-
-    test_soft_max(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            bool mask = false,
-            float scale = 1.0f,
-            float max_bias = 0.0f)
-        : type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * mask = nullptr;
-        if (this->mask) {
-            mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
-        }
-        ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
-        return out;
-    }
-};
-
-
-// GGML_OP_ROPE
-struct test_rope : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne_a;
-    int n_dims;
-    int mode;
-    int n_ctx; // used to generate positions
-    float fs; // freq_scale
-    float ef; // ext_factor
-    float af; // attn_factor
-    bool ff;
-    int v; // view (1 : non-contiguous a)
-
-    std::string vars() override {
-        return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
-    }
-
-    test_rope(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
-            int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
-        : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a;
-        if (v & 1) {
-            auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
-            a = ggml_new_tensor(ctx, type, 4, ne.data());
-            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
-        } else {
-            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
-        }
-        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
-        ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
-        ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->type == GGML_TYPE_I32) {
-                // pos
-                std::vector<int> data(ne_a[2]);
-                for (int i = 0; i < ne_a[2]; i++) {
-                    data[i] = rand() % n_ctx;
-                }
-                ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
-            } else {
-                if (t->ne[0] == n_dims/2) {
-                    // frequency factors in the range [0.9f, 1.1f]
-                    init_tensor_uniform(t, 0.9f, 1.1f);
-                } else {
-                    init_tensor_uniform(t);
-                }
-            }
-        }
-    }
-};
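
Note: the v flag pattern used by test_unary, test_rope and test_concat allocates an oversized parent tensor and takes a ggml_view_4d of the requested shape, so the same op also gets exercised on non-contiguous memory. Stripped to its essence (sizes illustrative, helper name hypothetical):

    #include <ggml.h>

    // Sketch of the non-contiguous-input pattern used by the v flag above.
    // The view reuses the parent's byte strides, so its rows are spaced by
    // the parent's larger stride and the layout is no longer contiguous.
    ggml_tensor * make_noncont_input(ggml_context * ctx, const int64_t ne[4]) {
        int64_t big[4] = {ne[0]*2, ne[1], ne[2], ne[3]}; // illustrative padding
        ggml_tensor * parent = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, big);
        return ggml_view_4d(ctx, parent, ne[0], ne[1], ne[2], ne[3],
                            parent->nb[1], parent->nb[2], parent->nb[3], 0);
    }
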
-
-// GGML_OP_POOL2D
-struct test_pool2d : public test_case {
-    enum ggml_op_pool pool_type;
-    const ggml_type type_input;
-    const std::array<int64_t, 4> ne_input;
-    // kernel size
-    const int k0;
-    const int k1;
-    // stride
-    const int s0;
-    const int s1;
-    // padding
-    const int p0;
-    const int p1;
-
-    std::string vars() override {
-        return VARS_TO_STR9(pool_type, type_input, ne_input, k0, k1, s0, s1, p0, p1);
-    }
-
-    test_pool2d(ggml_op_pool pool_type = GGML_OP_POOL_AVG,
-            ggml_type type_input = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
-            int k0 = 3, int k1 = 3,
-            int s0 = 1, int s1 = 1,
-            int p0 = 1, int p1 = 1)
-        : pool_type(pool_type), type_input(type_input), ne_input(ne_input), k0(k0), k1(k1), s0(s0), s1(s1), p0(p0), p1(p1) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
-        ggml_tensor * out = ggml_pool_2d(ctx, input, pool_type, k0, k1, s0, s1, p0, p1);
-        return out;
-    }
-};
-
-// GGML_OP_CONV_TRANSPOSE_1D
-struct test_conv_transpose_1d : public test_case {
-    const std::array<int64_t, 4> ne_input;
-    const std::array<int64_t, 4> ne_kernel;
-
-    const int s0; // stride
-    const int p0; // padding
-    const int d0; // dilation
-
-    std::string vars() override {
-        return VARS_TO_STR5(ne_input, ne_kernel, s0, p0, d0);
-    }
-
-    test_conv_transpose_1d(std::array<int64_t, 4> ne_input = {197, 32, 1, 1}, // [input_width, input_height, input_channels, 1]
-            std::array<int64_t, 4> ne_kernel = {16, 32, 32, 1}, // [kernel_width, kernel_height, input_channels, 1]
-            int s0 = 1, int p0 = 0, int d0 = 1)
-        : ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), p0(p0), d0(d0) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input.data());
-        ggml_tensor * kernel = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_kernel.data());
-        ggml_tensor * out = ggml_conv_transpose_1d(ctx, kernel, input, s0, p0, d0);
-        return out;
-    }
-};
-
-// GGML_OP_IM2COL
-struct test_im2col : public test_case {
-    const ggml_type type_input;
-    const ggml_type type_kernel;
-    const ggml_type dst_type;
-    const std::array<int64_t, 4> ne_input;
-    const std::array<int64_t, 4> ne_kernel;
-    // stride
-    const int s0;
-    const int s1;
-    // padding
-    const int p0;
-    const int p1;
-    // dilation
-    const int d0;
-    const int d1;
-    // mode
-    const bool is_2D;
-
-    std::string vars() override {
-        return VARS_TO_STR12(type_input, type_kernel, dst_type, ne_input, ne_kernel, s0, s1, p0, p1, d0, d1, is_2D);
-    }
-
-    test_im2col(ggml_type type_input = GGML_TYPE_F32, ggml_type type_kernel = GGML_TYPE_F16, ggml_type dst_type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_input = {10, 10, 3, 1}, // [input_width, input_height, input_channels, 1]
-            std::array<int64_t, 4> ne_kernel = {3, 3, 3, 1}, // [kernel_width, kernel_height, input_channels, 1]
-            int s0 = 1, int s1 = 1,
-            int p0 = 1, int p1 = 1,
-            int d0 = 1, int d1 = 1,
-            bool is_2D = true)
-        : type_input(type_input), type_kernel(type_kernel), dst_type(dst_type), ne_input(ne_input), ne_kernel(ne_kernel), s0(s0), s1(s1), p0(p0), p1(p1), d0(d0), d1(d1), is_2D(is_2D) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * input = ggml_new_tensor(ctx, type_input, 4, ne_input.data());
-        ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel.data());
-        ggml_tensor * out = ggml_im2col(ctx, kernel, input, s0, s1, p0, p1, d0, d1, is_2D, dst_type);
-        return out;
-    }
-};
-
-// GGML_OP_CONCAT
-struct test_concat : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne_a;
-    const int64_t ne_b_d;
-    const int dim;
-    const int v; // view (1 << 0: non-cont a, 1 << 1: non-cont b)
-
-    std::string vars() override {
-        return VARS_TO_STR5(type, ne_a, ne_b_d, dim, v);
-    }
-
-    test_concat(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_a = {10, 10, 10, 10},
-            int64_t ne_b_d = 10,
-            int dim = 2, int v = 0)
-        : type(type), ne_a(ne_a), ne_b_d(ne_b_d), dim(dim), v(v) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        auto ne_b = ne_a;
-        ne_b[dim] = ne_b_d;
-        ggml_tensor * a;
-        if (v & 1) {
-            auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
-            a = ggml_new_tensor(ctx, type, 4, ne.data());
-            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
-        } else {
-            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
-        }
-        ggml_tensor * b;
-        if (v & 2) {
-            auto ne = ne_b; ne[0] *= 3; ne[1] *= 2; ne[2] *= 4;
-            b = ggml_new_tensor(ctx, type, 4, ne.data());
-            b = ggml_view_4d(ctx, b, ne_b[0], ne_b[1], ne_b[2], ne_b[3], b->nb[1], b->nb[2], b->nb[3], 0);
-        } else {
-            b = ggml_new_tensor(ctx, type, 4, ne_b.data());
-        }
-        ggml_tensor * out = ggml_concat(ctx, a, b, dim);
-        return out;
-    }
-};
-
-// GGML_OP_ARGSORT
-struct test_argsort : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    ggml_sort_order order;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, order);
-    }
-
-    test_argsort(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {16, 10, 10, 10},
-            ggml_sort_order order = GGML_SORT_ORDER_ASC)
-        : type(type), ne(ne), order(order) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_argsort(ctx, a, order);
-        return out;
-    }
-
-    void initialize_tensors(ggml_context * ctx) override {
-        std::random_device rd;
-        std::default_random_engine rng(rd());
-        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
-            if (t->type == GGML_TYPE_I32) {
-                // indices
-                std::vector<int> data(ggml_nelements(t));
-                for (int i = 0; i < ggml_nelements(t); i++) {
-                    data[i] = rand();
-                }
-                std::shuffle(data.begin(), data.end(), rng);
-                ggml_backend_tensor_set(t, data.data(), 0, ne[0]*ne[1]*ne[2]*ne[3] * sizeof(int));
-            } else if (t->type == GGML_TYPE_F32) {
-                // initialize with unique values to avoid ties
-                for (int64_t r = 0; r < ggml_nrows(t); r++) {
-                    std::vector<float> data(t->ne[0]);
-                    for (int i = 0; i < t->ne[0]; i++) {
-                        data[i] = i;
-                    }
-                    std::shuffle(data.begin(), data.end(), rng);
-                    ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
-                }
-            } else {
-                GGML_ASSERT(false);
-            }
-        }
-    }
-};
-
-// GGML_OP_SUM_ROWS
-struct test_sum_rows : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-
-    std::string vars() override {
-        return VARS_TO_STR2(type, ne);
-    }
-
-    test_sum_rows(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10})
-        : type(type), ne(ne) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_sum_rows(ctx, a);
-        return out;
-    }
-};
-
-// GGML_OP_UPSCALE
-struct test_upscale : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const int32_t scale_factor;
-    const bool transpose;
-
-    std::string vars() override {
-        return VARS_TO_STR4(type, ne, scale_factor, transpose);
-    }
-
-    test_upscale(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {512, 512, 3, 1},
-            int32_t scale_factor = 2, bool transpose = false)
-        : type(type), ne(ne), scale_factor(scale_factor), transpose(transpose) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        if (transpose) a = ggml_transpose(ctx, a);
-        ggml_tensor * out = ggml_upscale(ctx, a, scale_factor);
-        return out;
-    }
-};
-
-// GGML_OP_UPSCALE (ext)
-struct test_upscale_ext : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const std::array<int64_t, 4> ne_tgt;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, ne_tgt);
-    }
-
-    test_upscale_ext(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {2, 5, 7, 11},
-            std::array<int64_t, 4> ne_tgt = {5, 7, 11, 13})
-        : type(type), ne(ne), ne_tgt(ne_tgt) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_upscale_ext(ctx, a, ne_tgt[0], ne_tgt[1],ne_tgt[2], ne_tgt[3]);
-        return out;
-    }
-};
-
-// GGML_OP_GROUP_NORM
-struct test_group_norm : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    const int32_t num_groups;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne, num_groups);
-    }
-
-    test_group_norm(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {64, 64, 320, 1},
-            int32_t num_groups = 32)
-        : type(type), ne(ne), num_groups(num_groups) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_group_norm(ctx, a, num_groups);
-        return out;
-    }
-};
-
-// GGML_OP_ACC
-struct test_acc : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne_a;
-    const std::array<int64_t, 4> ne_b;
-
-    std::string vars() override {
-        return VARS_TO_STR3(type, ne_a, ne_b);
-    }
-
-    test_acc(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne_a = {1024, 577, 1, 1},
-            std::array<int64_t, 4> ne_b = {1024, 576, 1, 1})
-        : type(type), ne_a(ne_a), ne_b(ne_b) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data());
-        ggml_tensor * b = ggml_new_tensor(ctx, type, 4, ne_b.data());
-        ggml_tensor * out = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], b->nb[1]);
-        return out;
-    }
-};
0, 0); - return out; - } -}; - -// GGML_OP_ARANGE -struct test_arange : public test_case { - const ggml_type type; - const float start; - const float stop; - const float step; - - std::string vars() override { - return VARS_TO_STR4(type, start, stop, step); - } - - test_arange(ggml_type type = GGML_TYPE_F32, - float start = 0.f, float stop = 10.f, float step = 1.f) - : type(type), start(start), stop(stop), step(step) {} - - ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * out = ggml_arange(ctx, start, stop, step); - return out; - } -}; - -// GGML_OP_TIMESTEP_EMBEDDING -struct test_timestep_embedding : public test_case { - const ggml_type type; - const std::array ne_a; - const int dim; - const int max_period; - - std::string vars() override { - return VARS_TO_STR4(type, ne_a, dim, max_period); - } - - test_timestep_embedding(ggml_type type = GGML_TYPE_F32, - std::array ne_a = {2, 1, 1, 1}, - int dim = 320, int max_period=10000) - : type(type), ne_a(ne_a), dim(dim), max_period(max_period) {} - - ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); - ggml_tensor * out = ggml_timestep_embedding(ctx, a, dim, max_period); - return out; - } -}; - -// GGML_OP_LEAKY_RELU -struct test_leaky_relu : public test_case { - const ggml_type type; - const std::array ne_a; - const float negative_slope; - - std::string vars() override { - return VARS_TO_STR3(type, ne_a, negative_slope); - } - - test_leaky_relu(ggml_type type = GGML_TYPE_F32, - std::array ne_a = {10, 10, 10, 10}, - float negative_slope = 0.1f) - : type(type), ne_a(ne_a), negative_slope(negative_slope) {} - - ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); - ggml_tensor * out = ggml_leaky_relu(ctx, a, negative_slope, true); - return out; - } -}; - -// GGML_OP_FLASH_ATTN_EXT -struct test_flash_attn_ext : public test_case { - const int64_t hs; // head size - const int64_t nh; // num heads - const int64_t kv; // kv size - const int64_t nb; // batch size - - const bool mask; // use mask - - const float max_bias; // ALiBi - - const ggml_type type_KV; - - std::string vars() override { - return VARS_TO_STR7(hs, nh, kv, nb, mask, max_bias, type_KV); - } - - double max_nmse_err() override { - return 5e-4; - } - - test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, bool mask = true, float max_bias = 0.0f, ggml_type type_KV = GGML_TYPE_F16) - : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), type_KV(type_KV) {} - - ggml_tensor * build_graph(ggml_context * ctx) override { - const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV)); - - ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1); - ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1); - ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV, hs_padded, kv, nh, 1); - ggml_tensor * m = mask ? 
ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1) : nullptr; - ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, m, 1.0f/sqrtf(hs), max_bias); - return out; - } -}; - -enum llm_norm_type { - LLM_NORM, - LLM_NORM_RMS, -}; - -struct llama_hparams { - uint32_t n_vocab; - uint32_t n_embd; - uint32_t n_head; - uint32_t n_head_kv; - static constexpr uint32_t n_layer = 1; - uint32_t n_rot; - uint32_t n_embd_head; // dimension of values (d_v) - uint32_t n_ff; - - float f_norm_eps; - float f_norm_rms_eps; - - // cparams - static constexpr uint32_t n_ctx = 512; // user-specified context size - static constexpr uint32_t n_ctx_orig = n_ctx; - - // batch - int32_t n_tokens; - - // llm_build_context - static constexpr int32_t n_kv = 32; // size of KV cache to consider (n_kv <= n_ctx - static constexpr int32_t kv_head = 1; // index of where we store new KV data in the cache - - uint32_t n_embd_gqa() const { // dimension of key embeddings across all k-v heads - return n_embd_head * n_head_kv; - } -}; - -// LLM base class -struct test_llm : public test_case { - llama_hparams hp; - -protected: - test_llm(llama_hparams hp) - : hp(std::move(hp)) { - } - -public: - struct ggml_tensor * llm_build_norm( - struct ggml_context * ctx, - struct ggml_tensor * cur, - struct ggml_tensor * mw, - struct ggml_tensor * mb, - llm_norm_type type) { - switch (type) { - case LLM_NORM: cur = ggml_norm (ctx, cur, hp.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hp.f_norm_rms_eps); break; - } - cur = ggml_mul(ctx, cur, mw); - if (mb) { - cur = ggml_add(ctx, cur, mb); - } - return cur; - } - - void llm_build_kv_store( - struct ggml_context * ctx, - struct ggml_tensor * k_l, - struct ggml_tensor * v_l, - struct ggml_tensor * k_cur, - struct ggml_tensor * v_cur) { - // compute the transposed [n_tokens, n_embd] V matrix - struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, hp.n_embd_gqa(), hp.n_tokens)); - - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, k_l, hp.n_tokens*hp.n_embd_gqa(), - (ggml_row_size(k_l->type, hp.n_embd_gqa()))*hp.kv_head); - - struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, v_l, hp.n_tokens, hp.n_embd_gqa(), - ( hp.n_ctx)*ggml_element_size(v_l), - (hp.kv_head)*ggml_element_size(v_l)); - - // important: storing RoPE-ed version of K in the KV cache! 
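// editor's note: the two views above encode the KV-cache layout this test assumes.
// k_l is read as a contiguous [n_embd_gqa, n_ctx] buffer, so token t's keys start at
//     ggml_row_size(k_l->type, n_embd_gqa()) * t
// (hence the 1d view offset by kv_head rows), while v_l is stored transposed as
// [n_ctx, n_embd_gqa]: each embedding dimension is one row of n_ctx values, so the
// 2d view strides by n_ctx * element_size per row and new tokens land at column
// kv_head. The transposed V is what lets llm_build_kqv below multiply V by KQ directly.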
- ggml_cpy(ctx, k_cur, k_cache_view); - ggml_cpy(ctx, v_cur_t, v_cache_view); - } - - struct ggml_tensor * llm_build_kqv( - struct ggml_context * ctx, - struct ggml_tensor * k_l, - struct ggml_tensor * v_l, - struct ggml_tensor * q_cur, - struct ggml_tensor * kq_mask, - float kq_scale) { - struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); - - struct ggml_tensor * k = - ggml_view_3d(ctx, k_l, - hp.n_embd_head, hp.n_kv, hp.n_head_kv, - ggml_row_size(k_l->type, hp.n_embd_gqa()), - ggml_row_size(k_l->type, hp.n_embd_head), - 0); - - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f); - - // split cached v into n_head heads - struct ggml_tensor * v = - ggml_view_3d(ctx, v_l, - hp.n_kv, hp.n_embd_head, hp.n_head_kv, - ggml_element_size(v_l)*hp.n_ctx, - ggml_element_size(v_l)*hp.n_ctx*hp.n_embd_head, - 0); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); - - struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, hp.n_embd_head*hp.n_head, hp.n_tokens); - - struct ggml_tensor * wo = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd); - cur = ggml_mul_mat(ctx, wo, cur); - - return cur; - } - - void initialize_tensors(ggml_context * ctx) override { - for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { - if (t->type == GGML_TYPE_I32) { - // pos - std::vector data(hp.n_tokens); - for (int i = 0; i < hp.n_tokens; i++) { - data[i] = rand() % hp.n_ctx; - } - ggml_backend_tensor_set(t, data.data(), 0, hp.n_tokens * sizeof(int)); - } else { - init_tensor_uniform(t); - } - } - } -}; - -// Llama -struct test_llama : public test_llm { - static constexpr float freq_base = 10000.0f; - static constexpr float freq_scale = 1.0f; - static constexpr float ext_factor = 0.0f; - static constexpr float attn_factor = 1.0f; - static constexpr float beta_fast = 32.0f; - static constexpr float beta_slow = 1.0f; - - std::string op_desc(ggml_tensor * t) override { - GGML_UNUSED(t); - return "LLAMA"; - } - - std::string vars() override { - auto n_tokens = hp.n_tokens; - return VARS_TO_STR1(n_tokens); - } - - double max_nmse_err() override { - return 2e-3; - } - - test_llama(int n_tokens = 1) - : test_llm({ - /*n_vocab =*/ 32000, - /*n_embd =*/ 3200, - /*n_head =*/ 32, - /*n_head_kv =*/ 32, - /*n_rot =*/ 100, - /*n_embd_head =*/ 100, - /*n_ff =*/ 8640, - /*f_norm_eps =*/ 0.f, - /*f_norm_rms_eps =*/ 1e-5f, - /*n_tokens =*/ n_tokens, - }) { - } - - ggml_tensor * build_graph(ggml_context * ctx) override { - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1); - - ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); - ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); - - for (uint32_t il = 0; il < hp.n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - ggml_tensor * attn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); - cur = llm_build_norm(ctx, inpL, attn_norm, nullptr, LLM_NORM_RMS); - - // self-attention - { - ggml_tensor * wq = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd); - 
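// editor's note: unlike wq, the wk/wv weights declared next project down to
// n_embd_gqa() = n_embd_head * n_head_kv rather than n_embd, which is what makes
// this graph grouped-query-attention ready. For this llama test the two coincide
// (n_head == n_head_kv == 32), so the distinction only becomes visible in the
// falcon test below, where n_head_kv = 1 gives n_embd_gqa() = 64 vs n_embd = 3200.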
ggml_tensor * wk = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa()); - ggml_tensor * wv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd_gqa()); - - // compute Q and K and RoPE them - struct ggml_tensor * Qcur = ggml_mul_mat(ctx, wq, cur); - struct ggml_tensor * Kcur = ggml_mul_mat(ctx, wk, cur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx, wv, cur); - - Qcur = ggml_rope_ext( - ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr, - hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - - Kcur = ggml_rope_ext( - ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr, - hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow - ); - - llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur); - - cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head))); - } - - struct ggml_tensor * ffn_inp = ggml_add(ctx, cur, inpSA); - - // feed-forward network - ggml_tensor * ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); - cur = llm_build_norm(ctx, ffn_inp, ffn_norm, nullptr, LLM_NORM_RMS); - - ggml_tensor * ffn_gate = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff); - ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd); - ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff); - struct ggml_tensor * tmp = ggml_mul_mat(ctx, ffn_up, cur); - cur = ggml_mul_mat(ctx, ffn_gate, cur); - cur = ggml_silu(ctx, cur); - cur = ggml_mul(ctx, cur, tmp); - cur = ggml_mul_mat(ctx, ffn_down, cur); - - cur = ggml_add(ctx, cur, ffn_inp); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); - cur = llm_build_norm(ctx, cur, output_norm, nullptr, LLM_NORM_RMS); - - // lm_head - ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_vocab); - cur = ggml_mul_mat(ctx, output, cur); - - return cur; - } -}; - -// Falcon -struct test_falcon : public test_llm { - static constexpr float freq_base = 10000.0f; - static constexpr float freq_scale = 1.0f; - static constexpr float ext_factor = 0.0f; - static constexpr float attn_factor = 1.0f; - static constexpr float beta_fast = 32.0f; - static constexpr float beta_slow = 1.0f; - - std::string op_desc(ggml_tensor * t) override { - GGML_UNUSED(t); - return "FALCON"; - } - - std::string vars() override { - auto n_tokens = hp.n_tokens; - return VARS_TO_STR1(n_tokens); - } - - double max_nmse_err() override { - return 2e-3; - } - - test_falcon(int n_tokens = 1) - : test_llm({ - /*n_vocab =*/ 32000, - /*n_embd =*/ 3200, - /*n_head =*/ 50, - /*n_head_kv =*/ 1, - /*n_rot =*/ 64, - /*n_embd_head =*/ 64, - /*n_ff =*/ 8640, - /*f_norm_eps =*/ 1e-5f, - /*f_norm_rms_eps =*/ 0.f, - /*n_tokens =*/ n_tokens, - }) { - } - - ggml_tensor * build_graph(ggml_context * ctx) override { - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hp.n_embd, hp.n_tokens); - - // inp_pos - contains the positions - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1); - - ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); - ggml_tensor * 
v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); - - for (uint32_t il = 0; il < hp.n_layer; ++il) { - // norm - ggml_tensor * attn_norm_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); - ggml_tensor * attn_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); - ggml_tensor * attn_norm = llm_build_norm(ctx, inpL, attn_norm_w, attn_norm_b, LLM_NORM); - - // self-attention - { - cur = attn_norm; - - ggml_tensor * wqkv = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_embd + 2*hp.n_embd_gqa()); - - cur = ggml_mul_mat(ctx, wqkv, cur); - - struct ggml_tensor * Qcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd, hp.n_tokens, cur->nb[1], 0*sizeof(float)*(hp.n_embd))); - struct ggml_tensor * Kcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd))); - struct ggml_tensor * Vcur = ggml_cont(ctx, ggml_view_2d(ctx, cur, hp.n_embd_gqa(), hp.n_tokens, cur->nb[1], 1*sizeof(float)*(hp.n_embd + hp.n_embd_gqa()))); - - Qcur = ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens); - Kcur = ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens); - - // using mode = 2 for neox mode - Qcur = ggml_rope_ext( - ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - - Kcur = ggml_rope_ext( - ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig, - freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow - ); - - llm_build_kv_store(ctx, k_l, v_l, Kcur, Vcur); - - cur = llm_build_kqv(ctx, k_l, v_l, Qcur, KQ_mask, 1.0f/sqrtf(float(hp.n_embd_head))); - } - - struct ggml_tensor * ffn_inp = cur; - - // feed forward - { - ggml_tensor * ffn_up = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_embd, hp.n_ff); - ggml_tensor * ffn_down = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, hp.n_ff, hp.n_embd); - cur = attn_norm; - cur = ggml_mul_mat(ctx, ffn_up, cur); - cur = ggml_gelu(ctx, cur); - cur = ggml_mul_mat(ctx, ffn_down, cur); - } - - cur = ggml_add(ctx, cur, ffn_inp); - - cur = ggml_add(ctx, cur, inpL); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - ggml_tensor * output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); - ggml_tensor * output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hp.n_embd); - cur = llm_build_norm(ctx, cur, output_norm, output_norm_b, LLM_NORM); - - // lm_head - ggml_tensor * output = ggml_new_tensor_2d(ctx, GGML_TYPE_Q8_0, hp.n_embd, hp.n_vocab); - cur = ggml_mul_mat(ctx, output, cur); - - return cur; - } -}; - -static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) { - std::vector> test_cases; - std::default_random_engine rng(0); - - const ggml_type all_types[] = { - GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, - GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, - GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, - GGML_TYPE_Q8_0, - GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, - GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K, - GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, - GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, - }; - - const ggml_type base_types[] = { - GGML_TYPE_F32, GGML_TYPE_F16, - GGML_TYPE_Q4_0, - GGML_TYPE_Q4_K, - GGML_TYPE_IQ2_XXS - }; - - const ggml_type other_types[] = { - GGML_TYPE_Q4_1, - GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, - GGML_TYPE_Q8_0, - GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, - GGML_TYPE_Q5_K, - GGML_TYPE_Q6_K, - GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, 
GGML_TYPE_IQ1_M, - GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, - GGML_TYPE_BF16, - }; - - // unary ops - for (int v : {0, 1}) { - for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) { - test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 128, 10, 10, 10 }, v)); - test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 }, v)); - } - } - - test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false)); - for (ggml_type type : all_types) { - for (int b : {1, 7}) { - for (bool v : {false, true}) { - test_cases.emplace_back(new test_get_rows(type, 256, 5, 4, b, v)); - } - } - } - for (int b : {1, 7}) { - for (bool v : {false, true}) { - test_cases.emplace_back(new test_get_rows(GGML_TYPE_I32, 256, 5, 4, b, v)); - } - } - - for (ggml_type type_input : {GGML_TYPE_F32}) { - for (ggml_op_pool pool_type : {GGML_OP_POOL_AVG, GGML_OP_POOL_MAX}) { - for (int k0 : {1, 3}) { - for (int k1 : {1, 3}) { - for (int s0 : {1, 2}) { - for (int s1 : {1, 2}) { - for (int p0 : {0, 1}) { - for (int p1 : {0, 1}) { - test_cases.emplace_back(new test_pool2d(pool_type, type_input, {10, 10, 3, 1}, k0, k1, s0, s1, p0, p1)); - } - } - } - } - } - } - } - } - - test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32)); - test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16)); - - test_cases.emplace_back(new test_conv_transpose_1d()); - test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1)); - test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1)); - test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 1, 0, 1)); - test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 2, 0, 1)); - test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,2,2,1}, 1, 0, 1)); - test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1)); - test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); - - - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {2, 1, 1, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 2, 1, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 2, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 10, 10, 10}, {1, 1, 1, 2})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 10, 10, 10}, {2, 1, 1, 1})); - test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 10, 10, 10}, {1, 1, 1, 2})); - - test_cases.emplace_back(new test_dup(GGML_TYPE_F32)); - test_cases.emplace_back(new test_dup(GGML_TYPE_F16)); - test_cases.emplace_back(new test_dup(GGML_TYPE_I32)); - test_cases.emplace_back(new test_dup(GGML_TYPE_I16)); - test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {0, 2, 1, 3})); // dup by rows - test_cases.emplace_back(new test_dup(GGML_TYPE_F32, {10, 10, 5, 1}, {1, 0, 2, 3})); - test_cases.emplace_back(new test_dup(GGML_TYPE_F16, {10, 10, 5, 1}, {1, 0, 2, 3})); // dup dst not-contiguous - test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {0, 2, 1, 3})); - test_cases.emplace_back(new test_dup(GGML_TYPE_I16, {10, 8, 3, 1}, {1, 2, 0, 3})); - - for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { - for 
(ggml_type type_dst : all_types) { - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows - } - } - for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { - for (ggml_type type_dst : {GGML_TYPE_F16, GGML_TYPE_F32}) { - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {1, 0, 2, 3})); // cpy not-contiguous - } - } - - test_cases.emplace_back(new test_cont()); - - auto add_test_bin_bcast = [&](ggml_type type, std::array ne, std::array nr) { - for (auto op : {ggml_add, ggml_mul, ggml_div}) { - test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr)); - } - }; - - add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 8, 1}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1, 1}, {32, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 320, 320}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 1, 1}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 1}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 1, 2}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 1, 2, 2}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {1, 2, 2, 2}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 10, 10, 10}, {2, 2, 2, 2}); - - // stable diffusion - add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 16, 16, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {1280, 16, 16, 1}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {1280, 1, 1, 1}, {1, 256, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {16, 16, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {16, 16, 1280, 1}, {1, 1, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {16, 16, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 2560, 1}, {16, 16, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1280, 1}, {32, 32, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 1920, 1}, {32, 32, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {1, 1, 640, 1}, {32, 32, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {5120, 1, 1, 1}, {1, 256, 1, 1}); - add_test_bin_bcast(GGML_TYPE_F32, {640, 1, 1, 1}, {1, 1, 1, 1}); - //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {1, 1, 1, 1}); - //add_test_bin_bcast(GGML_TYPE_F32, {3, 3, 2560, 1280}, {2, 1, 1, 1}); - - test_cases.emplace_back(new test_scale()); - - for (float eps : {1e-6f, 1e-5f, 1e-3f, 1e-1f}) { - test_cases.emplace_back(new test_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps)); - test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps)); - } - -#if 1 - for (ggml_type type_a : base_types) { - for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {2, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 1})); - test_cases.emplace_back(new 
test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {1, 2})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 10}, {2, 2})); - - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, { 1, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 1}, {2, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 1})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {1, 2})); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {10, 10}, {2, 2})); - } - } -#else - // m = a rows - // n = b rows - // k = cols - std::uniform_int_distribution<> dist_m(1, 128); - std::uniform_int_distribution<> dist_n(16, 128); - std::uniform_int_distribution<> dist_k(1, 16); - for (int i = 0; i < 1000; i++) { - for (ggml_type type_a : all_types) { - for (ggml_type type_b : {GGML_TYPE_F32}) { - int m = dist_m(rng); - int n = dist_n(rng); - int k = dist_k(rng) * ggml_blck_size(type_a); - test_cases.emplace_back(new test_mul_mat(type_a, type_b, m, n, k, { 1, 1}, {1, 1})); - } - } - } -#endif - - for (ggml_type type_a : other_types) { - for (ggml_type type_b : {GGML_TYPE_F32}) { - test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1})); - } - } - - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1})); - test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1})); - - for (ggml_type type_a : base_types) { - for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { - for (int n_mats : {4, 8}) { - for (int n_used : {1, 2, 4}) { - for (bool b : {false, true}) { - for (int n : {1, 32}) { - int m = 512; - int k = 256; - test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k)); - } - } - } - } - } - } - - for (ggml_type type_a : other_types) { - for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { - for (int n_mats : {4}) { - for (int n_used : {2}) { - for (bool b : {false}) { - for (int n : {1}) { - int m = 512; - int k = 256; - test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k)); - } - } - } - } - } - } - - test_cases.emplace_back(new test_sqr()); - test_cases.emplace_back(new test_sqrt()); - test_cases.emplace_back(new test_clamp()); - - test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5)); - test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 1}, 5)); - test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5)); - -#if 0 - std::uniform_int_distribution<> dist_ne1(1, 50); - int exponent = 1; - while (exponent < (1 << 17)) { - std::uniform_int_distribution<> dist_ne0(exponent, 2*exponent); - - for (int n = 0; n < 10; ++n) { - int64_t ne0 = dist_ne0(rng); - int64_t ne1 = dist_ne1(rng); - 
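// editor's note: `exponent` doubles each pass of the while-loop above, up to 2^17,
// and ne0 is drawn from [exponent, 2*exponent], so this disabled fuzz block samples
// soft_max row lengths roughly log-uniformly over ~1..262144. The
// `ne0 < 1000 ? 4.0f : 0.0f` argument on the next line enables ALiBi (max_bias > 0)
// only for the shorter rows.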
test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f)); - } - - exponent <<= 1; - } -#endif - for (bool mask : {false, true}) { - for (float max_bias : {0.0f, 8.0f}) { - if (!mask && max_bias > 0.0f) continue; - for (float scale : {1.0f, 0.1f}) { - for (int64_t ne0 : {16, 1024}) { - for (int64_t ne1 : {16, 1024}) { - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias)); - } - } - } - } - } - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f)); - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f)); - - { - bool all = true; - - for (float v : { 0, 1 }) { - for (float fs : { 1.0f, 1.4245f }) { - for (float ef : { 0.0f, 0.7465f }) { - for (float af : { 1.0f, 1.4245f }) { - for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { - for (bool ff : {false, true}) { // freq_factors - test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B - - if (all) { - test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B - test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B - test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B - } - - if (all) { - test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B) - test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B) - test_cases.emplace_back(new test_rope(type, { 64, 8, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B) - test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm) - test_cases.emplace_back(new test_rope(type, { 80, 32, 10, 1}, 32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2) - } - - test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B) - } - } - - all = false; - } - } - } - } - } - - for (int v : { 0, 1, 2, 3 }) { - for (int dim : { 0, 1, 2, 3, }) { - test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v)); - test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v)); - } - } - - for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) { - test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order)); - test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order)); - test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen - } - - test_cases.emplace_back(new test_sum_rows()); - test_cases.emplace_back(new test_upscale()); - test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, { 512, 512, 3, 1 }, 2, true)); - test_cases.emplace_back(new test_upscale_ext()); - test_cases.emplace_back(new test_group_norm()); - test_cases.emplace_back(new test_acc()); - test_cases.emplace_back(new test_pad()); - test_cases.emplace_back(new test_arange()); - 
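// editor's note (context, not part of the original file): the default-constructed
// one-off cases here reuse shapes from real workloads - test_group_norm's
// 64x64x320 and the "stable diffusion" bin_bcast block above come from SD graphs,
// and test_acc's 1024x577 looks like a ViT/CLIP activation - while ops with
// genuinely divergent kernel paths (rope, soft_max, flash_attn below) get full sweeps.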
test_cases.emplace_back(new test_timestep_embedding()); - test_cases.emplace_back(new test_leaky_relu()); - - for (int hs : { 64, 80, 128, 256, }) { - for (bool mask : { true, false } ) { - for (float max_bias : { 0.0f, 8.0f }) { - if (!mask && max_bias > 0.0f) continue; - for (int nh : { 32, }) { - for (int kv : { 512, 1024, }) { - for (int nb : { 1, 2, 4, 8, }) { - for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) { - test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, type_KV)); - } - } - } - } - } - } - } - - // these tests are disabled to save execution time, but they can be handy for debugging -#if 0 - test_cases.emplace_back(new test_llama(1)); - test_cases.emplace_back(new test_llama(2)); - test_cases.emplace_back(new test_falcon(1)); - test_cases.emplace_back(new test_falcon(2)); -#endif - - // run tests - if (mode == MODE_TEST) { - ggml_backend_t backend_cpu = ggml_backend_cpu_init(); - - size_t n_ok = 0; - for (auto & test : test_cases) { - if (test->eval(backend, backend_cpu, op_name)) { - n_ok++; - } - } - printf(" %zu/%zu tests passed\n", n_ok, test_cases.size()); - - ggml_backend_free(backend_cpu); - - return n_ok == test_cases.size(); - } - - if (mode == MODE_PERF) { - for (auto & test : test_cases) { - test->eval_perf(backend, op_name); - } - return true; - } - - GGML_ASSERT(false); - return false; -} - -static void usage(char ** argv) { - printf("Usage: %s [mode] [-o op] [-b backend]\n", argv[0]); - printf(" valid modes are: test (compare with CPU backend for correctness) or perf (performance evaluation)\n"); - printf(" op names are as given by ggml_op_desc()\n"); -} - -int main(int argc, char ** argv) { - test_mode mode = MODE_TEST; - const char * op_name_filter = NULL; - const char * backend_filter = NULL; - - for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "test") == 0) { - mode = MODE_TEST; - } else if (strcmp(argv[i], "perf") == 0) { - mode = MODE_PERF; - } else if (strcmp(argv[i], "-o") == 0) { - if (i + 1 < argc) { - op_name_filter = argv[++i]; - } else { - usage(argv); - return 1; - } - } else if (strcmp(argv[i], "-b") == 0) { - if (i + 1 < argc) { - backend_filter = argv[++i]; - } else { - usage(argv); - return 1; - } - } else { - usage(argv); - return 1; - } - } - - // enumerate backends - printf("Testing %zu backends\n\n", ggml_backend_reg_get_count()); - - size_t n_ok = 0; - - for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) { - printf("Backend %zu/%zu (%s)\n", i + 1, ggml_backend_reg_get_count(), ggml_backend_reg_get_name(i)); - - if (backend_filter != NULL && strcmp(backend_filter, ggml_backend_reg_get_name(i)) != 0) { - printf(" Skipping\n"); - n_ok++; - continue; - } - - ggml_backend_t backend = ggml_backend_reg_init_backend(i, NULL); - GGML_ASSERT(backend != NULL); - - if (backend_filter == NULL && ggml_backend_is_cpu(backend)) { - printf(" Skipping CPU backend\n"); - ggml_backend_free(backend); - n_ok++; - continue; - } - - printf(" Backend name: %s\n", ggml_backend_name(backend)); - - bool ok = test_backend(backend, mode, op_name_filter); - - printf(" Backend %s: ", ggml_backend_name(backend)); - if (ok) { - printf("\033[1;32mOK\033[0m\n"); - n_ok++; - } else { - printf("\033[1;31mFAIL\033[0m\n"); - } - - printf("\n"); - - ggml_backend_free(backend); - } - - printf("%zu/%zu backends passed\n", n_ok, ggml_backend_reg_get_count()); - - if (n_ok != ggml_backend_reg_get_count()) { - printf("\033[1;31mFAIL\033[0m\n"); - return 1; - } - - ggml_quantize_free(); - - 
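// editor's note: skipped backends (name filtered out by -b, or the CPU reference
// backend itself) increment n_ok above, so the process exits 0 only when every
// registered backend either passed or was deliberately skipped. Typical invocations,
// matching the flag parsing at the top of main() (the backend name is illustrative):
//     test-backend-ops test               correctness of all ops vs the CPU backend
//     test-backend-ops test -o MUL_MAT    restrict to ops whose ggml_op_desc() matches
//     test-backend-ops perf -b CUDA0      timing-only run on a single backend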
printf("\033[1;32mOK\033[0m\n"); - return 0; -} diff --git a/tests/test-c.c b/tests/test-c.c deleted file mode 100644 index 95ba73df3..000000000 --- a/tests/test-c.c +++ /dev/null @@ -1,7 +0,0 @@ -#include "llama.h" - -#ifdef GGML_USE_KOMPUTE -#include "ggml-kompute.h" -#endif - -int main(void) {} diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp deleted file mode 100644 index a8222caee..000000000 --- a/tests/test-chat-template.cpp +++ /dev/null @@ -1,177 +0,0 @@ -#include -#include -#include - -#undef NDEBUG -#include - -#include "llama.h" -#include "common.h" - -int main(void) { - llama_chat_message conversation[] = { - {"system", "You are a helpful assistant"}, - {"user", "Hello"}, - {"assistant", "Hi there"}, - {"user", "Who are you"}, - {"assistant", " I am an assistant "}, - {"user", "Another question"}, - }; - size_t message_count = 6; - std::vector templates = { - // teknium/OpenHermes-2.5-Mistral-7B - "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}", - // mistralai/Mistral-7B-Instruct-v0.2 - "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", - // TheBloke/FusionNet_34Bx2_MoE-AWQ - "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <>\\\\n' + messages[idx]['content'] + '\\\\n<>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}", - // bofenghuang/vigogne-2-70b-chat - "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\\\n' + system_message + '\\\\n<>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<>\\\\n' + content.strip() + '\\\\n<>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", - // mlabonne/AlphaMonarch-7B - "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}", - // google/gemma-7b-it - "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\\n' + message['content'] | trim + '\\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\\n'}}{% endif %}", - // OrionStarAI/Orion-14B-Chat - "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}", - // openchat/openchat-3.5-0106 - // The included chat_template differs from the author's suggestions here: https://huggingface.co/openchat/openchat_3.5/discussions/5#65448109b4a3f3a2f486fd9d - // So we match against the included template but implement the suggested version. - "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}", - // deepseek-ai/deepseek-coder-33b-instruct - "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. 
For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}", - // eachadea/vicuna-13b-1.1 - // No template included in tokenizer_config.json, so this template likely needs to be manually set. - "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '' + message['content'] + '\n\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}", - // Orca-Vicuna - // No template included in tokenizer_config.json, so this template likely needs to be manually set. - "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}", - // CohereForAI/c4ai-command-r-plus - "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' 
%}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}", - // Llama-3 - "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", - //Phi-3-mini - "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", - //Phi-3-small - "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", - //Phi-3-medium - "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}", - //Phi-3-vision - "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}", - // ChatGLM3 - "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", - // ChatGLM4 - u8"[gMASK]{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", - // MiniCPM-3B-OpenHermes-2.5-v2-GGUF - u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", - // DeepSeek-V2 - "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' 
}}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}", - }; - std::vector expected_output = { - // teknium/OpenHermes-2.5-Mistral-7B - "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n", - // mistralai/Mistral-7B-Instruct-v0.2 - "[INST] You are a helpful assistant\nHello [/INST]Hi there[INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", - // TheBloke/FusionNet_34Bx2_MoE-AWQ - "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", - // bofenghuang/vigogne-2-70b-chat - "[INST] <>\nYou are a helpful assistant\n<>\n\nHello [/INST] Hi there [INST] Who are you [/INST] I am an assistant [INST] Another question [/INST]", - // mlabonne/AlphaMonarch-7B - "system\nYou are a helpful assistant\nuser\nHello\nassistant\nHi there\nuser\nWho are you\nassistant\n I am an assistant \nuser\nAnother question\nassistant\n", - // google/gemma-7b-it - "user\nYou are a helpful assistant\n\nHello\nmodel\nHi there\nuser\nWho are you\nmodel\nI am an assistant\nuser\nAnother question\nmodel\n", - // OrionStarAI/Orion-14B-Chat - "Human: You are a helpful assistant\n\nHello\n\nAssistant: Hi thereHuman: Who are you\n\nAssistant: I am an assistant Human: Another question\n\nAssistant: ", - // openchat/openchat-3.5-0106 - "You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant: I am an assistant <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:", - // deepseek-ai/deepseek-coder-33b-instruct - "You are a helpful assistant### Instruction:\nHello\n### Response:\nHi there\n<|EOT|>\n### Instruction:\nWho are you\n### Response:\n I am an assistant \n<|EOT|>\n### Instruction:\nAnother question\n### Response:\n", - // eachadea/vicuna-13b-1.1 - "You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there\nUSER: Who are you\nASSISTANT: I am an assistant \nUSER: Another question\nASSISTANT:", - // Orca-Vicuna - "SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there\nUSER: Who are you\nASSISTANT: I am an assistant \nUSER: Another question\nASSISTANT:", - // CohereForAI/c4ai-command-r-plus - "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", - // Llama 3 - "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are 
you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", - //Phi-3-mini - "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", - //Phi-3-small - "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", - //Phi-3-medium - "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", - //Phi-3-vision - "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", - // ChatGLM3 - "[gMASK]sop<|system|>\n You are a helpful assistant<|user|>\n Hello<|assistant|>\n Hi there<|user|>\n Who are you<|assistant|>\n I am an assistant <|user|>\n Another question<|assistant|>", - // ChatGLM4 - "[gMASK]<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>", - // MiniCPM-3B-OpenHermes-2.5-v2-GGUF - u8"You are a helpful assistant<用户>HelloHi there<用户>Who are youI am an assistant<用户>Another question", - // DeepSeek-V2 - u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<|end▁of▁sentence|>User: Who are you\n\nAssistant: I am an assistant <|end▁of▁sentence|>User: Another question\n\nAssistant:", - }; - std::vector formatted_chat(1024); - int32_t res; - - // test invalid chat template - res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size()); - assert(res < 0); - - for (size_t i = 0; i < templates.size(); i++) { - std::string custom_template = templates[i]; - std::string expected = expected_output[i]; - formatted_chat.resize(1024); - res = llama_chat_apply_template( - nullptr, - custom_template.c_str(), - conversation, - message_count, - true, - formatted_chat.data(), - formatted_chat.size() - ); - formatted_chat.resize(res); - std::string output(formatted_chat.data(), formatted_chat.size()); - printf("%s\n", output.c_str()); - printf("-------------------------\n"); - assert(output == expected); - } - - - // test llama_chat_format_single for system message - printf("\n\n=== llama_chat_format_single (system message) ===\n\n"); - std::vector chat2; - llama_chat_msg sys_msg{"system", "You are a helpful assistant"}; - - auto fmt_sys = [&](std::string tmpl) { - auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false); - printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str()); - printf("-------------------------\n"); - return output; - }; - assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n"); - assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n"); - assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message - assert(fmt_sys("llama3") == 
"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>"); - - - // test llama_chat_format_single for user message - printf("\n\n=== llama_chat_format_single (user message) ===\n\n"); - chat2.push_back({"system", "You are a helpful assistant"}); - chat2.push_back({"user", "Hello"}); - chat2.push_back({"assistant", "I am assistant"}); - llama_chat_msg new_msg{"user", "How are you"}; - - auto fmt_single = [&](std::string tmpl) { - auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true); - printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str()); - printf("-------------------------\n"); - return output; - }; - assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n"); - assert(fmt_single("llama2") == "[INST] How are you [/INST]"); - assert(fmt_single("gemma") == "\nuser\nHow are you\nmodel\n"); - assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); - - return 0; -} diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp deleted file mode 100644 index 6aac4737a..000000000 --- a/tests/test-double-float.cpp +++ /dev/null @@ -1,57 +0,0 @@ -// These tests may take a long time! -// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result. -// This is done by checking all finite (non-NaN, non-infinite) floats. - -#undef NDEBUG -#include -#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON) -#include -#endif -#include -#include -#include - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdouble-promotion" - -// ggml.c::quantize_row_q4_0_ref -inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; } - -// ggml.c::ggml_silu_f32 -inline static float silu_orig(float x) { - return x/(1.0 + exp(-x)); -} - -#pragma GCC diagnostic pop - -// ggml.c::quantize_row_q4_0_ref -inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; } - -// ggml.c::ggml_silu_f32 -inline static float silu_float(float x) { - return x/(1.0f + expf(-x)); -} - -int main(void) { - uint32_t x = UINT32_MAX; - do { - float f; - memcpy(&f, &x, sizeof(x)); - assert(!std::isfinite(f) || (round_orig(f) == round_float(f))); - } while (x--); - -#ifdef __F16C__ - // GELU and SILU implementations are used with a FP16 lookup table. - // The original and float-only results are not equal for all inputs after converting to FP16. - // GELU is an approximation anyway (tanh), not tested here. - // For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match. 
- for (x = 0; x <= UINT16_MAX; x++) { - float f = _cvtsh_ss(x); - const float so = silu_orig(f); - const float sf = silu_float(f); - assert( (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0)) - || (nextafterf(so, sf) == sf) - || (nextafterf(sf, so) == so)); - } -#endif -} diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp deleted file mode 100644 index a35327645..000000000 --- a/tests/test-grad0.cpp +++ /dev/null @@ -1,1566 +0,0 @@ -#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows -#include "ggml.h" - -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -#if defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wdouble-promotion" -#endif - -#define MAX_NARGS 3 - -#undef MIN -#undef MAX -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - -#define GGML_SILU_FP16 - -// -// logging -// - -#if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG(...) -#endif - -#if (GGML_DEBUG >= 5) -#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_5(...) -#endif - -#if (GGML_DEBUG >= 10) -#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_10(...) -#endif - -#define GGML_PRINT(...) printf(__VA_ARGS__) - -static float frand(void) { - return (float)rand()/(float)RAND_MAX; -} - -static int irand(int n) { - if (n == 0) return 0; - return rand()%n; -} - -static void get_random_dims(int64_t * dims, int ndims) { - dims[0] = dims[1] = dims[2] = dims[3] = 1; - - for (int i = 0; i < ndims; i++) { - dims[i] = 1 + irand(4); - } -} - -static struct ggml_tensor * get_random_tensor_f32( - struct ggml_context * ctx0, - int ndims, - int64_t ne[], - float fmin, - float fmax) { - struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); - - switch (ndims) { - case 1: - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; - } - break; - case 2: - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - break; - case 3: - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - } - break; - case 4: - for (int i3 = 0; i3 < ne[3]; i3++) { - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - } - } - break; - default: - assert(false); - } - - return result; -} - -static struct ggml_tensor * get_random_tensor_f16( - struct ggml_context * ctx0, - int ndims, - int64_t ne[], - float fmin, - float fmax) { - struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F16, ndims, ne); - - switch (ndims) { - case 1: - for (int i0 = 0; i0 < ne[0]; i0++) { - ((ggml_fp16_t *)result->data)[i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin); - } - break; - case 2: - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((ggml_fp16_t *)result->data)[i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin); - } - } - break; - case 3: - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) 
{ - ((ggml_fp16_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin); - } - } - } - break; - case 4: - for (int i3 = 0; i3 < ne[3]; i3++) { - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((ggml_fp16_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = ggml_fp32_to_fp16(frand()*(fmax - fmin) + fmin); - } - } - } - } - break; - default: - assert(false); - } - - return result; -} - -static struct ggml_tensor * get_random_tensor_i32( - struct ggml_context * ctx0, - int ndims, - int64_t ne[], - int32_t imin, - int32_t imax) { - struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_I32, ndims, ne); - - switch (ndims) { - case 1: - for (int i0 = 0; i0 < ne[0]; i0++) { - ((int32_t *)result->data)[i0] = irand(imax - imin) + imin; - } - break; - case 2: - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((int32_t *)result->data)[i1*ne[0] + i0] = irand(imax - imin) + imin; - } - } - break; - case 3: - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((int32_t *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin; - } - } - } - break; - case 4: - for (int i3 = 0; i3 < ne[3]; i3++) { - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((int32_t *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = irand(imax - imin) + imin; - } - } - } - } - break; - default: - assert(false); - } - - return result; -} - -static bool check_gradient( - const char * op_name, - struct ggml_context * ctx0, - struct ggml_tensor * x[], - struct ggml_tensor * f, - int ndims, - int nargs, - float eps, - float max_error_abs, - float max_error_rel) { - - static int n_threads = -1; - if (n_threads < 0) { - n_threads = GGML_DEFAULT_N_THREADS; - - const char *env = getenv("GGML_N_THREADS"); - if (env) { - n_threads = atoi(env); - } - - printf("GGML_N_THREADS = %d\n", n_threads); - } - - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true); - struct ggml_cgraph * gb = ggml_new_graph_custom(ctx0, GGML_DEFAULT_GRAPH_SIZE, true); - ggml_build_forward_expand(gf, f); - ggml_graph_cpy(gf, gb); - ggml_build_backward_expand(ctx0, gf, gb, false); - - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - - ggml_graph_compute_with_ctx(ctx0, gb, n_threads); - - // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot"); - // ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot"); - - for (int i = 0; i < nargs; ++i) { - const int nelements = ggml_nelements(x[i]); - for (int k = 0; k < nelements; ++k) { - // compute gradient using finite differences - const float x0 = ggml_get_f32_1d(x[i], k); - const float xm = x0 - eps; - const float xp = x0 + eps; - ggml_set_f32_1d(x[i], k, xp); - - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - - const double f0 = ggml_get_f32_1d(f, 0); - - ggml_set_f32_1d(x[i], k, xm); - - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - - const double f1 = ggml_get_f32_1d(f, 0); - const double g0 = (f0 - f1)/(2.0*(double) eps); - - ggml_set_f32_1d(x[i], k, x0); - - // compute gradient using backward graph - ggml_graph_reset (gf); - ggml_set_f32 (f->grad, 1.0f); - - ggml_graph_compute_with_ctx(ctx0, gb, n_threads); - - const double g1 = ggml_get_f32_1d(x[i]->grad, k); - - const 
double error_abs = fabs(g0 - g1); - const double error_rel = g0 != 0 ? fabs(g0 - g1)/fabs(g0) : 0; - - if (error_abs > max_error_abs || error_rel > max_error_rel) { - printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n", - op_name, ndims, i, k, x0, xm, xp, f0, f1, g0, g1, eps, error_abs, error_rel); - //assert(false); - return false; - } - } - } - - return true; -} - -// TODO: clean-up this .. -static bool check_mat_mul( - const struct ggml_tensor * y, - const struct ggml_tensor * x0, - const struct ggml_tensor * x1) { - float * dst = (float *) y->data; - float * src0 = (float *) x0->data; - float * src1 = (float *) x1->data; - - const int nc = x0->ne[1]; - const int nr = x1->ne[1]; - const int nk = x0->ne[0]; - - GGML_PRINT_DEBUG("check_mat_mul: nc=%d, nr=%d, nk=%d\n", nc, nr, nk); - - GGML_PRINT_DEBUG("x0:\n"); - for (int j = 0; j < x0->ne[1]; ++j) { - for (int i = 0; i < x0->ne[0]; ++i) { - GGML_PRINT_DEBUG("%6.3f ", src0[j*nk + i]); - } - GGML_PRINT_DEBUG("\n"); - } - GGML_PRINT_DEBUG("\n"); - - GGML_PRINT_DEBUG("x1:\n"); - for (int j = 0; j < x1->ne[1]; ++j) { - for (int i = 0; i < x1->ne[0]; ++i) { - GGML_PRINT_DEBUG("%6.3f ", src1[j*nk + i]); - } - GGML_PRINT_DEBUG("\n"); - } - GGML_PRINT_DEBUG("\n"); - - GGML_PRINT_DEBUG("y: n_dims = %d, (%lld, %lld)\n", y->n_dims, y->ne[0], y->ne[1]); - for (int j = 0; j < y->ne[1]; ++j) { - for (int i = 0; i < y->ne[0]; ++i) { - GGML_PRINT_DEBUG("%6.3f ", dst[j*nr + i]); - } - GGML_PRINT_DEBUG("\n"); - } - - for (int i = 0; i < nr; ++i) { - for (int j = 0; j < nc; ++j) { - float sum = 0.0f; - - for (int k = 0; k < nk; ++k) { - sum += src0[j*nk + k]*src1[i*nk + k]; - } - - if (fabsf(dst[i*nc + j] - sum) > 1e-5f) { - fprintf(stderr, "check_mat_mul: dst[%d] = %f, sum = %f\n", i*nc + j, dst[i*nc + j], sum); - assert(false); - return false; - } - } - } - - return true; -} - -#define NUM_PERMUTATIONS (4*3*2*1) - -int main(int argc, const char ** argv) { - struct ggml_init_params params = { - /* .mem_size = */ 256*1024*1024, - /* .mem_buffer = */ NULL, - /* .no_alloc = */ false, - }; - - int64_t ne[4]; - - int all_permutations[4 * NUM_PERMUTATIONS]; - { - int count = 0; - for (int ax0=0; ax0<4; ++ax0) { - for (int ax1=0; ax1<4; ++ax1) { - if (ax1 == ax0) continue; - for (int ax2=0; ax2<4; ++ax2) { - if (ax2 == ax0) continue; - if (ax2 == ax1) continue; - for (int ax3=0; ax3<4; ++ax3) { - if (ax3 == ax0) continue; - if (ax3 == ax1) continue; - if (ax3 == ax2) continue; - assert(count < NUM_PERMUTATIONS); - all_permutations[count*4+0] = ax0; - all_permutations[count*4+1] = ax1; - all_permutations[count*4+2] = ax2; - all_permutations[count*4+3] = ax3; - ++count; - } - } - } - } - } - - unsigned seed_iter = 1; - - // original loop: 1000 - int niter = 4; - const char *env = getenv("GGML_NLOOP"); - if (env != NULL) { - niter = atoi(env); - } - if (argc > 1) { - niter = atoi(argv[1]); - } - for (int iter = 0; iter < niter; ++iter) { - srand(seed_iter); - seed_iter = rand(); - unsigned seed = rand(); - - printf("test-grad0: iter:%d/%d\n", iter, niter); - struct ggml_context * ctx0 = ggml_init(params); - - get_random_dims(ne, 4); - - struct ggml_tensor * x[MAX_NARGS]; - - // add f32 - { - srand(seed); - const int nargs = 2; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1])); - - check_gradient("add 
f32", ctx0, x, f, ndims, nargs, 1e-3f, 2e-3f, 2e-3f); - } - } - - // add f16 - { - srand(seed); - const int nargs = 2; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_add(ctx0, x[0], x[1])); - - check_gradient("add f16", ctx0, x, f, ndims, nargs, 1e-1f, 2e-1f, 2e-1f); - } - } - - // sub - { - srand(seed); - const int nargs = 2; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_sub(ctx0, x[0], x[1])); - - check_gradient("sub", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); - } - } - - // mul - { - srand(seed); - const int nargs = 2; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_mul(ctx0, x[0], x[1])); - - check_gradient("mul", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // div - { - srand(seed); - const int nargs = 2; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, 0.5f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_div(ctx0, x[0], x[1])); - - check_gradient("div", ctx0, x, f, ndims, nargs, 1e-3f, 1e-1f, 1e-1f); - } - } - - // sqr - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 2; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, x[0])); - - check_gradient("sqr", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // sqrt - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 2; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqrt(ctx0, x[0])); - - check_gradient("sqrt", ctx0, x, f, ndims, nargs, 1e-3f, 2e-2f, 1e-1f); - } - } - - // log - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 2; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, 2.0f*1e-3f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_log(ctx0, x[0])); - - check_gradient("log", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-1f); - } - } - - // sum - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 2; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, x[0]); - - check_gradient("sum", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); - } - } - - - // sum_rows - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sum_rows(ctx0, x[0]))); - - check_gradient("sum_rows", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, 
INFINITY); - } - } - - // mean, not yet fully implemented - if(0) - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_mean(ctx0, x[0])); - - check_gradient("mean", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); - } - } - - // argmax - if (0) - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_argmax(ctx0, x[0])); - - check_gradient("argmax", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); - } - } - - // repeat - { - srand(seed); - int64_t ne2[4]; - get_random_dims(ne2, 4); - - ne2[0] = ne[0] * ne2[0]; - ne2[1] = ne[1] * ne2[1]; - ne2[2] = 1; - ne2[3] = 1; - - const int nargs = 1; - for (int ndims = 1; ndims <= 2; ++ndims) { - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); - ggml_set_param(ctx0, x[0]); - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[1], ggml_repeat(ctx0, x[0], x[1])))); - - check_gradient("repeat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); - } - } - - // repeat back - { - srand(seed); - int64_t ne2[4]; - get_random_dims(ne2, 4); - - ne2[0] = ne[0] * ne2[0]; - ne2[1] = ne[1] * ne2[1]; - ne2[2] = 1; - ne2[3] = 1; - - const int nargs = 1; - for (int ndims = 1; ndims <= 2; ++ndims) { - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); - ggml_set_param(ctx0, x[0]); - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_sqr(ctx0, ggml_sub(ctx0, x[0], ggml_repeat_back(ctx0, x[1], x[0])))); - - check_gradient("repeat back", ctx0, x, f, ndims, nargs, 1e-3f, 1e-2f, INFINITY); - } - } - - // abs (finite differences do not work) - //{ - // const int nargs = 1; - - // for (int ndims = 1; ndims <= 2; ++ndims) { - // for (int i = 0; i < nargs; ++i) { - // x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - // ggml_set_param(ctx0, x[i]); - // } - - // struct ggml_tensor * f = ggml_sum(ctx0, ggml_abs(ctx0, x[0])); - - // check_gradient("abs", ctx0, x, f, ndims, nargs, 1e-3f, INFINITY, 1e-3f); - // } - //} - - // sgn - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor* f = ggml_sum(ctx0, ggml_sgn(ctx0, x[0])); - - check_gradient("sgn", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); - } - } - - // neg - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor* f = ggml_sum(ctx0, ggml_neg(ctx0, x[0])); - - check_gradient("neg", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); - } - } - - // step - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor* f = ggml_sum(ctx0, ggml_step(ctx0, x[0])); - - check_gradient("step", 
ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); - } - } - - // tanh, not yet fully implemented - if(0) - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor* f = ggml_sum(ctx0, ggml_tanh(ctx0, x[0])); - - check_gradient("tanh", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); - } - } - - // mul_mat - { - srand(seed); - const int nargs = 2; - - for (int ndims = 2; ndims <= 4; ++ndims) { - int max_nrep = (ndims >= 3) ? 2 : 1; - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) { - for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) { - { - int64_t ne2[4]; - get_random_dims(ne2, 4); - ne2[0] = ne[0]; - ne2[2] = nrep2 * ne[2]; - ne2[3] = nrep3 * ne[3]; - x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); - } - - ggml_set_param(ctx0, x[0]); - ggml_set_param(ctx0, x[1]); - - struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]); - struct ggml_tensor * f = ggml_sum(ctx0, m); - - GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims); - - check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - if (ndims == 2) { - // check_mat_mul does not support ndims > 2 - check_mat_mul(m, x[1], x[0]); - } - } - } - } - } - - // elu, not yet fully implemented - if(0) - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor* f = ggml_sum(ctx0, ggml_elu(ctx0, x[0])); - - check_gradient("elu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); - } - } - - // relu - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor* f = ggml_sum(ctx0, ggml_relu(ctx0, x[0])); - - check_gradient("relu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // gelu, not yet fully implemented - if(0) - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 4; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor* f = ggml_sum(ctx0, ggml_gelu(ctx0, x[0])); - - check_gradient("gelu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, 1e-3f); - } - } - - // silu - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 2; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_silu(ctx0, x[0])); - -#ifdef GGML_SILU_FP16 - // due to GGML_SILU_FP16 the finite difference method will be slightly wrong -> increase error bounds. 
- check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 0.5, INFINITY); -#else - check_gradient("silu", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); -#endif - } - } - - // rms_norm - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 2; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_rms_norm(ctx0, x[0], 1e-6f)); - - check_gradient("rms_norm", ctx0, x, f, ndims, nargs, 1e-4f, 1.0f, INFINITY); - } - } - - // scale - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 2; ++ndims) { - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - - const float s = -1.0f + 2.0f*frand(); - - ggml_set_param(ctx0, x[0]); - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_scale(ctx0, x[0], s)); - - check_gradient("scale", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // cpy f32 - { - srand(seed); - const int nargs = 2; - - for (int ndims = 1; ndims <= 2; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - // x[1] is overwritten by x[0], so the gradients don't propagate to x[1] - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1])); - - check_gradient("cpy f32", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // cpy f16 - { - srand(seed); - const int nargs = 2; - - for (int ndims = 1; ndims <= 2; ++ndims) { - for (int i = 0; i < nargs; ++i) { - x[i] = get_random_tensor_f16(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[i]); - } - // x[1] is overwritten by x[0], so the gradients don't propagate to x[1] - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_cpy(ctx0, x[0], x[1])); - - check_gradient("cpy f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY); - } - } - - // reshape (1d->nd) - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 2; ++ndims) { - int64_t ne2[4]; - ne2[0] = 1; - ne2[1] = 1; - ne2[2] = 1; - ne2[3] = 1; - for (int i = 0; i < ndims; ++i) { - ne2[0] *= ne[i]; - } - x[0] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); - x[1] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[0]); - - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1])); - check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // reshape (nd->1d) - { - srand(seed); - const int nargs = 1; - - for (int ndims = 1; ndims <= 2; ++ndims) { - int64_t ne2[4]; - ne2[0] = 1; - ne2[1] = 1; - ne2[2] = 1; - ne2[3] = 1; - for (int i = 0; i < ndims; ++i) { - ne2[0] *= ne[i]; - } - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); - ggml_set_param(ctx0, x[0]); - - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_reshape(ctx0, x[0], x[1])); - check_gradient("reshape", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // acc 1d - { - srand(seed); - int64_t ne2[4] = { 1, 1, 1, 1 }; - - const int nargs = 2; - for (int ndims = 1; ndims <= 4; ++ndims) { - - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[0]); - - get_random_dims(ne2, 1); - while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) { - get_random_dims(ne2, 1); - } - - x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); - ggml_set_param(ctx0, x[1]); - - const int max_offset = MAX(0, ggml_nelements(x[0]) - 
ggml_nelements(x[1])); - const int offset = irand(max_offset) * ggml_element_size(x[0]); - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); - - check_gradient("acc 1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // acc 2d - { - srand(seed); - int64_t ne2[4] = { 1, 1, 1, 1 }; - int64_t max_offsets[4] = { 0, 0, 0, 0 }; - int64_t offsets[4] = { 0, 0, 0, 0 }; - - const int nargs = 2; - for (int ndims = 2; ndims <= 4; ++ndims) { - - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[0]); - - get_random_dims(ne2, 2); - while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) { - get_random_dims(ne2, 2); - } - - x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f); - ggml_set_param(ctx0, x[1]); - - max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); - max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); - offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; - offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; - const int offset = offsets[0] + offsets[1]; - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); - - check_gradient("acc 2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // acc 3d - { - srand(seed); - int64_t ne2[4] = { 1, 1, 1, 1 }; - int64_t max_offsets[4] = { 0, 0, 0, 0 }; - int64_t offsets[4] = { 0, 0, 0, 0 }; - - const int nargs = 2; - for (int ndims = 3; ndims <= 4; ++ndims) { - - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[0]); - - get_random_dims(ne2, 3); - while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0]))) { - get_random_dims(ne2, 3); - } - - x[1] = get_random_tensor_f32(ctx0, 3, ne2, -1.0f, 1.0f); - ggml_set_param(ctx0, x[1]); - - max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); - max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); - max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]); - offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; - offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; - offsets[2] = irand(max_offsets[2]) * x[0]->nb[2]; - const int offset = offsets[0] + offsets[1] + offsets[2]; - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); - - check_gradient("acc 3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // acc 4d - { - srand(seed); - int64_t ne2[4] = { 1, 1, 1, 1 }; - int64_t max_offsets[4] = { 0, 0, 0, 0 }; - int64_t offsets[4] = { 0, 0, 0, 0 }; - - const int nargs = 2; - for (int ndims = 4; ndims <= 4; ++ndims) { - - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[0]); - - get_random_dims(ne2, 4); - while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[2] > ne[2]) || (ne2[3] > ne[3]) || (ne2[0]*ne2[1]*ne2[2]*ne2[3] > ggml_nelements(x[0]))) { - get_random_dims(ne2, 4); - } - - x[1] = get_random_tensor_f32(ctx0, 4, ne2, -1.0f, 1.0f); - ggml_set_param(ctx0, x[1]); - - max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); - max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); - max_offsets[2] = MAX(0, x[0]->ne[2] - x[1]->ne[2]); - max_offsets[3] = MAX(0, x[0]->ne[3] - x[1]->ne[3]); - offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; - offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; - offsets[2] = irand(max_offsets[2]) * x[0]->nb[2]; - offsets[3] = irand(max_offsets[3]) * x[0]->nb[3]; - const int offset = 
offsets[0] + offsets[1] + offsets[2] + offsets[3]; - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_acc(ctx0, x[0], x[1], x[0]->nb[1], x[0]->nb[2], x[0]->nb[3], offset)); - - check_gradient("acc 4d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // set_1d - { - srand(seed); - int64_t ne2[4]; - - const int nargs = 2; - for (int ndims = 1; ndims <= 4; ++ndims) { - - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[0]); - - get_random_dims(ne2, 1); - while ((ne2[0] > ne[0]) || (ne2[0] > ggml_nelements(x[0]))) { - get_random_dims(ne2, 1); - } - - x[1] = get_random_tensor_f32(ctx0, 1, ne2, -1.0f, 1.0f); - ggml_set_param(ctx0, x[1]); - - const int max_offset = MAX(0, ggml_nelements(x[0]) - ggml_nelements(x[1])); - const int offset = irand(max_offset) * ggml_element_size(x[0]); - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_1d(ctx0, x[0], x[1], offset)); - - check_gradient("set_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // set_2d - { - srand(seed); - int64_t ne2[4]; - int64_t max_offsets[4] = { 0, 0, 0, 0 }; - int64_t offsets[4] = { 0, 0, 0, 0 }; - - const int nargs = 1; - for (int ndims = 2; ndims <= 4; ++ndims) { - - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - ggml_set_param(ctx0, x[0]); - - get_random_dims(ne2, 2); - while ((ne2[0] > ne[0]) || (ne2[1] > ne[1]) || (ne2[0]*ne2[1] > ggml_nelements(x[0]))) { - get_random_dims(ne2, 2); - } - - x[1] = get_random_tensor_f32(ctx0, 2, ne2, -1.0f, 1.0f); - ggml_set_param(ctx0, x[1]); - - max_offsets[0] = MAX(0, x[0]->ne[0] - x[1]->ne[0]); - max_offsets[1] = MAX(0, x[0]->ne[1] - x[1]->ne[1]); - offsets[0] = irand(max_offsets[0]) * x[0]->nb[0]; - offsets[1] = irand(max_offsets[1]) * x[0]->nb[1]; - const int offset = offsets[0] + offsets[1]; - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_set_2d(ctx0, x[0], x[1], x[1]->nb[1], offset)); - - check_gradient("set_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // view_1d - { - srand(seed); - const int nargs = 1; - for (int ndims = 1; ndims <= 4; ++ndims) { - - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - - ggml_set_param(ctx0, x[0]); - - const int k0 = irand(ggml_nelements(x[0])); - const int k1 = irand(ggml_nelements(x[0])); - const int i0 = MIN(k0, k1); - const int i1 = MAX(k0, k1); - - const int offset = i0 * sizeof(float); - const int nelem = i1 - i0; - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_1d(ctx0, x[0], nelem, offset)); - - check_gradient("view_1d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // view_2d - { - srand(seed); - int64_t ne2[4]; - int64_t nb2[4]; - - const int nargs = 1; - for (int ndims = 1; ndims <= 4; ++ndims) { - - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - - get_random_dims(ne2, 2); - while (ne2[0]*ne2[1] > ggml_nelements(x[0])) { - get_random_dims(ne2, 2); - } - const int count = ne2[0]*ne2[1]; - - nb2[0] = sizeof(float); - nb2[1] = nb2[0]*ne2[0]; - - ggml_set_param(ctx0, x[0]); - - const int max_offset = ggml_nelements(x[0]) - count; - const int offset = irand(max_offset+1) * sizeof(float); - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_2d(ctx0, x[0], ne2[0], ne2[1], nb2[1], offset)); - - check_gradient("view_2d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // view_3d - { - srand(seed); - int64_t ne2[4] = {1,1,1,1}; - int64_t nb2[4] = {0,0,0,0}; - - const int nargs = 1; - for (int ndims = 1; ndims <= 4; ++ndims) { - - x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 
1.0f); - - get_random_dims(ne2, 3); - while (ne2[0]*ne2[1]*ne2[2] > ggml_nelements(x[0])) { - get_random_dims(ne2, 3); - } - const int count = ne2[0]*ne2[1]*ne2[2]; - - nb2[0] = sizeof(float); - nb2[1] = nb2[0]*ne2[0]; - nb2[2] = nb2[1]*ne2[1]; - - ggml_set_param(ctx0, x[0]); - - const int max_offset = ggml_nelements(x[0]) - count; - const int offset = irand(max_offset+1) * sizeof(float); - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_view_3d(ctx0, x[0], ne2[0], ne2[1], ne2[2], nb2[1], nb2[2], offset)); - - check_gradient("view_3d", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY); - } - } - - // permute - { - srand(seed); - int64_t ne2[4]; - - const int nargs = 1; - for (int ndims = 1; ndims <= 4; ++ndims) - { - // ggml_permute will set axes of dimensions below n_dims to 1. - // to make ggml_permute work correctly on all axes, - // the input tensor needs maximal n_dim of 4. - for (int i=0; i finite differences should not work - // instead use sum(log(soft_max()*(1-eps)+eps)); use eps to avoid log(0) - struct ggml_tensor * f = ggml_sum(ctx0, - ggml_log(ctx0, - ggml_add1(ctx0, - ggml_scale(ctx0, - ggml_soft_max(ctx0, x[0]), - 1.0f - eps), - ggml_new_f32(ctx0, eps)))); - - check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY); - // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf. - // this may result in different gradients too finite differences. - // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause. - // if only the table lookup causes gradients to differ this is acceptable. - } - } - - // cross_entropy_loss - { - srand(seed); - const int nargs = 1; - - int64_t ne2[4]; - get_random_dims(ne2, 4); - - for (int ndims = 1; ndims <= 4; ++ndims) { - x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -0.1f, 0.1f); - x[1] = get_random_tensor_f32(ctx0, ndims, ne2, 0.0f, 1.0f); - // the second argument to cross_entropy_loss must sum up to 1 for each row - int nr = ggml_nrows(x[1]); - int nc = ggml_nelements(x[1]) / nr; - for (int ir = 0; ir < nr; ++ir) { - float sum = 0; - for (int ic = 0; ic < nc; ++ic) { - sum += ((float *) x[1]->data)[ic + ir*nc]; - } - for (int ic = 0; ic < nc; ++ic) { - ((float *) x[1]->data)[ic + ir*nc] /= sum; - } - } - ggml_set_param(ctx0, x[0]); - - struct ggml_tensor * f = ggml_cross_entropy_loss(ctx0, x[0], x[1]); - - check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-4f, 1e-3f, INFINITY); - } - } - - // rope f32 - { - srand(seed); - const int nargs = 1; - - int64_t ne2[4]; - get_random_dims(ne2, 4); - ne2[0] += ne2[0] % 2; - int n_rot = ne2[0]; - - for (int ndims = 3; ndims <= 4; ++ndims) { - for (int mode = 0; mode < 4; ++mode) { - for (int n_past = 1; n_past < ne2[2]; ++n_past) { - x[0] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f); - - struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]); - for (int i = 0; i < ne2[2]; ++i) { - ((int32_t *) p->data)[i] = n_past + i; - } - - ggml_set_param(ctx0, x[0]); - - const bool skip_past = (mode & 1); - if (skip_past) { - // we have no past, so this would have to work on uninitialized memory. - // we only test the gradients here; - // skip_past should have no influence on gradient computation. - // so when other modes work, we assume that this does as well. 
- continue; - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode)); - - GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); - check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY); - } - } - } - } - - // rope f16 - { - srand(seed); - const int nargs = 1; - - int64_t ne2[4]; - get_random_dims(ne2, 4); - ne2[0] += ne2[0] % 2; - int n_rot = ne2[0]; - - for (int ndims = 3; ndims <= 4; ++ndims) { - for (int mode = 0; mode < 4; ++mode) { - for (int n_past = 1; n_past < ne2[2]; ++n_past) { - x[0] = get_random_tensor_f16(ctx0, ndims, ne2, -1.0f, 1.0f); - - struct ggml_tensor * p = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne2[2]); - for (int i = 0; i < ne2[2]; ++i) { - ((int32_t *) p->data)[i] = n_past + i; - } - - ggml_set_param(ctx0, x[0]); - - const bool skip_past = (mode & 1); - if (skip_past) { - // we have no past, so this would have to work on uninitialized memory. - // we only test the gradients here; - // skip_past should have no influence on gradient computation. - // so when other modes work, we assume that this does as well. - continue; - } - - struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode)); - - GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); - check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY); - } - } - } - } - - // flash_attn f32 - // TODO: adapt to ggml_flash_attn_ext() changes - //{ - // srand(seed); - // const int nargs = 3; - - // int64_t ne2[4]; - - // get_random_dims(ne2, 4); - // int64_t D = ne2[0]; - // int64_t N = ne2[1]; - // int64_t M = ne2[2] + N; - // int64_t B = ne2[3]; - - // for (int masked = 0; masked <= 1; ++masked) { - // for (int ndims = 2; ndims <= 4; ++ndims) { - // int max_nrep = (ndims >= 3) ? 
2 : 1; - // for (int nrep = 1; nrep < max_nrep; ++nrep) { - // int64_t neq[4] = { D, N, B*nrep, ne[3] }; - // int64_t nek[4] = { D, M, B, ne[3] }; - // int64_t nev[4] = { M, D, B, ne[3] }; - // if (ndims == 2) { - // neq[2] = 1; neq[3] = 1; - // nek[2] = 1; nek[3] = 1; - // nev[2] = 1; nev[3] = 1; - // } else if (ndims == 3) { - // neq[3] = 1; - // nek[3] = 1; - // nev[3] = 1; - // } - // x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f); - // x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f); - // x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f); - // ggml_set_param(ctx0, x[0]); - // ggml_set_param(ctx0, x[1]); - // ggml_set_param(ctx0, x[2]); - - // struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0))); - - // check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY); - // } - // } - // } - //} - - ggml_free(ctx0); - } - - return 0; -} diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp deleted file mode 100644 index 68f971bfe..000000000 --- a/tests/test-grammar-integration.cpp +++ /dev/null @@ -1,1325 +0,0 @@ -#ifdef NDEBUG -#undef NDEBUG -#endif - -#define LLAMA_API_INTERNAL - -#include "ggml.h" -#include "llama.h" -#include "grammar-parser.h" -#include "json-schema-to-grammar.h" -#include "unicode.h" -#include <cassert> -#include <string> -#include <vector> - -using json = nlohmann::ordered_json; - -static llama_grammar* build_grammar(const std::string & grammar_str) { - auto parsed_grammar = grammar_parser::parse(grammar_str.c_str()); - - // Ensure we parsed correctly - assert(!parsed_grammar.rules.empty()); - - // Ensure we have a root node - assert(!(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end())); - - std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules()); - llama_grammar* grammar = llama_grammar_init( - grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - - return grammar; -} - -static bool test_build_grammar_fails(const std::string & grammar_str) { - fprintf(stderr, "⚫ Testing failure for grammar: %s\n", grammar_str.c_str()); - bool grammar_fails = false; - llama_grammar * grammar = build_grammar(grammar_str); - if (grammar != nullptr) { - fprintf(stderr, " ❌ Expected build failure, but succeeded\n"); - } else { - grammar_fails = true; - fprintf(stdout, " ✅︎\n"); - } - return grammar_fails; -} - -static bool match_string(const std::string & input, llama_grammar * grammar) { - auto decoded = decode_utf8(input, {}); - - const auto & code_points = decoded.first; - - const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); - llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar); - - for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { - const llama_grammar_stacks prev_stacks = llama_grammar_get_stacks(grammar); // copy - - llama_grammar_accept(rules, prev_stacks, *it, cur_stacks); - - if (cur_stacks.empty()) { - // no stacks means that the grammar failed to match at this point - return false; - } - } - - for (const auto & stack : cur_stacks) { - if (stack.empty()) { - // An empty stack means that the grammar has been completed - return true; - } - } - - return false; -} - -static void test(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) { - fprintf(stderr, "⚫ Testing %s\n%s\n", test_desc.c_str(), grammar_str.c_str()); - fflush(stderr); - - auto grammar =
build_grammar(grammar_str); - - // Save the original grammar stacks so that we can reset after every new string we want to test - const llama_grammar_stacks original_stacks = llama_grammar_get_stacks(grammar); - - llama_grammar_stacks & cur_stacks = llama_grammar_get_stacks(grammar); - - fprintf(stderr, " 🔵 Valid strings:\n"); - - // Passing strings - for (const auto & test_string : passing_strings) { - fprintf(stderr, " \"%s\" ", test_string.c_str()); - fflush(stderr); - - bool matched = match_string(test_string, grammar); - - if (!matched) { - fprintf(stderr, "❌ (failed to match)\n"); - - // DEBUG: Write strings to files so that we can analyze more easily with gbnf-validator program to see exactly where things failed. - // DEBUG: Write the grammar_str to test-grammar-integration.grammar.gbnf - FILE* grammar_file = fopen("test-grammar-integration.grammar.gbnf", "w"); - if (grammar_file) { - fprintf(grammar_file, "%s", grammar_str.c_str()); - fclose(grammar_file); - } - - // DEBUG: Write the test string to test-grammar-integration.string.txt - FILE* string_file = fopen("test-grammar-integration.string.txt", "w"); - if (string_file) { - fprintf(string_file, "%s", test_string.c_str()); - fclose(string_file); - } - - fprintf(stderr, "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf test-grammar-integration.string.txt\n\n"); - } else { - fprintf(stdout, "✅︎\n"); - } - - assert(matched); - - // Reset the grammar stacks - cur_stacks = original_stacks; - } - - fprintf(stderr, " 🟠 Invalid strings:\n"); - - // Failing strings - for (const auto & test_string : failing_strings) { - fprintf(stderr, " \"%s\" ", test_string.c_str()); - fflush(stderr); - - bool matched = match_string(test_string, grammar); - - if (matched) { - fprintf(stderr, "❌ (incorrectly matched)\n"); - } else { - fprintf(stdout, "✅︎\n"); - } - assert(!matched); - - // Reset the grammar stacks - cur_stacks = original_stacks; - } - - // Clean up allocated memory - llama_grammar_free(grammar); -} -static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) { - test(test_desc + ". Grammar: " + grammar_str, grammar_str, passing_strings, failing_strings); -} -static void test_schema(const std::string & test_desc, const std::string & schema_str, const std::vector<std::string> & passing_strings, const std::vector<std::string> & failing_strings) { - test(test_desc + ". 
Schema: " + schema_str, json_schema_to_grammar(json::parse(schema_str)), passing_strings, failing_strings); -} - -static void test_simple_grammar() { - test_schema( - "min 0", - R"""({ - "type": "integer", - "minimum": 0 - })""", - // Passing strings - { - "0", - "10", - "12", - "10000", - }, - // Failing strings - { - "-1", - "-10", - "-10000", - "-100000000000000000000000000000000", - "100000000000000000000000000000000", - "00", - "01", - "-0", - } - ); - test_schema( - "min 2", - // Schema - R"""({ - "type": "integer", - "minimum": 2 - })""", - // Passing strings - { - "2", - "3", - "4", - "10", - "20", - "1234567890000000", - }, - // Failing strings - { - "0", - "1", - "-1", - "-100", - "0", - "1", - "01", - "02", - "12345678900000000", - } - ); - test_schema( - "min 456", - R"""({ - "type": "integer", - "minimum": 456 - })""", - // Passing strings - { - "456", - "4560", - "457", - "460", - "500", - }, - // Failing strings - { - "455", - "356", - "50", - "050", - "-1", - "-456", - } - ); - test_schema( - "min -123", - R"""({ - "type": "integer", - "minimum": -123 - })""", - // Passing strings - { - "-123", - "-122", - "-11", - "-1", - "0", - "1", - "123", - "1234", - "2345", - }, - // Failing strings - { - "-1234", - "-124", - } - ); - - test_schema( - "max 9999", - // Schema - R"""({ - "type": "integer", - "maximum": 9999 - })""", - // Passing strings - { - "-99999", - "0", - "9999", - }, - // Failing strings - { - "10000", - "99991", - } - ); - test_schema( - "max -9999", - // Schema - R"""({ - "type": "integer", - "maximum": -9999 - })""", - // Passing strings - { - "-10000", - "-9999", - }, - // Failing strings - { - "-9998", - "0", - "9999", - } - ); - test_schema( - "min 5 max 30", - // Schema - R"""({ - "type": "integer", - "minimum": 5, - "maximum": 30 - })""", - // Passing strings - { - "5", - "10", - "30", - }, - // Failing strings - { - "05", - "4", - "-1", - "31", - "123", - "0123", - } - ); - test_schema( - "min -1 max 1", - R"""({ - "type": "integer", - "minimum": -1, - "maximum": 1 - })""", - // Passing strings - { - "-1", - "0", - "1", - }, - // Failing strings - { - "-11", - "-10", - "-2", - "2", - "10", - "11", - } - ); - test_schema( - "min -123 max 42", - R"""({ - "type": "integer", - "minimum": -123, - "maximum": 42 - })""", - // Passing strings - { - "-123", - "-122", - "-13", - "-11", - "-2", - "-1", - "0", - "1", - "5", - "10", - "39", - "40", - "42", - }, - // Failing strings - { - "-0123", - "-124", - "-1123", - "-200", - "43", - "123", - "0123", - } - ); - test_schema( - "exclusive min / max", - // Schema - R"""({ - "type": "integer", - "exclusiveMinimum": 0, - "exclusiveMaximum": 10000 - })""", - // Passing strings - { - "1", - "9999", - }, - // Failing strings - { - "0", - "01", - "10000", - "99999", - } - ); - - // Test case for a simple grammar - test_grammar( - "simple grammar", - R"""( - root ::= expr - expr ::= term ("+" term)* - term ::= number - number ::= [0-9]+)""", - // Passing strings - { - "42", - "1+2+3+4+5", - "123+456", - }, - // Failing strings - { - "+", - "/ 3", - "1+2+3+4+5+", - "12a45", - } - ); -} - -static void test_complex_grammar() { - // Test case for a more complex grammar, with both failure strings and success strings - test_grammar( - "medium complexity grammar", - // Grammar - R"""( - root ::= expression - expression ::= term ws (("+"|"-") ws term)* - term ::= factor ws (("*"|"/") ws factor)* - factor ::= number | variable | "(" expression ")" | function-call - number ::= [0-9]+ - variable ::= [a-zA-Z_][a-zA-Z0-9_]* - 
function-call ::= variable ws "(" (expression ("," ws expression)*)? ")" - ws ::= [ \t\n\r]?)""", - // Passing strings - { - "42", - "1*2*3*4*5", - "x", - "x+10", - "x1+y2", - "(a+b)*(c-d)", - "func()", - "func(x,y+2)", - "a*(b+c)-d/e", - "f(g(x),h(y,z))", - "x + 10", - "x1 + y2", - "(a + b) * (c - d)", - "func()", - "func(x, y + 2)", - "a * (b + c) - d / e", - "f(g(x), h(y, z))", - "123+456", - "123*456*789-123/456+789*123", - "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456" - }, - // Failing strings - { - "+", - "/ 3x", - "x + + y", - "a * / b", - "func(,)", - "func(x y)", - "(a + b", - "x + y)", - "a + b * (c - d", - "42 +", - "x +", - "x + 10 +", - "(a + b) * (c - d", - "func(", - "func(x, y + 2", - "a * (b + c) - d /", - "f(g(x), h(y, z)", - "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/", - } - ); -} - -static void test_special_chars() { - // A collection of tests to exercise special characters such as "." - test_grammar( - "special characters", - // Grammar - R"""( - root ::= ... "abc" ... - )""", - // Passing strings - { - "abcabcabc", - "aaaabcccc", - // NOTE: Also ensures that multi-byte characters still count as a single character - "🔵🟠✅abc❌🟠🔵" - }, - // Failing strings - { - "aaabcccc", - "aaaaabcccc", - "aaaabccc", - "aaaabccccc", - "🔵🟠✅❌abc❌✅🟠🔵" - "🔵🟠abc🟠🔵" - } - ); -} - -static void test_quantifiers() { - // A collection of tests to exercise * + and ? quantifiers - - test_grammar( - "* quantifier", - // Grammar - R"""(root ::= "a"*)""", - // Passing strings - { - "", - "a", - "aaaaa", - "aaaaaaaaaaaaaaaaaa", - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - }, - // Failing strings - { - "b", - "ab", - "aab", - "ba", - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab" - } - ); - test_grammar( - "+ quantifier", - // Grammar - R"""(root ::= "a"+)""", - // Passing strings - { - "a", - "aaaaa", - "aaaaaaaaaaaaaaaaaa", - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - }, - // Failing strings - { - "", - "b", - "ab", - "aab", - "ba", - "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab" - } - ); - test_grammar( - "? quantifier", - // Grammar - R"""(root ::= "a"?)""", - // Passing strings - { - "", - "a" - }, - // Failing strings - { - "b", - "ab", - "aa", - "ba", - } - ); - test_grammar( - "mixed quantifiers", - // Grammar - R"""( - root ::= cons+ vowel* cons? 
(vowel cons)* - vowel ::= [aeiouy] - cons ::= [bcdfghjklmnpqrstvwxyz] - )""", - // Passing strings - { - "yes", - "no", - "noyes", - "crwth", - "four", - "bryyyy", - }, - // Failing strings - { - "yess", - "yesno", - "forty", - "catyyy", - } - ); - test_grammar( - "simple exact repetition", - // Grammar - R"""( - root ::= [ab]{4} - )""", - // Passing strings - { - "aaaa", - "bbbb", - "abab", - }, - // Failing strings - { - "a", - "b", - "aaaaa", - } - ); - test_grammar( - "simple min repetition", - // Grammar - R"""( - root ::= [ab]{4,} - )""", - // Passing strings - { - "aaaa", - "aaaaab", - "bbbb", - "ababab", - }, - // Failing strings - { - "", - "aba", - } - ); - test_grammar( - "simple max repetition", - // Grammar - R"""( - root ::= [ab]{0,4} - )""", - // Passing strings - { - "", - "a", - "aa", - "aaa", - "aaab", - }, - // Failing strings - { - "aaaaa", - } - ); - test_grammar( - "min / max repetition", - // Grammar - R"""( - root ::= ("0x" [A-F0-9]{2} " "?){3,5} - )""", - // Passing strings - { - "0xFF 0x12 0xAB", - "0xFF 0x12 0xAB 0x00 0x00", - }, - // Failing strings - { - "", - "0xFF", - "0xFF 0x12", - "0xFF 0x12 0xAB 0x00 0x00 0x00", - } - ); -} - -static void test_failure_missing_root() { - fprintf(stderr, "⚫ Testing missing root node:\n"); - // Test case for a grammar that is missing a root rule - const std::string grammar_str = R"""( - rot ::= expr - expr ::= term ("+" term)* - term ::= number - number ::= [0-9]+)"""; - - grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str()); - - // Ensure we parsed correctly - assert(!parsed_grammar.rules.empty()); - - // Ensure we do NOT have a root node - assert(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()); - fprintf(stderr, " ✅︎ Passed\n"); -} - -static void test_failure_missing_reference() { - fprintf(stderr, "⚫ Testing missing reference node:\n"); - - // Test case for a grammar that is missing a referenced rule - const std::string grammar_str = - R"""(root ::= expr - expr ::= term ("+" term)* - term ::= numero - number ::= [0-9]+)"""; - - fprintf(stderr, " Expected error: "); - - grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str()); - - // Ensure we did NOT parsed correctly - assert(parsed_grammar.rules.empty()); - - fprintf(stderr, " End of expected error.\n"); - fprintf(stderr, " ✅︎ Passed\n"); -} - -static void test_failure_left_recursion() { - fprintf(stderr, "⚫ Testing left recursion detection:\n"); - - // Test simple left recursion detection - const std::string simple_str = R"""(root ::= "a" | root "a")"""; - assert(test_build_grammar_fails(simple_str)); - - // Test more complicated left recursion detection - const std::string medium_str = R"""( - root ::= asdf - asdf ::= "a" | asdf "a" - )"""; - assert(test_build_grammar_fails(medium_str)); - - // Test even more complicated left recursion detection - const std::string hard_str = R"""( - root ::= asdf - asdf ::= "a" | foo "b" - foo ::= "c" | asdf "d" | "e")"""; - assert(test_build_grammar_fails(hard_str)); - - // Test yet even more complicated left recursion detection - const std::string hardest_str = R"""( - root ::= asdf - asdf ::= "a" | foo "b" - foo ::= "c" | empty asdf "d" | "e" - empty ::= "blah" | )"""; - assert(test_build_grammar_fails(hardest_str)); - - fprintf(stderr, " ✅︎ Passed\n"); -} - -static void test_json_schema() { - // Note that this is similar to the regular grammar tests, - // but we convert each json schema to a grammar before parsing. 
- // Otherwise, this test structure is the same. - - test_schema( - "empty schema (object)", - // Schema - R"""( - {} - )""", - // Passing strings - { - R"""({})""", - R"""({"foo": "bar"})""", - }, - // Failing strings - { - "", - "[]", - "null", - R"""("")""", - "true", - } - ); - - test_schema( - "exotic formats (list)", - // Schema - R"""({ - "items": [ - { "format": "date" }, - { "format": "uuid" }, - { "format": "time" }, - { "format": "date-time" } - ] - })""", - // Passing strings - { - // "{}", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? - // "[]", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? - R"""(["2012-04-23", "12345678-1234-1234-1234-1234567890ab", "18:25:43.511Z", "2012-04-23T18:25:43.511Z"])""", - //R"""(["2012-04-23","12345678-1234-1234-1234-1234567890ab"])""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? - //R"""({"foo": "bar"})""", // NOTE: This string passes for this schema on https://www.jsonschemavalidator.net/ -- should it? - }, - // Failing strings - { - R"""(["foo", "bar"])""", - R"""(["12345678-1234-1234-1234-1234567890ab"])""", - } - ); - - test_schema( - "string", - // Schema - R"""({ - "type": "string" - })""", - // Passing strings - { - R"""("foo")""", - R"""("bar")""", - R"""("")""", - }, - // Failing strings - { - R"""({})""", - R"""("foo": "bar")""", - } - ); - - test_schema( - "string w/ min length 1", - // Schema - R"""({ - "type": "string", - "minLength": 1 - })""", - // Passing strings - { - R"""("foo")""", - R"""("bar")""", - }, - // Failing strings - { - R"""("")""", - R"""({})""", - R"""("foo": "bar")""", - } - ); - - test_schema( - "string w/ min length 3", - // Schema - R"""({ - "type": "string", - "minLength": 3 - })""", - // Passing strings - { - R"""("foo")""", - R"""("bar")""", - R"""("foobar")""", - }, - // Failing strings - { - R"""("")""", - R"""("f")""", - R"""("fo")""", - } - ); - - test_schema( - "string w/ max length", - // Schema - R"""({ - "type": "string", - "maxLength": 3 - })""", - // Passing strings - { - R"""("foo")""", - R"""("bar")""", - R"""("")""", - R"""("f")""", - R"""("fo")""", - }, - // Failing strings - { - R"""("foobar")""", - } - ); - - test_schema( - "string w/ min & max length", - // Schema - R"""({ - "type": "string", - "minLength": 1, - "maxLength": 4 - })""", - // Passing strings - { - R"""("foo")""", - R"""("bar")""", - R"""("f")""", - R"""("barf")""", - }, - // Failing strings - { - R"""("")""", - R"""("barfo")""", - R"""("foobar")""", - } - ); - - test_schema( - "boolean", - // Schema - R"""({ - "type": "boolean" - })""", - // Passing strings - { - "true", - "false", - }, - // Failing strings - { - R"""("")""", - R"""("true")""", - R"""(True)""", - R"""(FALSE)""", - } - ); - - test_schema( - "integer", - // Schema - R"""({ - "type": "integer" - })""", - // Passing strings - { - R"""(0)""", - R"""(12345)""", - R"""(1234567890123456)""", - }, - // Failing strings - { - R"""()""", - R"""(01)""", - R"""(007)""", - R"""(12345678901234567 )""", - } - ); - - test_schema( - "string const", - // Schema - R"""({ - "const": "foo" - })""", - // Passing strings - { - R"""("foo")""", - }, - // Failing strings - { - R"""(foo)""", - R"""("bar")""", - } - ); - - test_schema( - "non-string const", - // Schema - R"""({ - "const": true - })""", - // Passing strings - { - R"""(true)""", - }, - // Failing strings - { - R"""()""", - R"""(foo)""", - R"""("true")""", - } - ); - - 
test_schema( - "non-string const", - // Schema - R"""({ - "enum": ["red", "amber", "green", null, 42, ["foo"]] - })""", - // Passing strings - { - R"""("red")""", - R"""(null)""", - R"""(42)""", - R"""(["foo"])""", - }, - // Failing strings - { - R"""()""", - R"""(420)""", - R"""(true)""", - R"""(foo)""", - } - ); - - test_schema( - "simple pattern", - // Schema - R"""({ - "pattern": "^[a-zA-Z0-9_-]*$" - })""", - // Passing strings - { - R"""("")""", - R"""("He_llo-12")""", - }, - // Failing strings - { - R"""("!")""", - R"""("Hello World")""", - } - ); - - test_schema( - "pattern with escapes", - // Schema - R"""({ - "pattern": "^a\\^\\$\\.\\[\\]\\(\\)\\|\\{\\}\\*\\+\\?b$" - })""", - // Passing strings - { - R"""("a^$.[]()|{}*+?b")""", - }, - // Failing strings - { - R"""("ab")""", - } - ); - - test_schema( - "", - // Schema - R"""( - { - "type": ["array", "null"], - "items": { "type": "string" } - } - )""", - // Passing strings - { - "null", - "[]", - "[\"123\"]", - "[\"foo\", \"bar\"]", - }, - // Failing strings - { - "", - "[123]", - "\"foo\"", - "[\"foo\", 42]", - } - ); - - test_schema( - "min+max items", - // Schema - R"""({ - "items": { - "type": ["number", "integer"] - }, - "minItems": 3, - "maxItems": 5 - })""", - // Passing strings - { - R"""([1, 2, 3])""", - R"""([1, 2, 3, 4])""", - R"""([1, 2, 3, 4, 5])""", - }, - // Failing strings - { - R"""([1, 2])""", - R"""([1, 2, 3, 4, 5, 6])""", - R"""(1)""", - } - ); - - // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties) - test_schema( - "object properties", - // Schema - R"""({ - "type": "object", - "properties": { - "number": { "type": "number" }, - "street_name": { "type": "string" }, - "street_type": { "enum": ["Street", "Avenue", "Boulevard"] } - } - })""", - // Passing strings - { - R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""", - // "By default, leaving out properties is valid" - R"""({ "street_name": "Pennsylvania" })""", - R"""({ "number": 1600, "street_name": "Pennsylvania" })""", - // "By extension, even an empty object is valid" - R"""({})""", - R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", - }, - // Failing strings - { - // Change datatype from number to string - R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""", - // Reorder properties - R"""({ "street_name": "Pennsylvania", "number": 1600 })""", - // Reorder properties - R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""", - // "Additional properties default to false for generation, even though the spec says true. 
- R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", - - } - ); - - test_schema( - "additional properties can't override other properties", - R"""({ - "properties": { - "a": {"type": "integer"}, - "b": {"type": "integer"} - }, - "additionalProperties": true - })""", - // Passing strings - { - R"""({"a": 42})""", - R"""({"c": ""})""", - R"""({"a": 42, "c": ""})""", - R"""({"a_": ""})""", - }, - // Failing strings - { - R"""()""", - R"""({"a": ""})""", - R"""({"a": "", "b": ""})""", - } - ); - - // Properties (from: https://json-schema.org/understanding-json-schema/reference/object#properties) - test_schema( - "object properties, additionalProperties: true", - // Schema - R"""({ - "type": "object", - "properties": { - "number": { "type": "number" }, - "street_name": { "type": "string" }, - "street_type": { "enum": ["Street", "Avenue", "Boulevard"] } - }, - "additionalProperties": true - })""", - // Passing strings - { - // "By extension, even an empty object is valid" - R"""({})""", - R"""({"number":1600,"street_name":"Pennsylvania","street_type":"Avenue"})""", - // "By default, leaving out properties is valid" - R"""({ "street_name": "Pennsylvania" })""", - R"""({ "number": 1600, "street_name": "Pennsylvania" })""", - // "By default, providing additional properties is valid" - R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue", "direction":"NW"})""", - R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", - }, - // Failing strings - { - // Change datatype from number to string - R"""({ "number": "1600", "street_name": "Pennsylvania", "street_type":"Avenue"})""", - // Reorder properties - R"""({ "street_name": "Pennsylvania", "number": 1600, "street_type":"Avenue"})""", - } - ); - - // Additional properties: false - test_schema( - "required + optional props each in original order", - // Schema - R"""({ - "type": "object", - "properties": { - "number": { "type": "number" }, - "street_name": { "type": "string" }, - "street_type": { "enum": ["Street", "Avenue", "Boulevard"] } - }, - "additionalProperties": false - })""", - // Passing strings - { - R"""({ "street_name": "Pennsylvania" })""", - R"""({ "number": 1600, "street_type":"Avenue"})""", - R"""({ "number": 1600, "street_name": "Pennsylvania" })""", - R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type":"Avenue"})""", - // Spaces are permitted around enum values - R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue" })""", - }, - // Failing strings - { - // Reorder properties - R"""({ "street_type": "Avenue", "number": 1600 })""", - // Add "direction" - R"""({ "number": 1600, "street_name": "Pennsylvania", "street_type": "Avenue", "direction": "NW" })""", - } - ); - - test_schema( - "required + optional props each in original order", - // Schema - R"""({ - "properties": { - "b": {"type": "string"}, - "a": {"type": "string"}, - "d": {"type": "string"}, - "c": {"type": "string"} - }, - "required": ["a", "b"], - "additionalProperties": false - })""", - // Passing strings - { - R"""({"b": "foo", "a": "bar"})""", - R"""({"b":"foo","a":"bar","d":"qux"})""", - R"""({"b":"foo", "a":"bar", "d":"qux", "c":"baz"})""", - }, - // Failing strings - { - R"""({"a": "foo", "b": "bar"})""", - R"""({"b": "bar"})""", - R"""({"a": "foo", "c": "baz"})""", - R"""({"a":"foo", "b":"bar", "c":"baz", "d":"qux"})""", - } - ); - - // NOTE: Example from 
https://json-schema.org/learn/getting-started-step-by-step#define-required-properties - test_schema( - "required props", - // Schema - R"""({ - "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://example.com/product.schema.json", - "title": "Product", - "description": "A product from Acme's catalog", - "type": "object", - "properties": { - "productId": { - "description": "The unique identifier for a product", - "type": "integer" - }, - "productName": { - "description": "Name of the product", - "type": "string" - }, - "price": { - "description": "The price of the product", - "type": "number", - "exclusiveMinimum": 0 - }, - "tags": { - "description": "Tags for the product", - "type": "array", - "items": { - "type": "string" - }, - "minItems": 1, - "uniqueItems": true - }, - "dimensions": { - "type": "object", - "properties": { - "length": { - "type": "number" - }, - "width": { - "type": "number" - }, - "height": { - "type": "number" - } - }, - "required": [ "length", "width", "height" ] - } - }, - "required": [ "productId", "productName", "price" ] - })""", - // Passing strings - { - R"""({"productId": 1, "productName": "A green door", "price": 12.50})""", - R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"]})""", - R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green"], "dimensions": {"length": 785, "width": 250.5, "height": -0.359}})""", - }, - // Failing strings - { - R"""({})""", // Missing all required properties - R"""({"productName": "A green door", "price": 12.50, "productId": 1})""", // Out of order properties - // TODO: The following line should fail, but currently it passes. `exclusiveMinimum` is not supported, as it would likely be too difficult to implement. - // Perhaps special checks for minimum and maximum values of 0 could be added (since that's relatively easy to do with grammars), but anything else would likely be too complex. - // R"""({"productId": 1, "productName": "A green door", "price": -12.50})""", - R"""({"productId": 1, "productName": "A green door"})""", // Missing required property (price) - R"""({"productName": "A green door", "price": 12.50})""", // Missing required property (productId) - R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": []})""", // tags is empty, but minItems is 1 - R"""({"productId": 1, "productName": "A green door", "price": 12.50, "dimensions": {"length": 785, "width": 250.5, "height": -0.359}, "tags": ["home", "green"]})""", // Tags and dimensions are out of order - // TODO: The following line should fail, but currently it passes. `uniqueItems` is not supported, as it would likely be too difficult to implement. 
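// [Editor's note] Why the two TODOs above are hard: `exclusiveMinimum` and
// `uniqueItems` require semantic comparisons on parsed values (numeric ordering,
// set membership) that a character-level grammar cannot encode without enumerating
// every case. A post-hoc check on the decoded JSON is the practical fallback;
// minimal sketch, assuming nlohmann::json (already used elsewhere in these tests):
//   std::set<nlohmann::json> seen;
//   bool unique = true;
//   for (const auto & item : arr) {
//       if (!seen.insert(item).second) { unique = false; break; } // duplicate item
//   }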
- // R"""({"productId": 1, "productName": "A green door", "price": 12.50, "tags": ["home", "green", "home"]})""", - } - ); -} - -int main() { - fprintf(stdout, "Running grammar integration tests...\n"); - test_simple_grammar(); - test_complex_grammar(); - test_special_chars(); - test_quantifiers(); - test_failure_missing_root(); - test_failure_missing_reference(); - test_failure_left_recursion(); - test_json_schema(); - fprintf(stdout, "All tests passed.\n"); - return 0; -} diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp deleted file mode 100644 index 5df5abb25..000000000 --- a/tests/test-grammar-parser.cpp +++ /dev/null @@ -1,515 +0,0 @@ -#ifdef NDEBUG -#undef NDEBUG -#endif - -#include "llama.h" -#include "grammar-parser.h" - -#include <cassert> - -static const char * type_str(llama_gretype type) { - switch (type) { - case LLAMA_GRETYPE_CHAR: return "LLAMA_GRETYPE_CHAR"; - case LLAMA_GRETYPE_CHAR_NOT: return "LLAMA_GRETYPE_CHAR_NOT"; - case LLAMA_GRETYPE_CHAR_ALT: return "LLAMA_GRETYPE_CHAR_ALT"; - case LLAMA_GRETYPE_CHAR_RNG_UPPER: return "LLAMA_GRETYPE_CHAR_RNG_UPPER"; - case LLAMA_GRETYPE_RULE_REF: return "LLAMA_GRETYPE_RULE_REF"; - case LLAMA_GRETYPE_ALT: return "LLAMA_GRETYPE_ALT"; - case LLAMA_GRETYPE_END: return "LLAMA_GRETYPE_END"; - default: return "?"; - } -} - -static void verify_parsing(const char *grammar_bytes, const std::vector<std::pair<std::string, uint32_t>> expected, const std::vector<llama_grammar_element> &expected_rules) { - uint32_t index = 0; - grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_bytes); - - std::map<uint32_t, std::string> symbol_names; - for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { - symbol_names[it->second] = it->first; - } - - auto print_all = [&]() { - fprintf(stderr, " verify_parsing(R\"\"\"(%s)\"\"\", {\n", grammar_bytes); - for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { - fprintf(stderr, " {\"%s\", %u},\n", it->first.c_str(), it->second); - } - fprintf(stderr, " }, {\n"); - for (size_t i_rule = 0; i_rule < parsed_grammar.rules.size(); i_rule++) { - fprintf(stderr, " // %s (index %zu)\n", symbol_names[i_rule].c_str(), i_rule); - auto & rule = parsed_grammar.rules[i_rule]; - for (uint32_t i = 0; i < rule.size(); i++) { - std::string rule_str; - fprintf(stderr, " {%s, ", type_str(rule[i].type)); - if (rule[i].type == LLAMA_GRETYPE_CHAR || rule[i].type == LLAMA_GRETYPE_CHAR_ALT || - rule[i].type == LLAMA_GRETYPE_CHAR_NOT || rule[i].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) { - char c = rule[i].value; - if (c == '\n') { - fprintf(stderr, "'\\n'"); - } else if (c == '\t') { - fprintf(stderr, "'\\t'"); - } else if (c == '\r') { - fprintf(stderr, "'\\r'"); - } else if (c == '\0') { - fprintf(stderr, "'\\0'"); - } else { - fprintf(stderr, "'%c'", c); - } - } else if (rule[i].type == LLAMA_GRETYPE_RULE_REF) { - fprintf(stderr, "/* %s */ %u", symbol_names[rule[i].value].c_str(), rule[i].value); - } else { - fprintf(stderr, "%u", rule[i].value); - } - fprintf(stderr, "},\n"); - } - } - fprintf(stderr, " });\n"); - }; - - if (getenv("TEST_GRAMMAR_PARSER_PRINT_ALL")) { - print_all(); - fprintf(stderr, "\n"); - return; - } - - fprintf(stderr, "Testing grammar:%s\n", grammar_bytes); - - if (parsed_grammar.symbol_ids.size() != expected.size()) { - fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); - print_all(); - assert(parsed_grammar.symbol_ids.size() == expected.size()); - } - - for (auto it = parsed_grammar.symbol_ids.begin(); it != 
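// [Editor's note] The TEST_GRAMMAR_PARSER_PRINT_ALL branch above doubles as an
// expectation generator: instead of asserting, it prints ready-to-paste
// verify_parsing(...) calls. Typical use from a CMake build tree (binary path
// may differ per generator):
//   TEST_GRAMMAR_PARSER_PRINT_ALL=1 ./bin/test-grammar-parser 2> new-expectations.txt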
parsed_grammar.symbol_ids.end(); ++it) - { - std::string key = it->first; - uint32_t value = it->second; - std::pair expected_pair = expected[index]; - - // pretty print error message before asserting - if (expected_pair.first != key || expected_pair.second != value) - { - fprintf(stderr, "index: %u\n", index); - fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second); - fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value); - fprintf(stderr, "expected_pair != actual_pair\n"); - fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); - print_all(); - } - - assert(expected_pair.first == key && expected_pair.second == value); - - index++; - } - - index = 0; - for (auto rule : parsed_grammar.rules) - { - // compare rule to expected rule - for (uint32_t i = 0; i < rule.size(); i++) - { - llama_grammar_element element = rule[i]; - llama_grammar_element expected_element = expected_rules[index]; - - // pretty print error message before asserting - if (expected_element.type != element.type || expected_element.value != element.value) - { - fprintf(stderr, "index: %u\n", index); - fprintf(stderr, "expected_element: %s, %u\n", type_str(expected_element.type), expected_element.value); - fprintf(stderr, "actual_element: %s, %u\n", type_str(element.type), element.value); - fprintf(stderr, "expected_element != actual_element\n"); - fprintf(stderr, "all elements:\n"); - fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); - print_all(); - } - - assert(expected_element.type == element.type && expected_element.value == element.value); - index++; - } - } -} - -static void verify_failure(const char *grammar_bytes) { - fprintf(stderr, "Testing expected failure:%s\n", grammar_bytes); - auto result = grammar_parser::parse(grammar_bytes); - assert(result.rules.empty() && "should have failed"); -} - -int main() -{ - verify_failure(R"""( - root ::= "a"{,}" - )"""); - - verify_failure(R"""( - root ::= "a"{,10}" - )"""); - - verify_parsing(R"""( - root ::= "a" - )""", { - {"root", 0}, - }, { - // root (index 0) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= "a" | [bdx-z] | [^1-3] - )""", { - {"root", 0}, - }, { - // root (index 0) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_CHAR, 'b'}, - {LLAMA_GRETYPE_CHAR_ALT, 'd'}, - {LLAMA_GRETYPE_CHAR_ALT, 'x'}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_CHAR_NOT, '1'}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, '3'}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= a+ - a ::= "a" - )""", { - {"a", 1}, - {"root", 0}, - {"root_2", 2}, - }, { - // root (index 0) - {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, - {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, - {LLAMA_GRETYPE_END, 0}, - // a (index 1) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_END, 0}, - // root_2 (index 2) - {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, - {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= "a"+ - )""", { - {"root", 0}, - {"root_1", 1}, - }, { - // root (index 0) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_END, 0}, - // root_1 (index 1) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= a? 
- a ::= "a" - )""", { - {"a", 1}, - {"root", 0}, - {"root_2", 2}, - }, { - // root (index 0) - {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, - {LLAMA_GRETYPE_END, 0}, - // a (index 1) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_END, 0}, - // root_2 (index 2) - {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= "a"? - )""", { - {"root", 0}, - {"root_1", 1}, - }, { - // root (index 0) - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_END, 0}, - // root_1 (index 1) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= a* - a ::= "a" - )""", { - {"a", 1}, - {"root", 0}, - {"root_2", 2}, - }, { - // root (index 0) - {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, - {LLAMA_GRETYPE_END, 0}, - // a (index 1) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_END, 0}, - // root_2 (index 2) - {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, - {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= "a"* - )""", { - {"root", 0}, - {"root_1", 1}, - }, { - // root (index 0) - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_END, 0}, - // root_1 (index 1) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= "a"{2} - )""", { - {"root", 0}, - }, { - // root (index 0) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= "a"{2,} - )""", { - {"root", 0}, - {"root_1", 1}, - }, { - // root (index 0) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_END, 0}, - // root_1 (index 1) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= "a"{ 4} - )""", { - {"root", 0}, - }, { - // root (index 0) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= "a"{2,4} - )""", { - {"root", 0}, - {"root_1", 1}, - {"root_2", 2}, - }, { - // root (index 0) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, - {LLAMA_GRETYPE_END, 0}, - // root_1 (index 1) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - // root_2 (index 2) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= (expr "=" term "\n")+ - expr ::= term ([-+*/] term)* - term ::= [0-9]+ - )""", { - {"expr", 2}, - {"expr_5", 5}, - {"expr_6", 6}, - {"root", 0}, - {"root_1", 1}, - {"root_4", 4}, - {"term", 3}, - {"term_7", 7}, - }, { - // root (index 0) - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4}, - {LLAMA_GRETYPE_END, 0}, - // root_1 (index 1) - {LLAMA_GRETYPE_RULE_REF, /* expr */ 2}, - {LLAMA_GRETYPE_CHAR, '='}, - {LLAMA_GRETYPE_RULE_REF, /* term */ 3}, - {LLAMA_GRETYPE_CHAR, '\n'}, - {LLAMA_GRETYPE_END, 0}, - // expr (index 2) - {LLAMA_GRETYPE_RULE_REF, /* term */ 3}, - {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6}, - {LLAMA_GRETYPE_END, 0}, - // term (index 3) - {LLAMA_GRETYPE_CHAR, '0'}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, - 
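// [Editor's note] Reading aid for the element tables in this file: the parser
// desugars repetition operators into synthesized helper rules. "a"+, for
// instance, becomes (in GBNF terms)
//   root   ::= "a" root_1
//   root_1 ::= "a" root_1 |
// a right-recursive helper with an empty alternative; the {RULE_REF, ALT, END}
// sequences in these expectations are the flat encoding of exactly that shape.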
{LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7}, - {LLAMA_GRETYPE_END, 0}, - // root_4 (index 4) - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - // expr_5 (index 5) - {LLAMA_GRETYPE_CHAR, '-'}, - {LLAMA_GRETYPE_CHAR_ALT, '+'}, - {LLAMA_GRETYPE_CHAR_ALT, '*'}, - {LLAMA_GRETYPE_CHAR_ALT, '/'}, - {LLAMA_GRETYPE_RULE_REF, /* term */ 3}, - {LLAMA_GRETYPE_END, 0}, - // expr_6 (index 6) - {LLAMA_GRETYPE_RULE_REF, /* expr_5 */ 5}, - {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - // term_7 (index 7) - {LLAMA_GRETYPE_CHAR, '0'}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, - {LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }); - - verify_parsing(R"""( - root ::= (expr "=" ws term "\n")+ - expr ::= term ([-+*/] term)* - term ::= ident | num | "(" ws expr ")" ws - ident ::= [a-z] [a-z0-9_]* ws - num ::= [0-9]+ ws - ws ::= [ \t\n]* - )""", { - {"expr", 2}, - {"expr_6", 6}, - {"expr_7", 7}, - {"ident", 8}, - {"ident_10", 10}, - {"num", 9}, - {"num_11", 11}, - {"root", 0}, - {"root_1", 1}, - {"root_5", 5}, - {"term", 4}, - {"ws", 3}, - {"ws_12", 12}, - }, { - // root (index 0) - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5}, - {LLAMA_GRETYPE_END, 0}, - // root_1 (index 1) - {LLAMA_GRETYPE_RULE_REF, /* expr */ 2}, - {LLAMA_GRETYPE_CHAR, '='}, - {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, - {LLAMA_GRETYPE_RULE_REF, /* term */ 4}, - {LLAMA_GRETYPE_CHAR, '\n'}, - {LLAMA_GRETYPE_END, 0}, - // expr (index 2) - {LLAMA_GRETYPE_RULE_REF, /* term */ 4}, - {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7}, - {LLAMA_GRETYPE_END, 0}, - // ws (index 3) - {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12}, - {LLAMA_GRETYPE_END, 0}, - // term (index 4) - {LLAMA_GRETYPE_RULE_REF, /* ident */ 8}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_RULE_REF, /* num */ 9}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_CHAR, '('}, - {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, - {LLAMA_GRETYPE_RULE_REF, /* expr */ 2}, - {LLAMA_GRETYPE_CHAR, ')'}, - {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, - {LLAMA_GRETYPE_END, 0}, - // root_5 (index 5) - {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, - {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - // expr_6 (index 6) - {LLAMA_GRETYPE_CHAR, '-'}, - {LLAMA_GRETYPE_CHAR_ALT, '+'}, - {LLAMA_GRETYPE_CHAR_ALT, '*'}, - {LLAMA_GRETYPE_CHAR_ALT, '/'}, - {LLAMA_GRETYPE_RULE_REF, /* term */ 4}, - {LLAMA_GRETYPE_END, 0}, - // expr_7 (index 7) - {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6}, - {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - // ident (index 8) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, - {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10}, - {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, - {LLAMA_GRETYPE_END, 0}, - // num (index 9) - {LLAMA_GRETYPE_CHAR, '0'}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, - {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11}, - {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, - {LLAMA_GRETYPE_END, 0}, - // ident_10 (index 10) - {LLAMA_GRETYPE_CHAR, 'a'}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, - {LLAMA_GRETYPE_CHAR_ALT, '0'}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, - {LLAMA_GRETYPE_CHAR_ALT, '_'}, - {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - // num_11 (index 11) - {LLAMA_GRETYPE_CHAR, '0'}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, - {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 
11}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - // ws_12 (index 12) - {LLAMA_GRETYPE_CHAR, ' '}, - {LLAMA_GRETYPE_CHAR_ALT, '\t'}, - {LLAMA_GRETYPE_CHAR_ALT, '\n'}, - {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }); - - return 0; -} diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp deleted file mode 100755 index 65486ac5c..000000000 --- a/tests/test-json-schema-to-grammar.cpp +++ /dev/null @@ -1,1273 +0,0 @@ -#ifdef NDEBUG -#undef NDEBUG -#endif - -#include <cassert> -#include <fstream> -#include <regex> -#include <sstream> - -#include "json-schema-to-grammar.h" -#include "grammar-parser.h" - -static std::string trim(const std::string & source) { - std::string s(source); - s.erase(0,s.find_first_not_of(" \n\r\t")); - s.erase(s.find_last_not_of(" \n\r\t")+1); - return std::regex_replace(s, std::regex("(^|\n)[ \t]+"), "$1"); -} - -enum TestCaseStatus { - SUCCESS, - FAILURE -}; - -struct TestCase { - TestCaseStatus expected_status; - std::string name; - std::string schema; - std::string expected_grammar; - - void _print_failure_header() const { - fprintf(stderr, "#\n# Test '%s' failed.\n#\n%s\n", name.c_str(), schema.c_str()); - } - void verify(const std::string & actual_grammar) const { - if (trim(actual_grammar) != trim(expected_grammar)) { - _print_failure_header(); - fprintf(stderr, "# EXPECTED:\n%s\n# ACTUAL:\n%s\n", expected_grammar.c_str(), actual_grammar.c_str()); - assert(false); - } - } - void verify_expectation_parseable() const { - try { - auto state = grammar_parser::parse(expected_grammar.c_str()); - if (state.symbol_ids.find("root") == state.symbol_ids.end()) { - throw std::runtime_error("Grammar failed to parse:\n" + expected_grammar); - } - } catch (const std::runtime_error & ex) { - _print_failure_header(); - fprintf(stderr, "# GRAMMAR ERROR: %s\n", ex.what()); - assert(false); - } - } - void verify_status(TestCaseStatus status) const { - if (status != expected_status) { - _print_failure_header(); - fprintf(stderr, "# EXPECTED STATUS: %s\n", expected_status == SUCCESS ? "SUCCESS" : "FAILURE"); - fprintf(stderr, "# ACTUAL STATUS: %s\n", status == SUCCESS ? "SUCCESS" : "FAILURE"); - assert(false); - } - } -}; - -static void write(const std::string & file, const std::string & content) { - std::ofstream f; - f.open(file.c_str()); - f << content.c_str(); - f.close(); -} - -static std::string read(const std::string & file) { - std::ostringstream actuals; - actuals << std::ifstream(file.c_str()).rdbuf(); - return actuals.str(); -} - -static void test_all(const std::string & lang, std::function<void(const TestCase &)> runner) { - fprintf(stderr, "#\n# Testing JSON schema conversion (%s)\n#\n", lang.c_str()); - auto test = [&](const TestCase & tc) { - fprintf(stderr, "- %s%s\n", tc.name.c_str(), tc.expected_status == FAILURE ? 
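// [Editor's note] trim() above strips outer whitespace and per-line indentation
// so expected grammars can be written as indented raw-string literals;
// e.g. trim("\n  root ::= x\n") yields "root ::= x", which makes the comparison
// in verify() indentation-insensitive.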
" (failure expected)" : ""); - runner(tc); - }; - - test({ - SUCCESS, - "min 0", - R"""({ - "type": "integer", - "minimum": 0 - })""", - R"""( - root ::= ([0] | [1-9] [0-9]{0,15}) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min 1", - R"""({ - "type": "integer", - "minimum": 1 - })""", - R"""( - root ::= ([1-9] [0-9]{0,15}) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min 3", - R"""({ - "type": "integer", - "minimum": 3 - })""", - R"""( - root ::= ([1-2] [0-9]{1,15} | [3-9] [0-9]{0,15}) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min 9", - R"""({ - "type": "integer", - "minimum": 9 - })""", - R"""( - root ::= ([1-8] [0-9]{1,15} | [9] [0-9]{0,15}) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min 10", - R"""({ - "type": "integer", - "minimum": 10 - })""", - R"""( - root ::= ([1] ([0-9]{1,15}) | [2-9] [0-9]{1,15}) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min 25", - R"""({ - "type": "integer", - "minimum": 25 - })""", - R"""( - root ::= ([1] [0-9]{2,15} | [2] ([0-4] [0-9]{1,14} | [5-9] [0-9]{0,14}) | [3-9] [0-9]{1,15}) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "max 30", - R"""({ - "type": "integer", - "maximum": 30 - })""", - R"""( - root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-2] [0-9] | [3] "0")) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min -5", - R"""({ - "type": "integer", - "minimum": -5 - })""", - R"""( - root ::= ("-" ([0-5]) | [0] | [1-9] [0-9]{0,15}) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min -123", - R"""({ - "type": "integer", - "minimum": -123 - })""", - R"""( - root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0] | [1-9] [0-9]{0,15}) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "max -5", - R"""({ - "type": "integer", - "maximum": -5 - })""", - R"""( - root ::= ("-" ([0-4] [0-9]{1,15} | [5-9] [0-9]{0,15})) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "max 1", - R"""({ - "type": "integer", - "maximum": 1 - })""", - R"""( - root ::= ("-" [1-9] [0-9]{0,15} | [0-1]) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "max 100", - R"""({ - "type": "integer", - "maximum": 100 - })""", - R"""( - root ::= ("-" [1-9] [0-9]{0,15} | [0-9] | ([1-8] [0-9] | [9] [0-9]) | "100") space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min 0 max 23", - R"""({ - "type": "integer", - "minimum": 0, - "maximum": 23 - })""", - R"""( - root ::= ([0-9] | ([1] [0-9] | [2] [0-3])) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min 15 max 300", - R"""({ - "type": "integer", - "minimum": 15, - "maximum": 300 - })""", - R"""( - root ::= (([1] ([5-9]) | [2-9] [0-9]) | ([1-2] [0-9]{2} | [3] "00")) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min 5 max 30", - R"""({ - "type": "integer", - "minimum": 5, - "maximum": 30 - })""", - R"""( - root ::= ([5-9] | ([1-2] [0-9] | [3] "0")) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min -123 max 42", - R"""({ - "type": "integer", - "minimum": -123, - "maximum": 42 - })""", - R"""( - root ::= ("-" ([0-9] | ([1-8] [0-9] | [9] [0-9]) | "1" ([0-1] [0-9] | [2] [0-3])) | [0-9] | ([1-3] [0-9] | [4] [0-2])) space - space 
::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min -10 max 10", - R"""({ - "type": "integer", - "minimum": -10, - "maximum": 10 - })""", - R"""( - root ::= ("-" ([0-9] | "10") | [0-9] | "10") space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - FAILURE, - "unknown type", - R"""({ - "type": "kaboom" - })""", - "" - }); - - test({ - FAILURE, - "invalid type", - R"""({ - "type": 123 - })""", - "" - }); - - test({ - SUCCESS, - "empty schema (object)", - "{}", - R"""( - array ::= "[" space ( value ("," space value)* )? "]" space - boolean ::= ("true" | "false") space - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - decimal-part ::= [0-9]{1,16} - integral-part ::= [0] | [1-9] [0-9]{0,15} - null ::= "null" space - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space - root ::= object - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - value ::= object | array | string | number | boolean | null - )""" - }); - - test({ - SUCCESS, - "exotic formats", - R"""({ - "items": [ - { "format": "date" }, - { "format": "uuid" }, - { "format": "time" }, - { "format": "date-time" } - ] - })""", - R"""( - date ::= [0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] ) - date-string ::= "\"" date "\"" space - date-time ::= date "T" time - date-time-string ::= "\"" date-time "\"" space - root ::= "[" space tuple-0 "," space uuid "," space tuple-2 "," space tuple-3 "]" space - space ::= | " " | "\n" [ \t]{0,20} - time ::= ([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] ) - time-string ::= "\"" time "\"" space - tuple-0 ::= date-string - tuple-2 ::= time-string - tuple-3 ::= date-time-string - uuid ::= "\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space - )""" - }); - - test({ - SUCCESS, - "string", - R"""({ - "type": "string" - })""", - R"""( - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - root ::= "\"" char* "\"" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "string w/ min length 1", - R"""({ - "type": "string", - "minLength": 1 - })""", - R"""( - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - root ::= "\"" char+ "\"" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "string w/ min length 3", - R"""({ - "type": "string", - "minLength": 3 - })""", - R"""( - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - root ::= "\"" char{3,} "\"" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "string w/ max length", - R"""({ - "type": "string", - "maxLength": 3 - })""", - R"""( - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - root ::= "\"" char{0,3} "\"" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "string w/ min & max length", - R"""({ - "type": "string", - "minLength": 1, - "maxLength": 4 - })""", - R"""( - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - root ::= "\"" char{1,4} "\"" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "boolean", - R"""({ - "type": "boolean" - })""", - R"""( - root ::= ("true" | "false") space - space ::= | " " | "\n" [ \t]{0,20} - )""" 
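// [Editor's note] The min/max cases above illustrate the conversion strategy for
// numeric bounds: a range constraint is compiled into alternatives keyed on the
// leading digits (e.g. for "minimum": 25, derivable strings start with "1" plus
// 2+ more digits, "2" followed by [5-9] or a longer tail, or [3-9] plus 1+ more
// digits). This note paraphrases the expectations above; it is not an additional
// tested behavior.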
- }); - - test({ - SUCCESS, - "integer", - R"""({ - "type": "integer" - })""", - R"""( - integral-part ::= [0] | [1-9] [0-9]{0,15} - root ::= ("-"? integral-part) space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "string const", - R"""({ - "const": "foo" - })""", - R"""( - root ::= "\"foo\"" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "non-string const", - R"""({ - "const": 123 - })""", - R"""( - root ::= "123" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "non-string enum", - R"""({ - "enum": ["red", "amber", "green", null, 42, ["foo"]] - })""", - R"""( - root ::= ("\"red\"" | "\"amber\"" | "\"green\"" | "null" | "42" | "[\"foo\"]") space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "string array", - R"""({ - "type": "array", - "prefixItems": { "type": "string" } - })""", - R"""( - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - root ::= "[" space (string ("," space string)*)? "]" space - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - )""" - }); - - test({ - SUCCESS, - "nullable string array", - R"""({ - "type": ["array", "null"], - "prefixItems": { "type": "string" } - })""", - R"""( - alternative-0 ::= "[" space (string ("," space string)*)? "]" space - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - null ::= "null" space - root ::= alternative-0 | null - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - )""" - }); - - test({ - SUCCESS, - "tuple1", - R"""({ - "prefixItems": [{ "type": "string" }] - })""", - R"""( - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - root ::= "[" space string "]" space - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - )""" - }); - - test({ - SUCCESS, - "tuple2", - R"""({ - "prefixItems": [{ "type": "string" }, { "type": "number" }] - })""", - R"""( - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - decimal-part ::= [0-9]{1,16} - integral-part ::= [0] | [1-9] [0-9]{0,15} - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - root ::= "[" space string "," space number "]" space - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - )""" - }); - - test({ - SUCCESS, - "number", - R"""({ - "type": "number" - })""", - R"""( - decimal-part ::= [0-9]{1,16} - integral-part ::= [0] | [1-9] [0-9]{0,15} - root ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "minItems", - R"""({ - "items": { - "type": "boolean" - }, - "minItems": 2 - })""", - R"""( - boolean ::= ("true" | "false") space - root ::= "[" space boolean ("," space boolean)+ "]" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "maxItems 1", - R"""({ - "items": { - "type": "boolean" - }, - "maxItems": 1 - })""", - R"""( - boolean ::= ("true" | "false") space - root ::= "[" space boolean? "]" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "maxItems 2", - R"""({ - "items": { - "type": "boolean" - }, - "maxItems": 2 - })""", - R"""( - boolean ::= ("true" | "false") space - root ::= "[" space (boolean ("," space boolean)?)? 
"]" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min + maxItems", - R"""({ - "items": { - "type": ["number", "integer"] - }, - "minItems": 3, - "maxItems": 5 - })""", - R"""( - decimal-part ::= [0-9]{1,16} - integer ::= ("-"? integral-part) space - integral-part ::= [0] | [1-9] [0-9]{0,15} - item ::= number | integer - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - root ::= "[" space item ("," space item){2,4} "]" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min + max items with min + max values across zero", - R"""({ - "items": { - "type": "integer", - "minimum": -12, - "maximum": 207 - }, - "minItems": 3, - "maxItems": 5 - })""", - R"""( - item ::= ("-" ([0-9] | "1" [0-2]) | [0-9] | ([1-8] [0-9] | [9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space - root ::= "[" space item ("," space item){2,4} "]" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "min + max items with min + max values", - R"""({ - "items": { - "type": "integer", - "minimum": 12, - "maximum": 207 - }, - "minItems": 3, - "maxItems": 5 - })""", - R"""( - item ::= (([1] ([2-9]) | [2-9] [0-9]) | ([1] [0-9]{2} | [2] "0" [0-7])) space - root ::= "[" space item ("," space item){2,4} "]" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "simple regexp", - R"""({ - "type": "string", - "pattern": "^abc?d*efg+(hij)?kl$" - })""", - R"""( - root ::= "\"" "ab" "c"? "d"* "ef" "g"+ ("hij")? "kl" "\"" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "regexp escapes", - R"""({ - "type": "string", - "pattern": "^\\[\\]\\{\\}\\(\\)\\|\\+\\*\\?$" - })""", - R"""( - root ::= "\"" "[]{}()|+*?" "\"" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "regexp quote", - R"""({ - "type": "string", - "pattern": "^\"$" - })""", - R"""( - root ::= "\"" "\"" "\"" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "regexp", - R"""({ - "type": "string", - "pattern": "^(\\([0-9]{1,3}\\))?[0-9]{3}-[0-9]{4} a{3,5}nd...$" - })""", - R"""( - dot ::= [^\x0A\x0D] - root ::= "\"" ("(" root-1{1,3} ")")? root-1{3,3} "-" root-1{4,4} " " "a"{3,5} "nd" dot dot dot "\"" space - root-1 ::= [0-9] - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "required props in original order", - R"""({ - "type": "object", - "properties": { - "b": {"type": "string"}, - "c": {"type": "string"}, - "a": {"type": "string"} - }, - "required": [ - "a", - "b", - "c" - ], - "additionalProperties": false, - "definitions": {} - })""", - R"""( - a-kv ::= "\"a\"" space ":" space string - b-kv ::= "\"b\"" space ":" space string - c-kv ::= "\"c\"" space ":" space string - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - root ::= "{" space b-kv "," space c-kv "," space a-kv "}" space - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - )""" - }); - - test({ - SUCCESS, - "1 optional prop", - R"""({ - "properties": { - "a": { - "type": "string" - } - }, - "additionalProperties": false - })""", - R"""( - a-kv ::= "\"a\"" space ":" space string - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - root ::= "{" space (a-kv )? 
"}" space - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - )""" - }); - - test({ - SUCCESS, - "N optional props", - R"""({ - "properties": { - "a": {"type": "string"}, - "b": {"type": "string"}, - "c": {"type": "string"} - }, - "additionalProperties": false - })""", - R"""( - a-kv ::= "\"a\"" space ":" space string - a-rest ::= ( "," space b-kv )? b-rest - b-kv ::= "\"b\"" space ":" space string - b-rest ::= ( "," space c-kv )? - c-kv ::= "\"c\"" space ":" space string - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - root ::= "{" space (a-kv a-rest | b-kv b-rest | c-kv )? "}" space - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - )""" - }); - - test({ - SUCCESS, - "required + optional props each in original order", - R"""({ - "properties": { - "b": {"type": "string"}, - "a": {"type": "string"}, - "d": {"type": "string"}, - "c": {"type": "string"} - }, - "required": ["a", "b"], - "additionalProperties": false - })""", - R"""( - a-kv ::= "\"a\"" space ":" space string - b-kv ::= "\"b\"" space ":" space string - c-kv ::= "\"c\"" space ":" space string - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - d-kv ::= "\"d\"" space ":" space string - d-rest ::= ( "," space c-kv )? - root ::= "{" space b-kv "," space a-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - )""" - }); - - test({ - SUCCESS, - "additional props", - R"""({ - "type": "object", - "additionalProperties": {"type": "array", "items": {"type": "number"}} - })""", - R"""( - additional-kv ::= string ":" space additional-value - additional-value ::= "[" space (number ("," space number)*)? "]" space - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - decimal-part ::= [0-9]{1,16} - integral-part ::= [0] | [1-9] [0-9]{0,15} - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - root ::= "{" space (additional-kv ( "," space additional-kv )* )? "}" space - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - )""" - }); - - test({ - SUCCESS, - "additional props (true)", - R"""({ - "type": "object", - "additionalProperties": true - })""", - R"""( - array ::= "[" space ( value ("," space value)* )? "]" space - boolean ::= ("true" | "false") space - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - decimal-part ::= [0-9]{1,16} - integral-part ::= [0] | [1-9] [0-9]{0,15} - null ::= "null" space - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? "}" space - root ::= object - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - value ::= object | array | string | number | boolean | null - )""" - }); - - test({ - SUCCESS, - "additional props (implicit)", - R"""({ - "type": "object" - })""", - R"""( - array ::= "[" space ( value ("," space value)* )? "]" space - boolean ::= ("true" | "false") space - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - decimal-part ::= [0-9]{1,16} - integral-part ::= [0] | [1-9] [0-9]{0,15} - null ::= "null" space - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - object ::= "{" space ( string ":" space value ("," space string ":" space value)* )? 
"}" space - root ::= object - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - value ::= object | array | string | number | boolean | null - )""" - }); - - test({ - SUCCESS, - "empty w/o additional props", - R"""({ - "type": "object", - "additionalProperties": false - })""", - R"""( - root ::= "{" space "}" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "required + additional props", - R"""({ - "type": "object", - "properties": { - "a": {"type": "number"} - }, - "required": ["a"], - "additionalProperties": {"type": "string"} - })""", - R"""( - a-kv ::= "\"a\"" space ":" space number - additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space - additional-kv ::= additional-k ":" space string - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - decimal-part ::= [0-9]{1,16} - integral-part ::= [0] | [1-9] [0-9]{0,15} - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - root ::= "{" space a-kv ( "," space ( additional-kv ( "," space additional-kv )* ) )? "}" space - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - )""" - }); - - test({ - SUCCESS, - "optional + additional props", - R"""({ - "type": "object", - "properties": { - "a": {"type": "number"} - }, - "additionalProperties": {"type": "number"} - })""", - R"""( - a-kv ::= "\"a\"" space ":" space number - a-rest ::= ( "," space additional-kv )* - additional-k ::= ["] ( [a] char+ | [^"a] char* )? ["] space - additional-kv ::= additional-k ":" space number - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - decimal-part ::= [0-9]{1,16} - integral-part ::= [0] | [1-9] [0-9]{0,15} - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - root ::= "{" space (a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "required + optional + additional props", - R"""({ - "type": "object", - "properties": { - "and": {"type": "number"}, - "also": {"type": "number"} - }, - "required": ["and"], - "additionalProperties": {"type": "number"} - })""", - R"""( - additional-k ::= ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space - additional-kv ::= additional-k ":" space number - also-kv ::= "\"also\"" space ":" space number - also-rest ::= ( "," space additional-kv )* - and-kv ::= "\"and\"" space ":" space number - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - decimal-part ::= [0-9]{1,16} - integral-part ::= [0] | [1-9] [0-9]{0,15} - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - root ::= "{" space and-kv ( "," space ( also-kv also-rest | additional-kv ( "," space additional-kv )* ) )? "}" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "optional props with empty name", - R"""({ - "properties": { - "": {"type": "integer"}, - "a": {"type": "integer"} - }, - "additionalProperties": {"type": "integer"} - })""", - R"""( - -kv ::= "\"\"" space ":" space root - -rest ::= ( "," space a-kv )? a-rest - a-kv ::= "\"a\"" space ":" space integer - a-rest ::= ( "," space additional-kv )* - additional-k ::= ["] ( [a] char+ | [^"a] char* ) ["] space - additional-kv ::= additional-k ":" space integer - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - integer ::= ("-"? 
integral-part) space - integral-part ::= [0] | [1-9] [0-9]{0,15} - root ::= ("-"? integral-part) space - root0 ::= "{" space (-kv -rest | a-kv a-rest | additional-kv ( "," space additional-kv )* )? "}" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "optional props with nested names", - R"""({ - "properties": { - "a": {"type": "integer"}, - "aa": {"type": "integer"} - }, - "additionalProperties": {"type": "integer"} - })""", - R"""( - a-kv ::= "\"a\"" space ":" space integer - a-rest ::= ( "," space aa-kv )? aa-rest - aa-kv ::= "\"aa\"" space ":" space integer - aa-rest ::= ( "," space additional-kv )* - additional-k ::= ["] ( [a] ([a] char+ | [^"a] char*) | [^"a] char* )? ["] space - additional-kv ::= additional-k ":" space integer - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - integer ::= ("-"? integral-part) space - integral-part ::= [0] | [1-9] [0-9]{0,15} - root ::= "{" space (a-kv a-rest | aa-kv aa-rest | additional-kv ( "," space additional-kv )* )? "}" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "optional props with common prefix", - R"""({ - "properties": { - "ab": {"type": "integer"}, - "ac": {"type": "integer"} - }, - "additionalProperties": {"type": "integer"} - })""", - R"""( - ab-kv ::= "\"ab\"" space ":" space integer - ab-rest ::= ( "," space ac-kv )? ac-rest - ac-kv ::= "\"ac\"" space ":" space integer - ac-rest ::= ( "," space additional-kv )* - additional-k ::= ["] ( [a] ([b] char+ | [c] char+ | [^"bc] char*) | [^"a] char* )? ["] space - additional-kv ::= additional-k ":" space integer - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - integer ::= ("-"? integral-part) space - integral-part ::= [0] | [1-9] [0-9]{0,15} - root ::= "{" space (ab-kv ab-rest | ac-kv ac-rest | additional-kv ( "," space additional-kv )* )? "}" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "top-level $ref", - R"""({ - "$ref": "#/definitions/foo", - "definitions": { - "foo": { - "type": "object", - "properties": { - "a": { - "type": "string" - } - }, - "required": [ - "a" - ], - "additionalProperties": false - } - } - })""", - R"""( - char ::= [^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4}) - foo ::= "{" space foo-a-kv "}" space - foo-a-kv ::= "\"a\"" space ":" space string - root ::= foo - space ::= | " " | "\n" [ \t]{0,20} - string ::= "\"" char* "\"" space - )""" - }); - - test({ - SUCCESS, - "anyOf", - R"""({ - "anyOf": [ - {"$ref": "#/definitions/foo"}, - {"$ref": "#/definitions/bar"} - ], - "definitions": { - "foo": { - "properties": {"a": {"type": "number"}} - }, - "bar": { - "properties": {"b": {"type": "number"}} - } - }, - "type": "object" - })""", - R"""( - alternative-0 ::= foo - alternative-1 ::= bar - bar ::= "{" space (bar-b-kv )? "}" space - bar-b-kv ::= "\"b\"" space ":" space number - decimal-part ::= [0-9]{1,16} - foo ::= "{" space (foo-a-kv )? "}" space - foo-a-kv ::= "\"a\"" space ":" space number - integral-part ::= [0] | [1-9] [0-9]{0,15} - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? 
space - root ::= alternative-0 | alternative-1 - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "mix of allOf, anyOf and $ref (similar to https://json.schemastore.org/tsconfig.json)", - R"""({ - "allOf": [ - {"$ref": "#/definitions/foo"}, - {"$ref": "#/definitions/bar"}, - { - "anyOf": [ - {"$ref": "#/definitions/baz"}, - {"$ref": "#/definitions/bam"} - ] - } - ], - "definitions": { - "foo": { - "properties": {"a": {"type": "number"}} - }, - "bar": { - "properties": {"b": {"type": "number"}} - }, - "bam": { - "properties": {"c": {"type": "number"}} - }, - "baz": { - "properties": {"d": {"type": "number"}} - } - }, - "type": "object" - })""", - R"""( - a-kv ::= "\"a\"" space ":" space number - b-kv ::= "\"b\"" space ":" space number - c-kv ::= "\"c\"" space ":" space number - d-kv ::= "\"d\"" space ":" space number - d-rest ::= ( "," space c-kv )? - decimal-part ::= [0-9]{1,16} - integral-part ::= [0] | [1-9] [0-9]{0,15} - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - root ::= "{" space a-kv "," space b-kv ( "," space ( d-kv d-rest | c-kv ) )? "}" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); - - test({ - SUCCESS, - "conflicting names", - R"""({ - "type": "object", - "properties": { - "number": { - "type": "object", - "properties": { - "number": { - "type": "object", - "properties": { - "root": { - "type": "number" - } - }, - "required": [ - "root" - ], - "additionalProperties": false - } - }, - "required": [ - "number" - ], - "additionalProperties": false - } - }, - "required": [ - "number" - ], - "additionalProperties": false, - "definitions": {} - })""", - R"""( - decimal-part ::= [0-9]{1,16} - integral-part ::= [0] | [1-9] [0-9]{0,15} - number ::= ("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space - number- ::= "{" space number-number-kv "}" space - number-kv ::= "\"number\"" space ":" space number- - number-number ::= "{" space number-number-root-kv "}" space - number-number-kv ::= "\"number\"" space ":" space number-number - number-number-root-kv ::= "\"root\"" space ":" space number - root ::= "{" space number-kv "}" space - space ::= | " " | "\n" [ \t]{0,20} - )""" - }); -} - -int main() { - fprintf(stderr, "LLAMA_NODE_AVAILABLE = %s\n", getenv("LLAMA_NODE_AVAILABLE") ? "true" : "false"); - fprintf(stderr, "LLAMA_PYTHON_AVAILABLE = %s\n", getenv("LLAMA_PYTHON_AVAILABLE") ? "true" : "false"); - - test_all("C++", [](const TestCase & tc) { - try { - tc.verify(json_schema_to_grammar(nlohmann::ordered_json::parse(tc.schema))); - tc.verify_status(SUCCESS); - } catch (const std::runtime_error & ex) { - fprintf(stderr, "Error: %s\n", ex.what()); - tc.verify_status(FAILURE); - } - }); - - if (getenv("LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR")) { - fprintf(stderr, "\033[33mWARNING: Skipping slow tests on emulator.\n\033[0m"); - } else { - if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python -c \"import sys; exit(1) if sys.version_info < (3, 8) else print('Python version is sufficient')\"") == 0)) { - test_all("Python", [](const TestCase & tc) { - write("test-json-schema-input.tmp", tc.schema); - tc.verify_status(std::system( - "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? 
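// [Editor's note] Environment toggles honored by this harness: LLAMA_PYTHON_AVAILABLE
// and LLAMA_NODE_AVAILABLE skip the interpreter probes, and
// LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR disables the Python/JS passes entirely.
// Example invocation (binary path may differ per build setup):
//   LLAMA_NODE_AVAILABLE=1 ./bin/test-json-schema-to-grammar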
SUCCESS : FAILURE); - tc.verify(read("test-grammar-output.tmp")); - }); - } else { - fprintf(stderr, "\033[33mWARNING: Python not found (min version required is 3.8), skipping Python JSON schema -> grammar tests.\n\033[0m"); - } - - if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) { - test_all("JavaScript", [](const TestCase & tc) { - write("test-json-schema-input.tmp", tc.schema); - tc.verify_status(std::system( - "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); - tc.verify(read("test-grammar-output.tmp")); - }); - } else { - fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m"); - } - } - - test_all("Check Expectations Validity", [](const TestCase & tc) { - if (tc.expected_status == SUCCESS) { - tc.verify_expectation_parseable(); - } - }); -} diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp deleted file mode 100644 index 1f3a267b3..000000000 --- a/tests/test-llama-grammar.cpp +++ /dev/null @@ -1,408 +0,0 @@ -#ifdef NDEBUG -#undef NDEBUG -#endif - -#define LLAMA_API_INTERNAL -#include "llama.h" -#include "grammar-parser.h" - -#include <cassert> -#include <stdexcept> - -int main() -{ - grammar_parser::parse_state parsed_grammar; - - std::vector<std::pair<std::string, uint32_t>> expected = { - {"expr", 2}, - {"expr_6", 6}, - {"expr_7", 7}, - {"ident", 8}, - {"ident_10", 10}, - {"num", 9}, - {"num_11", 11}, - {"root", 0}, - {"root_1", 1}, - {"root_5", 5}, - {"term", 4}, - {"ws", 3}, - {"ws_12", 12}, - }; - - std::vector<std::vector<llama_grammar_element>> expected_rules = { - {{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_END, 0}}, - { - {LLAMA_GRETYPE_RULE_REF, 2}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_RULE_REF, 4}, - {LLAMA_GRETYPE_CHAR, 10}, - {LLAMA_GRETYPE_END, 0}, - }, - {{LLAMA_GRETYPE_RULE_REF, 4}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_END, 0}}, - {{LLAMA_GRETYPE_RULE_REF, 12}, {LLAMA_GRETYPE_END, 0}}, - { - {LLAMA_GRETYPE_RULE_REF, 8}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_RULE_REF, 9}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_CHAR, 40}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_RULE_REF, 2}, - {LLAMA_GRETYPE_CHAR, 41}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_END, 0}, - }, - {{LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_END, 0}}, - { - {LLAMA_GRETYPE_CHAR, 45}, - {LLAMA_GRETYPE_CHAR_ALT, 43}, - {LLAMA_GRETYPE_CHAR_ALT, 42}, - {LLAMA_GRETYPE_CHAR_ALT, 47}, - {LLAMA_GRETYPE_RULE_REF, 4}, - {LLAMA_GRETYPE_END, 0}, - }, - {{LLAMA_GRETYPE_RULE_REF, 6}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}}, - { - {LLAMA_GRETYPE_CHAR, 97}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122}, - {LLAMA_GRETYPE_RULE_REF, 10}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_END, 0}, - }, - {{LLAMA_GRETYPE_RULE_REF, 11}, {LLAMA_GRETYPE_RULE_REF, 3}, {LLAMA_GRETYPE_END, 0}}, - { - {LLAMA_GRETYPE_CHAR, 97}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122}, - {LLAMA_GRETYPE_CHAR_ALT, 48}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, - {LLAMA_GRETYPE_CHAR_ALT, 95}, - {LLAMA_GRETYPE_RULE_REF, 10}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }, - { - {LLAMA_GRETYPE_CHAR, 48}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, - {LLAMA_GRETYPE_RULE_REF, 11}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_CHAR, 48}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, - {LLAMA_GRETYPE_END, 0}, - }, - { - {LLAMA_GRETYPE_CHAR, 32}, - {LLAMA_GRETYPE_CHAR_ALT, 9}, - {LLAMA_GRETYPE_CHAR_ALT, 10}, - 
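// [Editor's note] The expected symbol list above is alphabetical because
// parse_state::symbol_ids is an ordered std::map keyed by rule name, while the
// numeric ids record creation order: root is always 0 and synthesized helper
// rules (expr_6, ws_12, ...) take the next free id when they are generated.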
{LLAMA_GRETYPE_RULE_REF, 12}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - }, - }; - - for (auto pair : expected) - { - parsed_grammar.symbol_ids[pair.first] = pair.second; - } - - for (auto rule : expected_rules) - { - parsed_grammar.rules.emplace_back(); - for (auto element : rule) - { - parsed_grammar.rules.back().push_back(element); - } - } - - llama_grammar * grammar = NULL; - std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules()); - - grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - if (grammar == nullptr) - { - throw std::runtime_error("Failed to initialize llama_grammar"); - } - - std::vector<std::vector<llama_grammar_element>> expected_stacks = { - { - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_CHAR, 97}, - }, - { - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_CHAR, 40}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_CHAR, 97}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 48}, - }, - { - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_CHAR, 40}, - }}; - - auto index = 0; - for (auto stack : llama_grammar_get_stacks(grammar)) - { - // compare stack to expected_stack - for (uint32_t i = 0; i < stack.size(); i++) - { - auto element = stack[i]; - auto expected_element = expected_stacks[index][i]; - - // pretty print error message before asserting - if (expected_element.type != element->type || expected_element.value != element->value) - { - fprintf(stderr, "index: %d\n", index); - fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value); - fprintf(stderr, "actual_element: %d, %u\n", element->type, element->value); - fprintf(stderr, "expected_element != actual_element\n"); - } - - assert(expected_element.type == element->type && expected_element.value == element->value); - } - index++; - } - - std::vector<llama_grammar_candidate> next_candidates; - next_candidates.resize(24); - - for (size_t i = 0; i < 24; ++i) - { - uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point - cp[0] = 37 + i; - cp[1] = 0; - next_candidates[i] = {i, cp, {}}; - } - - std::vector<std::vector<std::pair<uint32_t, uint32_t>>> expected_reject = { - { - {0, 37}, - {1, 38}, - {2, 39}, - {3, 40}, - {4, 41}, - {5, 42}, - {6, 43}, - {7, 44}, - {8, 45}, - {9, 46}, - {10, 47}, - {11, 48}, - {12, 49}, - {13, 50}, - {14, 51}, - {15, 52}, - {16, 53}, - {17, 54}, - {18, 55}, - {19, 56}, - {20, 57}, - {21, 58}, - {22, 59}, - {23, 60}, - }, - { - {0, 37}, - {1, 38}, - {2, 39}, - {3, 40}, - {4, 41}, - {5, 42}, - {6, 43}, - {7, 44}, - {8, 45}, - {9, 46}, - {10, 47}, - {21, 58}, - {22, 59}, - {23, 60}, - }, - { - {0, 37}, - {1, 38}, - {2, 39}, - {3, 40}, - {4, 41}, - {5, 42}, - {6, 43}, - {7, 44}, - {8, 45}, - {9, 46}, - {10, 47}, - {21, 58}, - {22, 59}, - {23, 60}, - }, - { - {0, 37}, - {1, 38}, - {2, 39}, - {4, 41}, - {5, 42}, - {6, 43}, - {7, 44}, - {8, 45}, - {9, 46}, - {10, 47}, - {11, 48}, - {12, 49}, - 
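// [Editor's note] Each llama_grammar_candidate built above carries a
// heap-allocated, zero-terminated code-point buffer: cp[0] = 37 + i makes the
// 24 candidates the consecutive characters from '%' (37) through '<' (60), and
// cp[1] = 0 terminates the sequence. The {index, code point} pairs in
// expected_reject mirror exactly that layout.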
{13, 50}, - {14, 51}, - {15, 52}, - {16, 53}, - {17, 54}, - {18, 55}, - {19, 56}, - {20, 57}, - {21, 58}, - {22, 59}, - {23, 60}, - }, - { - {0, 37}, - {1, 38}, - {2, 39}, - {3, 40}, - {4, 41}, - {5, 42}, - {6, 43}, - {7, 44}, - {8, 45}, - {9, 46}, - {10, 47}, - {11, 48}, - {12, 49}, - {13, 50}, - {14, 51}, - {15, 52}, - {16, 53}, - {17, 54}, - {18, 55}, - {19, 56}, - {20, 57}, - {21, 58}, - {22, 59}, - {23, 60}, - }, - { - {0, 37}, - {1, 38}, - {2, 39}, - {3, 40}, - {4, 41}, - {5, 42}, - {6, 43}, - {7, 44}, - {8, 45}, - {9, 46}, - {10, 47}, - {21, 58}, - {22, 59}, - {23, 60}, - }, - { - {0, 37}, - {1, 38}, - {2, 39}, - {3, 40}, - {4, 41}, - {5, 42}, - {6, 43}, - {7, 44}, - {8, 45}, - {9, 46}, - {10, 47}, - {21, 58}, - {22, 59}, - {23, 60}, - }, - { - {0, 37}, - {1, 38}, - {2, 39}, - {4, 41}, - {5, 42}, - {6, 43}, - {7, 44}, - {8, 45}, - {9, 46}, - {10, 47}, - {11, 48}, - {12, 49}, - {13, 50}, - {14, 51}, - {15, 52}, - {16, 53}, - {17, 54}, - {18, 55}, - {19, 56}, - {20, 57}, - {21, 58}, - {22, 59}, - {23, 60}, - }, - }; - - std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates); - - std::vector<std::vector<llama_grammar_candidate>> all_rejects; - - for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count) - { - rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates); - all_rejects.push_back(rejects); - } - - index = 0; - for (auto rej : all_rejects) - { - for (uint32_t i = 0; i < rej.size(); i++) - { - auto element = rej[i]; - auto expected_element = expected_reject[index][i]; - assert(element.index == expected_element.first && *element.code_points == expected_element.second); - } - index++; - } - - for (auto &candidate : next_candidates) - { - delete[] candidate.code_points; - candidate.code_points = nullptr; - } - llama_grammar_free(grammar); - return 0; -} diff --git a/tests/test-model-load-cancel.cpp b/tests/test-model-load-cancel.cpp deleted file mode 100644 index 858535c3c..000000000 --- a/tests/test-model-load-cancel.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#include "llama.h" -#include "get-model.h" - -#include <cstdlib> - -int main(int argc, char *argv[] ) { - auto * model_path = get_model_or_exit(argc, argv); - auto * file = fopen(model_path, "r"); - if (file == nullptr) { - fprintf(stderr, "no model at '%s' found\n", model_path); - return EXIT_FAILURE; - } - - fprintf(stderr, "using '%s'\n", model_path); - fclose(file); - - llama_backend_init(); - auto params = llama_model_params{}; - params.use_mmap = false; - params.progress_callback = [](float progress, void * ctx){ - (void) ctx; - return progress > 0.50; - }; - auto * model = llama_load_model_from_file(model_path, params); - llama_backend_free(); - return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE; -} diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp deleted file mode 100644 index 546ca230b..000000000 --- a/tests/test-opt.cpp +++ /dev/null @@ -1,181 +0,0 @@ -#include "ggml.h" - -#include <cmath> -#include <cstdio> -#include <cstdlib> -#include <cassert> - -#define MAX_NARGS 2 - -#if defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wdouble-promotion" -#endif - -// -// logging -// -#define GGML_DEBUG 0 -#if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG(...) -#endif - -#if (GGML_DEBUG >= 5) -#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_5(...) 
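// [Editor's note] The tiered GGML_PRINT_DEBUG* macros above compile to nothing
// unless the hard-coded `#define GGML_DEBUG 0` is raised; bumping it to 1, 5,
// or 10 enables progressively more verbose printf tracing in this test.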
-#endif - -#if (GGML_DEBUG >= 10) -#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_10(...) -#endif - -#define GGML_PRINT(...) printf(__VA_ARGS__) - - -static float frand(void) { - return (float)rand()/(float)RAND_MAX; -} - -static struct ggml_tensor * get_random_tensor( - struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax -) { - struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); - - switch (ndims) { - case 1: - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; - } - break; - case 2: - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - break; - case 3: - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - } - break; - case 4: - for (int i3 = 0; i3 < ne[3]; i3++) { - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - } - } - break; - default: - assert(false); - } - - return result; -} - -int main(void) { - struct ggml_init_params params = { - /* .mem_size = */ 1024*1024*1024, - /* .mem_buffer = */ NULL, - /* .no_alloc = */ false, - }; - - struct ggml_context * ctx = ggml_init(params); - - int64_t ne1[4] = {4, 128, 1, 1}; - int64_t ne2[4] = {4, 256, 1, 1}; - int64_t ne3[4] = {128, 256, 1, 1}; - - struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1); - struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1); - ggml_set_param(ctx, a); - ggml_set_param(ctx, b); - - struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1); - - struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b); - struct ggml_tensor * d = ggml_sub(ctx, c, ab); - struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d)); - - struct ggml_cgraph * ge = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true); - ggml_build_forward_expand(ge, e); - ggml_graph_reset(ge); - - ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1); - - const float fe = ggml_get_f32_1d(e, 0); - printf("%s: e = %.4f\n", __func__, fe); - - struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); - - ggml_opt(ctx, opt_params, e); - - ggml_graph_reset(ge); - - ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1); - - const float fe_opt = ggml_get_f32_1d(e, 0); - printf("%s: original e = %.4f\n", __func__, fe); - printf("%s: optimized e = %.4f\n", __func__, fe_opt); - - const bool success = (fe_opt <= fe); - assert(success); - - ggml_free(ctx); - return success ? 
0 : -1; -} -// int64_t ne1[4] = {4, 128, 1, 1}; -// int64_t ne2[4] = {4, 256, 1, 1};; -// int64_t ne3[4] = {128, 256, 1, 1}; -// main: original e = 25890.9375 -// main: optimized e = 10094.7031 - -// int64_t ne1[4] = {8, 128, 1, 1}; -// int64_t ne2[4] = {8, 256, 1, 1};; -// int64_t ne3[4] = {128, 256, 1, 1}; -// main: original e = 39429.5078 -// main: optimized e = 9275.8936 - -// int64_t ne1[4] = {16, 128, 1, 1}; -// int64_t ne2[4] = {16, 256, 1, 1};; -// int64_t ne3[4] = {128, 256, 1, 1}; -// main: original e = 68371.1328 -// main: optimized e = 7854.4502 - - -// int64_t ne1[4] = {32, 128, 1, 1}; -// int64_t ne2[4] = {32, 256, 1, 1};; -// int64_t ne3[4] = {128, 256, 1, 1}; -// main: original e = 126061.1953 -// main: optimized e = 5451.0166 - -// int64_t ne1[4] = {4, 1024, 1, 1}; -// int64_t ne2[4] = {4, 2048, 1, 1};; -// int64_t ne3[4] = {1024, 2048, 1, 1}; -// main: original e = 1620817.8750 -// main: optimized e = 698387.6875 - -// another run on M1 -// int64_t ne1[4] = {4, 1024, 1, 1}; -// int64_t ne2[4] = {4, 2048, 1, 1};; -// int64_t ne3[4] = {1024, 2048, 1, 1}; -// main: original e = 1629595.6250 -// main: optimized e = 698169.1250 - -// int64_t ne1[4] = {32, 1024, 1, 1}; -// int64_t ne2[4] = {32, 2048, 1, 1};; -// int64_t ne3[4] = {1024, 2048, 1, 1}; -// main: original e = 8146770.5000 -// main: optimized e = 651119.1250 diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp deleted file mode 100644 index c97458d1d..000000000 --- a/tests/test-quantize-fns.cpp +++ /dev/null @@ -1,185 +0,0 @@ -// Unit tests for quantization specific functions - quantize, dequantize and dot product - -#include "ggml.h" - -#undef NDEBUG -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; -constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; -constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; -constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; -constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f; -constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f; -constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f; - -static const char* RESULT_STR[] = {"ok", "FAILED"}; - - -// Generate synthetic data -static void generate_data(float offset, size_t n, float * dst) { - for (size_t i = 0; i < n; i++) { - dst[i] = 0.1 + 2*cosf(i + offset); - } -} - -// Calculate RMSE between two float arrays -static float array_rmse(const float * a1, const float * a2, size_t n) { - double sum = 0; - for (size_t i = 0; i < n; i++) { - double diff = a1[i] - a2[i]; - sum += diff * diff; - } - return sqrtf(sum) / n; -} - -// Total quantization error on test data -static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { - std::vector tmp_q(2*test_size); - std::vector tmp_out(test_size); - - qfns.from_float(test_data, tmp_q.data(), test_size); - qfns.to_float(tmp_q.data(), tmp_out.data(), test_size); - return array_rmse(test_data, tmp_out.data(), test_size); -} - -// Total quantization error on test data -static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) { - std::vector tmp_q(2*test_size); - std::vector tmp_out(test_size); - std::vector tmp_out_ref(test_size); - - qfns.from_float(test_data, tmp_q.data(), test_size); - qfns.to_float(tmp_q.data(), tmp_out.data(), test_size); - - qfns.from_float_ref(test_data, tmp_q.data(), 
test_size); - qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size); - - return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size); -} - -static float dot_product(const float * a1, const float * a2, size_t test_size) { - double sum = 0; - for (size_t i = 0; i < test_size; i++) { - sum += a1[i] * a2[i]; - } - return sum; -} - -// Total dot product error -static float dot_product_error( - ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2 -) { - std::vector tmp_q1(2*test_size); - std::vector tmp_q2(2*test_size); - - auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); - - qfns.from_float(test_data1, tmp_q1.data(), test_size); - vdot.from_float(test_data2, tmp_q2.data(), test_size); - - float result = INFINITY; - qfns.vec_dot(test_size, &result, 0, tmp_q1.data(), 0, tmp_q2.data(), 0, 1); - - const float dot_ref = dot_product(test_data1, test_data2, test_size); - - return fabsf(result - dot_ref) / test_size; -} - -int main(int argc, char * argv[]) { - bool verbose = false; - const size_t test_size = 32 * 128; - - std::string arg; - for (int i = 1; i < argc; i++) { - arg = argv[i]; - - if (arg == "-v") { - verbose = true; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - return 1; - } - } - - std::vector test_data(test_size); - std::vector test_data2(test_size); - - generate_data(0.0, test_data.size(), test_data.data()); - generate_data(1.0, test_data2.size(), test_data2.data()); - - // Initialize GGML, ensures float conversion tables are initialized - struct ggml_init_params ggml_params = { - /* .mem_size = */ 1*1024, - /* .mem_buffer = */ NULL, - /* .no_alloc = */ true, - }; - struct ggml_context * ctx = ggml_init(ggml_params); - - int num_failed = 0; - bool failed = false; - - for (int i = 0; i < GGML_TYPE_COUNT; i++) { - ggml_type type = (ggml_type) i; - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); - - // deprecated - skip - if (qfns.blck_size == 0) { - continue; - } - - const ggml_type ei = (ggml_type)i; - - printf("Testing %s\n", ggml_type_name((ggml_type) i)); - ggml_quantize_init(ei); - - if (qfns.from_float && qfns.to_float) { - const float total_error = total_quantization_error(qfns, test_size, test_data.data()); - const float max_quantization_error = - type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : - type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : - type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : - type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : - type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR; - failed = !(total_error < max_quantization_error); - num_failed += failed; - if (failed || verbose) { - printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error); - } - - const float reference_error = reference_quantization_error(qfns, test_size, test_data.data()); - failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR); - num_failed += failed; - if (failed || verbose) { - printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error); - } - - const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data()); - const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || - type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S - ? 
MAX_DOT_PRODUCT_ERROR_LOWBIT - : MAX_DOT_PRODUCT_ERROR; - failed = !(vec_dot_error < max_allowed_error); - num_failed += failed; - if (failed || verbose) { - printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error); - } - } - } - - if (num_failed || verbose) { - printf("%d tests failed\n", num_failed); - } - - ggml_free(ctx); - - return num_failed > 0; -} diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp deleted file mode 100644 index 24e066053..000000000 --- a/tests/test-quantize-perf.cpp +++ /dev/null @@ -1,363 +0,0 @@ -// Benchmark quantization specific functions on synthetic data - -#include "ggml.h" - -#undef NDEBUG -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -#define MAX_ALIGNMENT 64 -#define QK 32 -#define WARMUP 5 -#define ITERATIONS 10 -#define MAX_ITERATIONS 100000000 - -#define L1_SIZE 32*128 -#define L2_SIZE 32*2048 -#define L3_SIZE 32*20480 -#define MEM_SIZE 32*2048000 - -struct quantize_perf_params { - std::vector include_types; - std::vector test_sizes; - size_t alignment_offset = 0; - bool op_quantize_row_q_reference = false; - bool op_quantize_row_q = false; - bool op_dequantize_row_q = false; - bool op_quantize_row_q_dot = false; - bool op_vec_dot_q = false; - int64_t iterations = ITERATIONS; -}; - -#if defined(__x86_64__) || defined(__i386__) - -#include -inline int64_t cpu_cycles() { -// Rough way to detect new-ish CPUs -#ifdef __POPCNT__ - unsigned int dummy; - return __rdtscp(&dummy); -#else - return __rdtsc(); -#endif -} - -#else - -#define cpu_cycles() 0 - -#endif - - -// Generate synthetic data -static void generate_data(float offset, size_t n, float * dst) { - for (size_t i = 0; i < n; i++) { - dst[i] = 0.1 + 2*cosf(i + offset); - } -} - -static float gigabytes_per_second(size_t bytes, int64_t usecs) { - return bytes / (float) usecs * 1000000 / (1024*1024*1024); -} - -static void * align_with_offset(void * ptr, int offset) { - size_t dummy_size = MAX_ALIGNMENT * 4; - return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset; -} - -static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function & func) { - int64_t min_time_us = INT64_MAX; - int64_t total_time_us = 0; - int64_t min_time_cycles = INT64_MAX; - int64_t total_time_cycles = 0; - - for (int i = 0; i < WARMUP; i++) { - func(); - } - - for (int i = 0; i < iterations; i++) { - const int64_t start_time = ggml_time_us(); - const int64_t start_cycles = cpu_cycles(); - - func(); - - const int64_t end_cycles = cpu_cycles(); - const int64_t end_time = ggml_time_us(); - - total_time_cycles += end_cycles - start_cycles; - min_time_cycles = std::min(min_time_cycles, end_cycles - start_cycles); - total_time_us += end_time - start_time; - min_time_us = std::min(min_time_us, end_time - start_time); - } - - printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size); - printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations)); - printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * iterations, total_time_us)); - printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us)); -} - -static void usage(char * argv[]) { - printf("Benchmark quantization specific functions on synthetic data\n"); - printf("\n"); - printf("usage: %s 
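// The measurement loop the deleted benchmark_function above implements,
// in miniature: a few untimed warm-up passes, then timed passes tracking
// both the minimum and the mean. Assumes ggml_time_init() has been called
// once before ggml_time_us(), as in the other deleted tests.
#include "ggml.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <functional>

static void time_function(int warmup, int iterations, const std::function<void(void)> & fn) {
    for (int i = 0; i < warmup; i++) {
        fn(); // untimed: warm caches and branch predictors
    }
    int64_t min_us   = INT64_MAX;
    int64_t total_us = 0;
    for (int i = 0; i < iterations; i++) {
        const int64_t t0 = ggml_time_us();
        fn();
        const int64_t t1 = ggml_time_us();
        total_us += t1 - t0;
        min_us    = std::min(min_us, t1 - t0);
    }
    // min is the least-noisy figure; avg reflects sustained throughput
    printf("min: %9.2f us  avg: %9.2f us\n", (double) min_us, total_us / (double) iterations);
}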
[options]\n", argv[0]); - printf("\n"); - printf("options: (default)\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE); - printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE); - printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE); - printf(" --op OP set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n"); - printf(" quantize_row_q_dot, vec_dot_q (all)\n"); - printf(" --type TYPE set test type as"); - for (int i = 0; i < GGML_TYPE_COUNT; i++) { - ggml_type type = (ggml_type) i; - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); - if (ggml_type_name(type) != NULL) { - if (qfns.from_float && qfns.to_float) { - printf(" %s", ggml_type_name(type)); - } - } - } - printf(" (all)\n"); - printf(" --alignment-offset OFFSET\n"); - printf(" set alignment offset as OFFSET (0)\n"); - printf(" -i NUM, --iterations NUM\n"); - printf(" set test iteration number (%d)\n", ITERATIONS); -} - -int main(int argc, char * argv[]) { - quantize_perf_params params {}; - - // read command line - - bool invalid_param = false; - std::string arg; - for (int i = 1; i < argc; i++) { - arg = argv[i]; - - if (arg == "--size") { - if (++i >= argc) { - invalid_param = true; - break; - } - size_t size = std::stoi(argv[i]); - if (size % 32 != 0) { - fprintf(stderr, "error: size %zu not divisible by 32\n", size); - invalid_param = true; - break; - } - params.test_sizes.push_back(size); - } else if (arg == "-3") { - // quick select sizes that probably fit in CPU caches - params.test_sizes.push_back(L1_SIZE); - params.test_sizes.push_back(L2_SIZE); - params.test_sizes.push_back(L3_SIZE); - } else if (arg == "-4") { - // quick select cache sizes + memory - params.test_sizes.push_back(L1_SIZE); - params.test_sizes.push_back(L2_SIZE); - params.test_sizes.push_back(L3_SIZE); - params.test_sizes.push_back(MEM_SIZE); - } else if (arg == "--op") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::string op {argv[i]}; - if (op == "quantize_row_q_reference") { - params.op_quantize_row_q_reference = true; - } else if (op == "quantize_row_q") { - params.op_quantize_row_q = true; - } else if (op == "dequantize_row_q") { - params.op_dequantize_row_q = true; - } else if (op == "quantize_row_q_dot") { - params.op_quantize_row_q_dot = true; - } else if (op == "vec_dot_q") { - params.op_vec_dot_q = true; - } else { - invalid_param = true; - break; - } - } else if (arg == "--type") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.include_types.push_back(argv[i]); - } else if (arg == "--alignment-offset") { - if (++i >= argc) { - invalid_param = true; - break; - } - int alignment = std::stoi(argv[i]); - if (alignment < 0 || alignment > MAX_ALIGNMENT) { - fprintf(stderr, "error: alignment-offset must be less than %d\n", MAX_ALIGNMENT); - invalid_param = true; - break; - } - params.alignment_offset = alignment; - } else if ((arg == "-i") || (arg == "--iterations")) { - if (++i >= argc) { - invalid_param = true; - break; - } - int number = std::stoi(argv[i]); - if (number < 0 || number > MAX_ITERATIONS) { - fprintf(stderr, "error: iterations must be less than %d\n", MAX_ITERATIONS); - invalid_param = true; - break; - } - params.iterations = number; - } else if ((arg == "-h") || (arg == "--help")) { - usage(argv); - return 1; - } else { - fprintf(stderr, "error: unknown argument: 
%s\n", arg.c_str()); - return 1; - } - } - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - return 1; - } - - if (params.test_sizes.empty()) { - params.test_sizes.push_back(L1_SIZE); - } - if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) { - params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true; - } - - std::sort(params.test_sizes.begin(), params.test_sizes.end()); - size_t largest = params.test_sizes.back(); - - std::vector test_data1_v(largest*4 + MAX_ALIGNMENT*2); - std::vector test_data2_v(largest*4 + MAX_ALIGNMENT*2); - std::vector test_q1_v (largest*4 + MAX_ALIGNMENT*2); - std::vector test_q2_v (largest*4 + MAX_ALIGNMENT*2); - std::vector test_out_v (largest*4 + MAX_ALIGNMENT*2); - - float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset); - float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset); - float * test_q1 = (float *) align_with_offset(test_q1_v.data(), params.alignment_offset); - float * test_q2 = (float *) align_with_offset(test_q2_v.data(), params.alignment_offset); - float * test_out = (float *) align_with_offset(test_out_v.data(), params.alignment_offset); - - generate_data(0, largest, test_data1); - generate_data(1, largest, test_data2); - - int64_t iterations = params.iterations; - - - // Initialize GGML, ensures float conversion tables are initialized - struct ggml_init_params ggml_params = { - /* .mem_size = */ 1*1024, - /* .mem_buffer = */ NULL, - /* .no_alloc = */ true, - }; - struct ggml_context * ctx = ggml_init(ggml_params); - - for (int i = 0; i < GGML_TYPE_COUNT; i++) { - ggml_type type = (ggml_type) i; - ggml_type_traits_t qfns = ggml_internal_get_type_traits(type); - if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) { - continue; - } - - if (qfns.from_float && qfns.to_float) { - printf("%s\n", ggml_type_name(type)); - - ggml_quantize_init(type); - - if (params.op_quantize_row_q_reference) { - printf(" quantize_row_q_reference\n"); - for (size_t size : params.test_sizes) { - printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); - auto quantize_fn = [&](void) -> float { - qfns.from_float_ref(test_data1, test_q1, size); - return test_q1[0]; - }; - size_t quantized_size = ggml_row_size(type, size); - benchmark_function(size, quantized_size, iterations, quantize_fn); - } - printf("\n"); - } - - if (params.op_quantize_row_q) { - printf(" quantize_row_q\n"); - for (size_t size : params.test_sizes) { - printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); - auto quantize_fn = [&](void) -> float { - qfns.from_float(test_data1, test_q1, size); - return test_q1[0]; - }; - size_t quantized_size = ggml_row_size(type, size); - benchmark_function(size, quantized_size, iterations, quantize_fn); - } - printf("\n"); - } - - if (params.op_dequantize_row_q) { - printf(" dequantize_row_q\n"); - qfns.from_float(test_data1, test_q1, largest); - for (size_t size : params.test_sizes) { - printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); - auto quantize_fn = [&](void) -> float { - qfns.to_float(test_q1, test_out, size); - return test_out[0]; - }; - size_t quantized_size = 
ggml_row_size(type, size); - benchmark_function(size, quantized_size, iterations, quantize_fn); - } - printf("\n"); - } - - if (params.op_quantize_row_q_dot) { - printf(" quantize_row_q_dot\n"); - for (size_t size : params.test_sizes) { - printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); - auto quantize_fn = [&](void) -> float { - auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type); - vdot.from_float(test_data1, test_q1, size); - return test_q1[0]; - }; - size_t quantized_size = ggml_row_size(type, size); - benchmark_function(size, quantized_size, iterations, quantize_fn); - } - printf("\n"); - } - - if (params.op_vec_dot_q) { - printf(" vec_dot_q\n"); - qfns.from_float(test_data1, test_q1, largest); - qfns.from_float(test_data2, test_q2, largest); - for (size_t size : params.test_sizes) { - printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024)); - auto quantize_fn = [&](void) -> float { - float result; - qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1); - return result; - }; - size_t quantized_size = ggml_row_size(type, size); - benchmark_function(size, quantized_size, iterations, quantize_fn); - } - printf("\n"); - } - } - } - - ggml_free(ctx); - - return 0; -} diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp deleted file mode 100644 index 8159e276a..000000000 --- a/tests/test-rope.cpp +++ /dev/null @@ -1,220 +0,0 @@ -#include "ggml.h" - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -#if defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wdouble-promotion" -#endif - -#define MAX_NARGS 3 - -#undef MIN -#undef MAX -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - -#define GGML_SILU_FP16 - -// -// logging -// - -#if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG(...) -#endif - -#if (GGML_DEBUG >= 5) -#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_5(...) -#endif - -#if (GGML_DEBUG >= 10) -#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_10(...) -#endif - -#define GGML_PRINT(...) 
printf(__VA_ARGS__) - -static float frand(void) { - return (float)rand()/(float)RAND_MAX; -} - -static int irand(int n) { - if (n == 0) return 0; - return rand()%n; -} - -static void get_random_dims(int64_t * dims, int ndims) { - dims[0] = dims[1] = dims[2] = dims[3] = 1; - - for (int i = 0; i < ndims; i++) { - dims[i] = 1 + irand(4); - } -} - -static struct ggml_tensor * get_random_tensor_f32( - struct ggml_context * ctx0, - int ndims, - const int64_t ne[], - float fmin, - float fmax) { - struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne); - - switch (ndims) { - case 1: - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i0] = frand()*(fmax - fmin) + fmin; - } - break; - case 2: - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - break; - case 3: - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - } - break; - case 4: - for (int i3 = 0; i3 < ne[3]; i3++) { - for (int i2 = 0; i2 < ne[2]; i2++) { - for (int i1 = 0; i1 < ne[1]; i1++) { - for (int i0 = 0; i0 < ne[0]; i0++) { - ((float *)result->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand()*(fmax - fmin) + fmin; - } - } - } - } - break; - default: - assert(false); - }; - - return result; -} - -static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - -int main(int /*argc*/, const char ** /*argv*/) { - struct ggml_init_params params = { - /* .mem_size = */ 128*1024*1024, - /* .mem_buffer = */ NULL, - /* .no_alloc = */ false, - }; - - std::vector work_buffer; - - struct ggml_context * ctx0 = ggml_init(params); - - struct ggml_tensor * x; - - // rope f32 - for (int m = 0; m < 3; ++m) { - const int ndims = 4; - - const int64_t n_rot = 128; - const int64_t ne[4] = { 2*n_rot, 32, 73, 1 }; - - const int n_past_0 = 100; - const int n_past_2 = 33; - - struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]); - struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]); - struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]); - - for (int i = 0; i < ne[2]; ++i) { - ((int32_t *) p0->data)[i] = n_past_0 + i; - ((int32_t *) p1->data)[i] = n_past_2 - n_past_0; - ((int32_t *) p2->data)[i] = n_past_2 + i; - } - - // test mode 0, 2, 4 (standard, GPT-NeoX, GLM) - const int mode = m == 0 ? 0 : m == 1 ? 2 : 4; - - x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); - - // 100, 101, 102, ..., 172 - struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode); - // -67, -67, -67, ..., -67 - struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. 
forget n_past_0 - n_past_2 tokens - - // 33, 34, 35, ..., 105 - struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - ggml_build_forward_expand(gf, r0); - ggml_build_forward_expand(gf, r1); - ggml_build_forward_expand(gf, r2); - - ggml_graph_compute_helper(work_buffer, gf, 4); - - // check that r1 and r2 are the same - { - double sum0 = 0.0f; - double sum1 = 0.0f; - double diff = 0.0f; - - const float * r1_data = (float *) r1->data; - const float * r2_data = (float *) r2->data; - - const int n_elements = ggml_nelements(r1); - - for (int i = 0; i < n_elements; ++i) { - sum0 += fabs(r1_data[i]); - sum1 += fabs(r2_data[i]); - diff += fabs(r1_data[i] - r2_data[i]); - //if (fabs(r1_data[i] - r2_data[i]) > 0.0001f) { - // printf("%d: %f %f\n", i, r1_data[i], r2_data[i]); - // printf("diff: %f\n", fabs(r1_data[i] - r2_data[i])); - //} - } - - //for (int i = 4096; i < 4096 + 128; ++i) { - // printf("%f %f\n", r1_data[i], r2_data[i]); - //} - - printf("mode: %d\n", mode); - printf("sum0: %f\n", sum0); - printf("sum1: %f\n", sum1); - printf("diff: %f\n", diff); - printf("rel err: %f\n", diff / sum0); - printf("rel err: %f\n", diff / sum1); - - GGML_ASSERT(diff / sum0 < 0.0001f); - GGML_ASSERT(diff / sum1 < 0.0001f); - } - } - - ggml_free(ctx0); - - return 0; -} diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp deleted file mode 100644 index 6374958fe..000000000 --- a/tests/test-sampling.cpp +++ /dev/null @@ -1,301 +0,0 @@ -#include "ggml.h" -#include "llama.h" - -#ifdef NDEBUG -#undef NDEBUG -#endif - -#include -#include -#include -#include - -static void dump(const llama_token_data_array * candidates) { - for (size_t i = 0; i < candidates->size; i++) { - printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit); - } -} - -#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0) - -static void test_top_k(const std::vector & probs, const std::vector & expected_probs, int k) { - const size_t n_vocab = probs.size(); - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_sample_softmax(nullptr, &candidates_p); - DUMP(&candidates_p); - llama_sample_top_k(nullptr, &candidates_p, k, 1); - DUMP(&candidates_p); - - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5); - } -} - -static void test_top_p(const std::vector & probs, const std::vector & expected_probs, float p) { - const size_t n_vocab = probs.size(); - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_sample_softmax(nullptr, &candidates_p); - DUMP(&candidates_p); - llama_sample_top_p(nullptr, &candidates_p, p, 1); - DUMP(&candidates_p); - - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - 
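// A freestanding restatement of the property the deleted test-rope.cpp
// above checks, under the same ggml API it used: RoPE is additive in the
// position, so rotating by p0 and then by (p2 - p0) must match rotating
// by p2 directly. Shapes, positions and the fill pattern here are
// illustrative, not taken from the test.
#include "ggml.h"
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main(void) {
    struct ggml_init_params ip = {
        /* .mem_size   = */ 32*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx0 = ggml_init(ip);

    const int     mode  = 0;  // standard RoPE
    const int64_t n_rot = 32;
    const int64_t ne[4] = { 2*n_rot, 4, 3, 1 };

    struct ggml_tensor * x = ggml_new_tensor(ctx0, GGML_TYPE_F32, 4, ne);
    for (int64_t i = 0; i < ggml_nelements(x); ++i) {
        ((float *) x->data)[i] = sinf((float) i); // deterministic fill
    }

    struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
    struct ggml_tensor * pd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
    struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
    for (int i = 0; i < ne[2]; ++i) {
        ((int32_t *) p0->data)[i] = 100 + i;
        ((int32_t *) pd->data)[i] =  33 - 100; // constant shift p2 - p0
        ((int32_t *) p2->data)[i] =  33 + i;
    }

    struct ggml_tensor * r1 = ggml_rope(ctx0, ggml_rope(ctx0, x, p0, n_rot, mode), pd, n_rot, mode);
    struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);

    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
    ggml_build_forward_expand(gf, r1);
    ggml_build_forward_expand(gf, r2);

    struct ggml_cplan plan = ggml_graph_plan(gf, /*n_threads*/ 1);
    std::vector<uint8_t> work;
    if (plan.work_size > 0) {
        work.resize(plan.work_size);
        plan.work_data = work.data();
    }
    ggml_graph_compute(gf, &plan);

    double diff = 0.0, ref = 0.0;
    for (int64_t i = 0; i < ggml_nelements(r2); ++i) {
        diff += fabs(((float *) r1->data)[i] - ((float *) r2->data)[i]);
        ref  += fabs(((float *) r2->data)[i]);
    }
    printf("rel err: %f\n", diff / ref);
    ggml_free(ctx0);
    return diff / ref < 1e-4 ? 0 : 1;
}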
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); - } -} - -static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) { - const size_t n_vocab = probs.size(); - std::vector<llama_token_data> candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - DUMP(&candidates_p); - llama_sample_tail_free(nullptr, &candidates_p, z, 1); - DUMP(&candidates_p); - - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); - } -} - -static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) { - const size_t n_vocab = probs.size(); - std::vector<llama_token_data> candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - DUMP(&candidates_p); - llama_sample_min_p(nullptr, &candidates_p, p, 1); - DUMP(&candidates_p); - llama_sample_softmax(nullptr, &candidates_p); - - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); - } -} - -static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) { - const size_t n_vocab = probs.size(); - std::vector<llama_token_data> candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - DUMP(&candidates_p); - llama_sample_typical(nullptr, &candidates_p, p, 1); - DUMP(&candidates_p); - - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { - GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); - } -} - -static void test_repetition_penalties( - const std::vector<float> & probs, const std::vector<llama_token> & last_tokens, - const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence -) { - GGML_ASSERT(probs.size() == expected_probs.size()); - - const size_t n_vocab = probs.size(); - std::vector<llama_token_data> candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(probs[token_id]); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - llama_sample_softmax(nullptr, &candidates_p); - DUMP(&candidates_p); - llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence); - llama_sample_softmax(nullptr, &candidates_p); - DUMP(&candidates_p); - - GGML_ASSERT(candidates_p.size == expected_probs.size()); - for (size_t i = 0; i < candidates_p.size; i++) { -
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3); - } -} - -static void test_sampler_queue( - const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p -) { - std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) { - const float logit = logf(token_id); - candidates.emplace_back(llama_token_data{token_id, logit, 0.0f}); - } - - llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - - llama_token min_token_id = 0; - const llama_token max_token_id = n_vocab-1; - - for (auto s : samplers_sequence) { - switch (s){ - case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break; - case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break; - case 'y': GGML_ASSERT(false && "typical test not implemented"); break; - case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break; - case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break; - case 't': GGML_ASSERT(false && "temperature test not implemented"); break; - default : GGML_ASSERT(false && "Unknown sampler"); break; - } - - llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests - - const int size = candidates_p.size; - - if (s == 'k') { - const int expected_size = std::min(size, top_k); - min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k)); - - GGML_ASSERT(size == expected_size); - GGML_ASSERT(candidates_p.data[0].id == max_token_id); - GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id); - } else if (s == 'p') { - const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2; - const int softmax_numerator_target = ceilf(top_p * softmax_divisor); - - min_token_id = n_vocab; - int expected_size = 0; - int cumsum = 0; - do { // do-while because always at least one token is sampled - min_token_id--; - expected_size++; - - cumsum += min_token_id; - } while (cumsum < softmax_numerator_target); - - // token 0 has p == 0, need special consideration for cumsum because top_p immediately returns - if (min_token_id == 1) { - min_token_id--; - expected_size += 1; - } - - GGML_ASSERT(size == expected_size); - GGML_ASSERT(candidates_p.data[0].id == max_token_id); - GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id); - } else if (s == 'm') { - int expected_size = ceilf((1.0f-min_p) * n_vocab); - expected_size = std::max(expected_size, 1); - expected_size = std::min(expected_size, size); - - min_token_id = floorf(min_p * n_vocab); - min_token_id = std::max(min_token_id, 1); - min_token_id = std::max(min_token_id, (llama_token)(n_vocab - size)); - min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1)); - - GGML_ASSERT(size == expected_size); - GGML_ASSERT(candidates_p.data[0].id == max_token_id); - GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id); - } else { - GGML_ASSERT(false); - } - } - - printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n", - samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p); -} - -int main(void) { - ggml_time_init(); - - test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1); - test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3); - test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4); - test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0); - - test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0); - test_top_p({0.1f, 
0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f); - test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f); - test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1); - - test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f); - test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f); - test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.26f); - test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.49f); - test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.51f); - test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.74f); - test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f); - test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f); - - test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); - test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f); - test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f); - - test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f); - test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f); - - test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f); - test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f); - test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f); - - test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f); - test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f); - test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f); - - test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f); - test_sampler_queue(10000, "k", 1, 1.0f, 1.0f); - test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f); - test_sampler_queue(10000, "p", 10000, 0.0f, 1.0f); - test_sampler_queue(10000, "m", 10000, 1.0f, 1.0f); - test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12); - - test_sampler_queue(10000, "k", 100, 1.0000f, 1.0f); - test_sampler_queue(10000, "p", 10000, 0.0002f, 1.0f); - test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f); - test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f); - test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f); - - test_sampler_queue(10000, "kp", 100, 0.8f, 0.1f); - test_sampler_queue(10000, "km", 100, 0.8f, 0.1f); - test_sampler_queue(10000, "pk", 100, 0.8f, 0.1f); - test_sampler_queue(10000, "pm", 100, 0.8f, 0.1f); - test_sampler_queue(10000, "mk", 100, 0.8f, 0.1f); - test_sampler_queue(10000, "mp", 100, 0.8f, 9997.9f/9999.0f); - test_sampler_queue(10000, "mp", 100, 0.8f, 0.1f); - - test_sampler_queue(10000, "kpm", 100, 0.8f, 0.1f); - test_sampler_queue(10000, "kmp", 100, 0.8f, 0.1f); - test_sampler_queue(10000, "pkm", 100, 0.8f, 0.1f); - test_sampler_queue(10000, "pmk", 100, 0.8f, 0.1f); - test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f); - test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f); - - printf("OK\n"); - - return 0; -} diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp deleted file mode 100644 index d3d21331b..000000000 --- a/tests/test-tokenizer-0.cpp +++ /dev/null @@ -1,292 +0,0 @@ -#include "llama.h" -#include "common.h" -#include "console.h" - -#include -#include -#include -#include 
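// The harness pattern behind the deleted test-sampling.cpp above, in
// miniature: candidates are synthesized from a probability list via
// logf(), so no model is needed, and each sampler is checked against the
// subset it should keep. Assumes the llama.h sampling API as used in the
// deleted test, including that these samplers accept a NULL context.
#include "llama.h"
#include <cassert>
#include <cmath>
#include <vector>

static void check_top_k_keeps(const std::vector<float> & probs, int k, size_t expected_kept) {
    std::vector<llama_token_data> candidates;
    candidates.reserve(probs.size());
    for (llama_token id = 0; id < (llama_token) probs.size(); id++) {
        candidates.push_back(llama_token_data{ id, logf(probs[id]), 0.0f });
    }
    llama_token_data_array arr = { candidates.data(), candidates.size(), false };

    llama_sample_softmax(nullptr, &arr);     // sort by probability, fill .p
    llama_sample_top_k(nullptr, &arr, k, 1); // keep the k most likely
    assert(arr.size == expected_kept);
}

// usage mirroring the deleted cases: check_top_k_keeps({0.1f, 0.2f, 0.3f, 0.4f}, 3, 3);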
-#include - -//static const std::map> & k_tests() { -// static std::map> _k_tests = { -// { "" , { }, }, -// { " " , { 220, }, }, -// { " " , { 256, }, }, -// { " " , { 262, }, }, -// { "\t" , { 197, }, }, -// { "\n" , { 198, }, }, -// { "\n\n" , { 271, }, }, -// { "\n\n\n" , { 1432, }, }, -// { "\t\n" , { 1602, }, }, -// { "Hello world" , { 9906, 1917, }, }, -// { " Hello world" , { 22691, 1917, }, }, -// { "Hello World" , { 9906, 4435, }, }, -// { " Hello World" , { 22691, 4435, }, }, -// { " Hello World!" , { 22691, 4435, 0, }, }, -// { "Hello, world!" , { 9906, 11, 1917, 0, }, }, -// { " Hello, world!" , { 22691, 11, 1917, 0, }, }, -// { " this is 🦙.cpp" , { 420, 374, 11410, 99, 247, 13, 11055, }, }, -// { "w048 7tuijk dsdfhu" , { 86, 23904, 220, 22, 83, 2005, 42908, 11729, 3013, 17156, }, }, -// { "нещо на Български" , { 79862, 102118, 13373, 64571, 34694, 3114, 112203, 80112, }, }, -// { "កាន់តែពិសេសអាចខលចេញ" , { 21549, 222, 98629, 241, 45358, 233, 21549, 237, 45358, 224, 21549, 244, 21549, 115, 21549, 253, 45358, 223, 21549, 253, 21549, 95, 98629, 227, 21549, 223, 21549, 249, 21549, 227, 45358, 223, 21549, 231, }, }, -// { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 9468, 248, 222, 320, 8416, 8, 27623, 114, 102470, 9468, 234, 104, 31643, 320, 36773, 100166, 98634, 8, 26602, 227, 320, 3323, 43465, 430, 706, 1202, 1866, 4037, 8, }, }, -// { "Hello" , { 9906, }, }, -// { " Hello" , { 22691, }, }, -// { " Hello" , { 220, 22691, }, }, -// { " Hello" , { 256, 22691, }, }, -// { " Hello" , { 262, 22691, }, }, -// { " Hello\n Hello" , { 262, 22691, 198, 262, 22691, }, }, -// { " (" , { 320, }, }, -// { "\n =" , { 198, 284, }, }, -// { "' era" , { 6, 11639, }, }, -// { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", { 9906, 11, 379, 65948, 0, 2650, 527, 499, 27623, 223, 949, 37046, 101067, 19000, 23182, 102301, 9263, 18136, 16, 36827, 21909, }, }, -// { "3" , { 18, }, }, -// { "33" , { 1644, }, }, -// { "333" , { 8765, }, }, -// { "3333" , { 8765, 18, }, }, -// { "33333" , { 8765, 1644, }, }, -// { "333333" , { 8765, 8765, }, }, -// { "3333333" , { 8765, 8765, 18, }, }, -// { "33333333" , { 8765, 8765, 1644, }, }, -// { "333333333" , { 8765, 8765, 8765, }, }, -// }; -// -// return _k_tests; -//} - -using llama_tests = std::map>; - -static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) { - llama_tests tests; - - std::ifstream ifs_inp(fname_inp); - if (!ifs_inp) { - fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str()); - return tests; - } - - std::string sraw((std::istreambuf_iterator(ifs_inp)), std::istreambuf_iterator()); - - std::ifstream ifs_out(fname_out); - if (!ifs_out) { - fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); - return tests; - } - - std::vector sout; - for (std::string line; std::getline(ifs_out, line);) { - sout.push_back(line); - } - - const std::string sep = "\n__ggml_vocab_test__\n"; - - std::vector sinp; - - size_t pos = 0; - while (pos < sraw.size()) { - const size_t next = sraw.find(sep, pos); - if (next == std::string::npos) { - sinp.push_back(sraw.substr(pos)); - break; - } - sinp.push_back(sraw.substr(pos, next - pos)); - pos = next + sep.size(); - } - - if (sinp.size() != sout.size()) { - fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__); - return tests; - } - - for (size_t i = 0; i < sinp.size(); ++i) { - const std::string & s = sinp[i]; - const std::string 
& o = string_strip(sout[i]); - - std::vector toks; - - size_t pos = 0; - while (pos < o.size()) { - size_t next = o.find(' ', pos); - if (next == std::string::npos) { - next = o.size(); - } - const std::string stok = o.substr(pos, next - pos); - toks.push_back(std::stoi(stok)); - pos = next + 1; - } - - tests[s] = toks; - } - - return tests; -} - -int main(int argc, char **argv) { - if (argc < 2) { - fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); - return 1; - } - - const std::string fname = argv[1]; - - const std::string fname_inp = fname + ".inp"; - const std::string fname_out = fname + ".out"; - - std::string fname_text; - if (argc > 2) { - fname_text = argv[2]; - } - - fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); - - llama_model * model; - llama_context * ctx; - - llama_backend_init(); - - // load the vocab - { - auto mparams = llama_model_default_params(); - - mparams.vocab_only = true; - - model = llama_load_model_from_file(fname.c_str(), mparams); - - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - return 1; - } - - auto cparams = llama_context_default_params(); - - ctx = llama_new_context_with_model(model, cparams); - - if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - llama_free_model(model); - return 1; - } - } - -#ifdef _WIN32 - // We need this for unicode console support - console::init(false, false); - atexit([]() { console::cleanup(); }); -#endif - - bool success = true; - - const auto k_tests = [&]() -> llama_tests { - if (!fname_text.empty()) { - return {}; - } - - const auto res = read_tests(fname_inp, fname_out); - - if (res.empty()) { - fprintf(stderr, "%s : error: no tests found\n", __func__); - exit(1); - } - - return res; - }(); - - const bool add_special = false; - - for (const auto & test_kv : k_tests) { - const std::vector res = llama_tokenize(ctx, test_kv.first, add_special, false); - - printf("\n"); - printf("src: '%s'\n", test_kv.first.c_str()); - printf("res: '%s'\n", llama_detokenize(ctx, res).c_str()); - printf("tok: "); - for (const auto & tok : res) { - printf("%d ", tok); - } - printf("\n"); - - bool correct = res.size() == test_kv.second.size(); - for (int i = 0; i < (int) res.size() && correct; ++i) { - if (test_kv.second[i] != res[i]) { - correct = false; - } - } - - if (!correct) { - fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); - fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, - llama_detokenize(ctx, res).c_str(), - llama_detokenize(ctx, test_kv.second).c_str()); - fprintf(stderr, "%s : expected tokens: ", __func__); - for (const auto & t : test_kv.second) { - fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); - } - fprintf(stderr, "\n"); - fprintf(stderr, "%s : got tokens: ", __func__); - for (const auto & t : res) { - fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); - } - fprintf(stderr, "\n"); - - success = false; - } - } - - if (!fname_text.empty()) { - fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); - - std::string text; - { - std::ifstream ifs(fname_text); - if (!ifs) { - fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); - return 1; - } - text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); - } - - fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); - - std::vector res; - - { - 
const auto t_start = ggml_time_us(); - - res = llama_tokenize(ctx, text, add_special, false); - - const auto t_end = ggml_time_us(); - - fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0); - } - - fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); - - { - const std::string fname_out = fname_text + ".tokcpp"; - - std::ofstream ofs(fname_out); - if (!ofs) { - fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); - return 1; - } - - for (const auto & tok : res) { - //ofs << tok << " '" << string_strip(llama_detokenize(ctx, std::vector{tok})) << "'" << std::endl; - ofs << tok << "\n"; - } - } - - fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); - } - - llama_free_model(model); - llama_free(ctx); - - llama_backend_free(); - - printf("\n"); - printf("Tests %s\n", success ? "passed" : "failed"); - - return success ? 0 : 3; -} diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py deleted file mode 100644 index cd760d1ce..000000000 --- a/tests/test-tokenizer-0.py +++ /dev/null @@ -1,46 +0,0 @@ -import time -import argparse - -from transformers import AutoTokenizer - -parser = argparse.ArgumentParser() -parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") -parser.add_argument("--fname-tok", help="path to a text file to tokenize", required=True) -args = parser.parse_args() - -dir_tokenizer = args.dir_tokenizer -fname_tok = args.fname_tok - -tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) - -print('tokenizing file: ', fname_tok) # noqa: NP100 -fname_out = fname_tok + '.tok' -with open(fname_tok, 'r', encoding='utf-8') as f: - lines = f.readlines() - s = ''.join(lines) - t_start = time.time() - res = tokenizer.encode(s, add_special_tokens=False) - t_end = time.time() - print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100 - with open(fname_out, 'w', encoding='utf-8') as f: - for x in res: - # LLaMA v3 for some reason strips the space for these tokens (and others) - # if x == 662: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 1174: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 2564: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 758: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 949: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # elif x == 5354: - # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') - # else: - # f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') - # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n') - f.write(str(x) + '\n') - print('len(res): ', len(res)) # noqa: NP100 - print('len(lines): ', len(lines)) # noqa: NP100 -print('results written to: ', fname_out) # noqa: NP100 diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh deleted file mode 100755 index 4d2b83655..000000000 --- a/tests/test-tokenizer-0.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -# -# Usage: -# -# test-tokenizer-0.sh -# - -if [ $# -ne 2 ]; then - printf "Usage: $0 \n" - exit 1 -fi - -name=$1 -input=$2 - -make -j tests/test-tokenizer-0 - -printf "Testing %s on %s ...\n" $name $input - -set -e - -printf "Tokenizing using (py) Python AutoTokenizer ...\n" -python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 - -printf "Tokenizing using (cpp) llama.cpp ...\n" 
-./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 - -cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" -cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" - -set +e - -diff $input.tok $input.tokcpp > /dev/null 2>&1 - -if [ $? -eq 0 ]; then - printf "Tokenization is correct!\n" -else - diff $input.tok $input.tokcpp | head -n 32 - - printf "Tokenization differs!\n" -fi diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp deleted file mode 100644 index 9498387e0..000000000 --- a/tests/test-tokenizer-1-bpe.cpp +++ /dev/null @@ -1,152 +0,0 @@ -#include "llama.h" -#include "common.h" -#include "unicode.h" -#include "console.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char **argv) { - if (argc < 2 || argc > 3) { - fprintf(stderr, "Usage: %s [--ignore-merges]\n", argv[0]); - return 1; - } - - const std::string fname = argv[1]; - bool ignore_merges = false; - if (argc == 3) { - if (std::strcmp(argv[2], "--ignore-merges") != 0) { - fprintf(stderr, "Usage: %s [--ignore-merges]\n", argv[0]); - return 1; - } - ignore_merges = true; - } - - fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); - - if (ignore_merges) { - fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__); - } - - llama_model * model; - llama_context * ctx; - - llama_backend_init(); - - // load the vocab - { - auto mparams = llama_model_default_params(); - - mparams.vocab_only = true; - - model = llama_load_model_from_file(fname.c_str(), mparams); - - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - return 1; - } - - auto cparams = llama_context_default_params(); - - ctx = llama_new_context_with_model(model, cparams); - - if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - llama_free_model(model); - return 1; - } - } - - //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE); - if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) { - return 99; - } - -#ifdef _WIN32 - // We need this for unicode console support - console::init(false, false); - atexit([]() { console::cleanup(); }); -#endif - - const int n_vocab = llama_n_vocab(model); - - for (int i = 0; i < n_vocab; ++i) { - std::string str = llama_detokenize(ctx, std::vector(1, i)); - try { - auto cps = unicode_cpts_from_utf8(str); - std::vector tokens = llama_tokenize(ctx, str, false, true); - if (ignore_merges && tokens.size() > 1) { - fprintf(stderr, - "%s : error: token %d detokenizes to '%s'(%zu) but " - "tokenization of this to multiple tokens: [", - __func__, i, str.c_str(), str.length()); - fprintf(stderr, "%d", tokens[0]); - for (size_t i = 1; i < tokens.size(); i++) { - fprintf(stderr, ", %d", tokens[i]); - } - fprintf(stderr, "]\n"); - return 2; - } - std::string check = llama_detokenize(ctx, tokens); - if (check != str) { - fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n", - __func__, i, str.c_str(), str.length(), check.c_str(), check.length()); - return 2; - } - } - catch (const std::invalid_argument &) { - //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str()); - } - } - - // unicode - { - const int nthread = std::thread::hardware_concurrency(); - - std::vector threads(nthread); - - std::atomic_int errcode = {}; - - for (int i = 0; i < 
nthread; ++i) { - threads[i] = std::thread([i, nthread, ctx, &errcode]() { - for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) { - if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs} - (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn} - continue; - } - - std::string str = unicode_cpt_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false); - std::string check = llama_detokenize(ctx, tokens); - if (cp != 9601 && str != check) { - fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - cp, check.c_str(), check.length(), str.c_str(), str.length()); - errcode = 3; - } - } - }); - } - - for (auto & t : threads) { - t.join(); - } - - if (errcode) { - return errcode; - } - } - - llama_free_model(model); - llama_free(ctx); - - llama_backend_free(); - - return 0; -} diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp deleted file mode 100644 index 7ca9e2ca6..000000000 --- a/tests/test-tokenizer-1-spm.cpp +++ /dev/null @@ -1,122 +0,0 @@ -#include "llama.h" -#include "common.h" -#include "unicode.h" -#include "console.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int main(int argc, char ** argv) { - if (argc < 2) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return 1; - } - - const std::string fname = argv[1]; - - fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); - - llama_model * model; - llama_context * ctx; - - llama_backend_init(); - - // load the vocab - { - auto mparams = llama_model_default_params(); - - mparams.vocab_only = true; - - model = llama_load_model_from_file(fname.c_str(), mparams); - - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - return 1; - } - - auto cparams = llama_context_default_params(); - - ctx = llama_new_context_with_model(model, cparams); - - if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - llama_free_model(model); - return 1; - } - } - - //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); - if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) { - return 99; - } - -#ifdef _WIN32 - // We need this for unicode console support - console::init(false, false); - atexit([]() { console::cleanup(); }); -#endif - - const int n_vocab = llama_n_vocab(model); - - for (int i = 0; i < n_vocab; ++i) { - std::string str = llama_detokenize(ctx, std::vector(1, i), true); - std::vector tokens = llama_tokenize(ctx, str, false, true); - std::string check = llama_detokenize(ctx, tokens); - if (check != str) { - fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n", - __func__, i, str.c_str(), str.length(), check.c_str(), check.length()); - return 2; - } - } - - // unicode - { - const int nthread = std::thread::hardware_concurrency(); - - std::vector threads(nthread); - - std::atomic_int errcode = {}; - - for (int i = 0; i < nthread; ++i) { - threads[i] = std::thread([i, nthread, ctx, &errcode]() { - for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) { - if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs} - (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn} - continue; - } - - std::string str = unicode_cpt_to_utf8(cp); - std::vector tokens = llama_tokenize(ctx, str, false, true); - std::string check = llama_detokenize(ctx, tokens); - if (cp != 9601 && str != check) { - 
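// Both deleted tokenizer-1 tests above reduce to one invariant: for every
// string the vocab can produce, detokenize(tokenize(s)) must give back s.
// The core check, assuming the common.h helpers used throughout these
// tests (ctx comes from a vocab-only model load as in the code above):
#include "llama.h"
#include "common.h"
#include <cstdio>
#include <string>
#include <vector>

static bool round_trips(llama_context * ctx, const std::string & str) {
    const std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
    const std::string check = llama_detokenize(ctx, tokens);
    if (check != str) {
        fprintf(stderr, "round trip failed: '%s' -> '%s'\n", str.c_str(), check.c_str());
        return false;
    }
    return true;
}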
fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", - cp, check.c_str(), check.length(), str.c_str(), str.length()); - errcode = 3; - } - } - }); - } - - for (auto & t : threads) { - t.join(); - } - - if(errcode) { - return errcode; - } - } - - llama_free_model(model); - llama_free(ctx); - - llama_backend_free(); - - return 0; -} diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py deleted file mode 100644 index 9ebe6c891..000000000 --- a/tests/test-tokenizer-random.py +++ /dev/null @@ -1,566 +0,0 @@ -# Test libllama tokenizer == AutoTokenizer. -# Brute force random words/text generation. -# -# Sample usage: -# -# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe -# - -from __future__ import annotations - -import time -import logging -import argparse -import subprocess -import random -import unicodedata - -from pathlib import Path -from typing import Any, Iterator, cast -from typing_extensions import Buffer - -import cffi -from transformers import AutoTokenizer, PreTrainedTokenizer - - -logger = logging.getLogger("test-tokenizer-random") - - -class LibLlama: - - DEFAULT_PATH_LLAMA_H = "./include/llama.h" - DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"] - DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so" # CMakeLists.txt: BUILD_SHARED_LIBS ON - - def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None): - path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H - path_includes = path_includes or self.DEFAULT_PATH_INCLUDES - path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA - (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_includes, path_libllama) - self.lib.llama_backend_init() - - def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str) -> tuple[cffi.FFI, Any]: - cmd = ["gcc", "-O0", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="] - cmd += ["-I" + path for path in path_includes] + [path_llama_h] - res = subprocess.run(cmd, stdout=subprocess.PIPE) - assert (res.returncode == 0) - source = res.stdout.decode() - ffi = cffi.FFI() - if True: # workarounds for pycparser - source = "typedef struct { } __builtin_va_list;" + "\n" + source - source = source.replace("sizeof (int)", str(ffi.sizeof("int"))) - source = source.replace("sizeof (void *)", str(ffi.sizeof("void*"))) - source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t"))) - source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t"))) - ffi.cdef(source, override=True) - lib = ffi.dlopen(path_libllama) - return (ffi, lib) - - def model_default_params(self, **kwargs): - mparams = self.lib.llama_model_default_params() - for k, v in kwargs.items(): - setattr(mparams, k, v) - return mparams - - def context_default_params(self, **kwargs): - cparams = self.lib.llama_context_default_params() - for k, v in kwargs.items(): - setattr(cparams, k, v) - return cparams - - -class LibLlamaModel: - - def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}): - self.lib: Any = libllama.lib - self.ffi = libllama.ffi - if isinstance(mparams, dict): - mparams = libllama.model_default_params(**mparams) - self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams) - if not self.model: - raise RuntimeError("error: failed to load model '%s'" % path_model) - if isinstance(cparams, dict): - cparams = 
-class LibLlamaModel:
-
-    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
-        self.lib: Any = libllama.lib
-        self.ffi = libllama.ffi
-        if isinstance(mparams, dict):
-            mparams = libllama.model_default_params(**mparams)
-        self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
-        if not self.model:
-            raise RuntimeError("error: failed to load model '%s'" % path_model)
-        if isinstance(cparams, dict):
-            cparams = libllama.context_default_params(**cparams)
-        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
-        if not self.ctx:
-            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
-        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
-        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
-        self.text_buff = self.ffi.new("uint8_t[]", 1024)
-
-    def free(self):
-        if self.ctx:
-            self.lib.llama_free(self.ctx)
-        if self.model:
-            self.lib.llama_free_model(self.model)
-        self.ctx = None
-        self.model = None
-        self.lib = None
-
-    def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]:
-        encoded_text: bytes = text.encode("utf-8")
-        num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
-        while num < 0 and len(self.token_ids) < (16 << 20):
-            self.token_ids = self.ffi.new("llama_token[]", -2 * num)
-            num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
-        return list(self.token_ids[0:num])
-
-    def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
-        if len(self.token_ids) < len(ids):
-            self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
-        for i, id in enumerate(ids):
-            self.token_ids[i] = id
-        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
-        while num < 0 and len(self.text_buff) < (16 << 20):
-            self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
-            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
-        return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
-
-
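LibLlamaModel's tokenize() and detokenize() rely on llama_tokenize() and llama_detokenize() returning the negated required size when the output buffer is too small; the wrapper then reallocates at twice the reported requirement and retries, capped at 16M entries. Distilled into a standalone, hypothetical helper, where `call` stands in for the bound C function:

    def call_with_growing_buffer(ffi, call, ctype: str = "llama_token[]", initial: int = 1024):
        # `call` takes (buffer, capacity) and returns the element count,
        # or a negated required size when the buffer is too small.
        buf = ffi.new(ctype, initial)
        num = call(buf, len(buf))
        while num < 0 and len(buf) < (16 << 20):  # double and retry, up to 16M entries
            buf = ffi.new(ctype, -2 * num)
            num = call(buf, len(buf))
        return buf, num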
-class Tokenizer:
-
-    def encode(self, text: str) -> list[int]:
-        raise NotImplementedError
-
-    def decode(self, ids: list[int]) -> str:
-        raise NotImplementedError
-
-
-class TokenizerGroundtruth (Tokenizer):
-
-    def __init__(self, dir_tokenizer: str):
-        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
-        # guess BOS and EOS
-        ids = self.encode("a")
-        assert 1 <= len(ids) <= 3
-        add_bos_token = len(ids) > 1 and self.model.bos_token_id == ids[0]
-        add_eos_token = len(ids) > 1 and self.model.eos_token_id == ids[-1]
-        self.add_bos_token = getattr(self.model, "add_bos_token", add_bos_token)
-        self.add_eos_token = getattr(self.model, "add_eos_token", add_eos_token)
-        # build vocab
-        tokens = list(self.model.get_vocab().values())
-        self.vocab = self.model.batch_decode(tokens, skip_special_tokens=True)
-        self.vocab = list(sorted(self.vocab))
-        # tokens and lists
-        self.special_tokens = list(self.model.all_special_tokens)
-        self.added_tokens = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
-        self.bos_token = self.model.bos_token
-        self.eos_token = self.model.eos_token
-
-    def encode(self, text: str) -> list[int]:
-        return self.model.encode(text, add_special_tokens=True)
-
-    def decode(self, ids: list[int]) -> str:
-        return self.model.decode(ids, skip_special_tokens=False)
-
-
-class TokenizerLlamaCpp (Tokenizer):
-
-    libllama: LibLlama | None = None
-
-    def __init__(self, vocab_file: str):
-        if not self.libllama:
-            self.libllama = LibLlama()
-        self.model = LibLlamaModel(self.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
-
-    def encode(self, text: str) -> list[int]:
-        return self.model.tokenize(text, add_special=True, parse_special=True)
-
-    def decode(self, ids: list[int]) -> str:
-        return self.model.detokenize(ids, remove_special=False, unparse_special=True)
-
-
-def generator_custom_text() -> Iterator[str]:
-    """General tests"""
-    yield from [
-        "",
-        " ",
-        "  ",
-        "   ",
-        "\t",
-        "\n",
-        "\n\n",
-        "\n\n\n",
-        "\t\n",
-        "Hello world",
-        " Hello world",
-        "Hello World",
-        " Hello World",
-        " Hello World!",
-        "Hello, world!",
-        " Hello, world!",
-        " this is 🦙.cpp",
-        "w048 7tuijk dsdfhu",
-        "нещо на Български",
-        "កាន់តែពិសេសអាចខលចេញ",
-        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-        "Hello",
-        " Hello",
-        "  Hello",
-        "   Hello",
-        "    Hello",
-        "    Hello\n    Hello",
-        " (",
-        "\n =",
-        "' era",
-        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
-        "3",
-        "33",
-        "333",
-        "3333",
-        "33333",
-        "333333",
-        "3333333",
-        "33333333",
-        "333333333",
-    ]
-
-
-def generator_custom_text_edge_cases() -> Iterator[str]:
-    """Edge cases found while debugging"""
-    yield from [
-        '\x1f-a',  # unicode_ranges_control, {0x00001C, 0x00001F}
-        '¼-a',  # unicode_ranges_digit, 0x00BC
-        '½-a',  # unicode_ranges_digit, 0x00BD
-        '¾-a',  # unicode_ranges_digit, 0x00BE
-        'a 〇b',  # unicode_ranges_digit, 0x3007
-        'Ⅵ-a',  # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
-        '\uFEFF//',  # unicode_ranges_control, 0xFEFF (BOM)
-        'Cửa Việt',  # llama-3, ignore_merges = true
-        '<s>a',  # Phi-3 fail
-        '<|endoftext|>',  # Phi-3 fail
-        'a\na',  # bert fail
-        '"`',  # falcon
-        ' \u2e4e',  # falcon
-        '\n\x0b  ',  # falcon
-        'a\xa0\xa0\x00b',  # jina-v2-es
-        'one <mask>',  # jina-v2-es <mask> lstrip=true
-        'a </s> b',  # rstrip phi-3
-        'a <mask> b',  # lstrip jina-v2
-        '\xa0aC',  # deepseek
-        '\u2029 \uA3E4',  # deepseek-llm
-        "a ?",
-        'å',  # mpt
-        '\U000ac517',  # utf-8 encode error, falcon
-        '\U000522f4',  # utf-8 encode error, starcoder
-        "abcd",
-        " abcd",
-    ]
-
-
-def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
-    """Brute force check all vocab words"""
-    yield from tokenizer.vocab
-
-
-def generator_ascii_lr_strip() -> Iterator[str]:
-    WHITESPACES = ["", " ", "  "]
-    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
-    for char1 in CHARACTERS:
-        for char2 in CHARACTERS:
-            for lstrip in WHITESPACES:
-                for rstrip in WHITESPACES:
-                    yield lstrip + char1 + char2 + rstrip
-                    yield lstrip + char1 + rstrip + char2
-                    yield char1 + lstrip + char2 + rstrip
-
-
-def generator_apostrophe() -> Iterator[str]:
-    WHITESPACES = ["", " ", "  "]
-    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
-    for char1 in CHARACTERS:
-        for char2 in CHARACTERS:
-            for lstrip in WHITESPACES:
-                for rstrip in WHITESPACES:
-                    yield char1 + lstrip + "'" + rstrip + char2
-                    yield char1 + char2 + lstrip + "'" + rstrip + "z"
-                    yield "a" + lstrip + "'" + rstrip + char1 + char2
-
-
-def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
-    WHITESPACES = ["", " ", "  ", "\n", "\r\n", "\n\n", "\t", "\t\t"]
-    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens)))
-    for token in all_tokens:
-        for lstrip in WHITESPACES:
-            for rstrip in WHITESPACES:
-                yield lstrip + token + rstrip
-                yield "a" + lstrip + token + rstrip
-                yield lstrip + token + rstrip + "z"
-                yield "a" + lstrip + token + rstrip + "z"
-
-
"one", "1", "", ""] - all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens + separations))) - rand = random.Random() - for m in range(iterations): - rand.seed(m) - words = rand.choices(all_tokens, k=500) - if words and words[0] == tokenizer.bos_token: # skip spam warning of double BOS - while len(words) > 1 and words[1] == tokenizer.bos_token: # leave one starting BOS - words.pop(0) - if tokenizer.add_bos_token: # drop all starting BOS - words.pop(0) - if words and words[-1] == tokenizer.eos_token: # skip spam warning of double EOS - while len(words) > 1 and words[-2] == tokenizer.eos_token: # leave one trailing EOS - words.pop(-1) - if tokenizer.add_bos_token: # drop all trailing EOS - words.pop(-1) - yield "".join(words) - - -def generator_random_chars(iterations=100) -> Iterator[str]: - """Brute force random text with simple characters""" - - NUM_WORDS = 400 - WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) - CHARS = list(sorted(set(""" - ABCDEFGHIJKLMNOPQRSTUVWXYZ - abcdefghijklmnopqrstuvwxyz - ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ - áéíóúàèìòùâêîôûäëïöü - .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_ - """))) - - rand = random.Random() - for m in range(iterations): - rand.seed(m) - text = [] - for _ in range(NUM_WORDS): - k = rand.randint(1, 7) - word = rand.choices(CHARS, k=k) - word.append(rand.choice(WHITESPACES)) - text.append("".join(word)) - yield "".join(text) - - -def generator_unicodes() -> Iterator[str]: - """Iterate unicode characters""" - - MAX_CODEPOINTS = 0x30000 # 0x110000 - - def _valid(cpt): - if cpt >= 0x30000: # unassigned and supplement­ary - return False - # if cpt == 0x2029: # deepseek-llm - # return False - if unicodedata.category(chr(cpt)) in ("Cn", "Cs", "Co"): # undefined, surrogates, private - return False - return True - - characters = [chr(cpt) for cpt in range(0, MAX_CODEPOINTS) if _valid(cpt)] - - yield from characters - - -def generator_random_unicodes(iterations=100) -> Iterator[str]: - """Brute force random text with unicode characters""" - - NUM_WORDS = 200 - WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) - - characters = list(generator_unicodes()) - - rand = random.Random() - for m in range(iterations): - rand.seed(m) - text = [] - for _ in range(NUM_WORDS): - k = rand.randint(1, 7) - word = rand.choices(characters, k=k) - word.append(rand.choice(WHITESPACES)) - text.append("".join(word)) - yield "".join(text) - - -def generator_random_vocab_chars(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]: - """Brute force random text with vocab characters""" - - vocab_chars = set() - for word in tokenizer.vocab: - vocab_chars.update(word) - vocab_chars = list(sorted(vocab_chars)) - - rand = random.Random() - for m in range(iterations): - rand.seed(m) - text = rand.choices(vocab_chars, k=1024) - yield "".join(text) - - -def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]: - """Brute force random text from vocab words""" - - vocab = [w.strip() for w in tokenizer.vocab] - yield from vocab - - rand = random.Random() - for m in range(iterations): - rand.seed(m) - text = [] - num_words = rand.randint(300, 400) - for i in range(num_words): - k = rand.randint(1, 3) - words = rand.choices(vocab, k=k) - sep = rand.choice(" \n\r\t") - text.append("".join(words) + sep) - yield "".join(text) - - -def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]): - - def find_first_mismatch(ids1: list[int] | str, ids2: 
-def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]):
-
-    def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
-        for i, (a, b) in enumerate(zip(ids1, ids2)):
-            if a != b:
-                return i
-        if len(ids1) == len(ids2):
-            return -1
-        return min(len(ids1), len(ids2))
-
-    def check_detokenizer(text: str, text1: str, text2: str) -> bool:
-        if text1 == text2:  # equal to TokenizerGroundtruth?
-            return True
-        # equal to source text?
-        if tokenizer1.add_bos_token:  # remove BOS
-            if text2.startswith(tokenizer1.bos_token):
-                text2 = text2[len(tokenizer1.bos_token):]
-        if tokenizer1.add_eos_token:  # remove EOS
-            if text2.endswith(tokenizer1.eos_token):
-                text2 = text2[:-len(tokenizer1.eos_token)]
-        return text == text2
-
-    t_encode1 = 0
-    t_encode2 = 0
-    t_decode1 = 0
-    t_decode2 = 0
-    t_start = time.perf_counter()
-    encode_errors = 0
-    decode_errors = 0
-    MAX_ERRORS = 10
-
-    logger.info("%s: %s" % (generator.__qualname__, "ini"))
-    for text in generator:
-        # print(repr(text), text.encode())
-        # print(repr(text), hex(ord(text[0])), text.encode())
-        t0 = time.perf_counter()
-        ids1 = tokenizer1.encode(text)
-        t1 = time.perf_counter()
-        ids2 = tokenizer2.encode(text)
-        t2 = time.perf_counter()
-        text1 = tokenizer1.decode(ids1)
-        t3 = time.perf_counter()
-        text2 = tokenizer2.decode(ids1)
-        t4 = time.perf_counter()
-        t_encode1 += t1 - t0
-        t_encode2 += t2 - t1
-        t_decode1 += t3 - t2
-        t_decode2 += t4 - t3
-        if encode_errors < MAX_ERRORS and ids1 != ids2:
-            i = find_first_mismatch(ids1, ids2)
-            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
-            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
-            logger.error(" Expected: " + str(ids1))
-            logger.error("   Result: " + str(ids2))
-            encode_errors += 1
-            logger.error(f" {encode_errors=}")
-        if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
-            i = find_first_mismatch(text1, text2)
-            text1 = list(text1[max(0, i - 2) : i + 5 + 1])
-            text2 = list(text2[max(0, i - 2) : i + 5 + 1])
-            logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1))
-            logger.error("   Result: " + " ".join(hex(ord(x)) for x in text2))
-            decode_errors += 1
-            logger.error(f" {decode_errors=}")
-        if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
-            logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
-            # raise Exception()
-            break
-
-    t_total = time.perf_counter() - t_start
-    logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}")
-
-
-def main(argv: list[str] | None = None):
-    parser = argparse.ArgumentParser()
-    parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file")
-    parser.add_argument("dir_tokenizer", type=str, help="directory containing 'tokenizer.model' file")
-    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
-    args = parser.parse_args(argv)
-
-    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO)
-    logger.info(f"VOCABFILE: '{args.vocab_file}'")
-
-    tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer)
-    tokenizer2 = TokenizerLlamaCpp(args.vocab_file)
-
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
-    compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip())
-    compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe())
-    compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes())
-    compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1))
-    compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))
-
-    tokenizer2.model.free()
-
-
-if __name__ == "__main__":
-    # main()
-
-    if True:
-        logging.basicConfig(
-            level = logging.DEBUG,
-            format = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
-            datefmt = "%Y-%m-%d %H:%M:%S",
-            filename = logger.name + ".log",
-            filemode = "a"
-        )
-        logging.basicConfig(
-            level = logging.DEBUG,
-            format = "%(levelname)s %(message)s",
-        )
-
-    path_tokenizers = Path("./models/tokenizers/")
-    path_vocab_format = "./models/ggml-vocab-%s.gguf"
-
-    tokenizers = [
-        "llama-spm",  # SPM
-        "phi-3",  # SPM
-        "gemma",  # SPM
-        "gemma-2",  # SPM
-        "baichuan",  # SPM
-        "bert-bge",  # WPM
-        "jina-v2-en",  # WPM
-        "llama-bpe",  # BPE
-        "phi-2",  # BPE
-        "deepseek-llm",  # BPE
-        "deepseek-coder",  # BPE
-        "falcon",  # BPE
-        "mpt",  # BPE
-        "starcoder",  # BPE
-        "gpt-2",  # BPE
-        "stablelm2",  # BPE
-        "refact",  # BPE
-        "qwen2",  # BPE
-        "olmo",  # BPE
-        "jina-v2-es",  # BPE
-        "jina-v2-de",  # BPE
-        "smaug-bpe",  # BPE
-        "poro-chat",  # BPE
-        "jina-v2-code",  # BPE
-        "viking",  # BPE
-        "jais",  # BPE
-    ]
-
-    logger.info("=" * 50)
-    for tokenizer in tokenizers:
-        logger.info("-" * 50)
-        logger.info(f"TOKENIZER: '{tokenizer}'")
-        vocab_file = Path(path_vocab_format % tokenizer)
-        dir_tokenizer = path_tokenizers / tokenizer
-        main([str(vocab_file), str(dir_tokenizer), "--verbose"])
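The driver above loops over every entry in the vocab list, pairing each ggml-vocab-%s.gguf with its Hugging Face tokenizer directory under ./models/tokenizers/. To spot-check a single model instead, the script can be invoked directly, assuming both paths exist locally, e.g.:

    python3 tests/test-tokenizer-random.py ./models/ggml-vocab-phi-3.gguf ./models/tokenizers/phi-3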