Merge pull request #5 from zihaoccc/cleanup2

remove tests
2024-07-26 16:39:41 -07:00 · 2024-07-26 16:39:41 -07:00 · f7c0f9f576
commit f7c0f9f576
parent 0e5165b605 4810ab1aa1
28 changed files with 0 additions and 10585 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -63,7 +63,6 @@ option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer"   OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)

 # extra artifacts
-option(LLAMA_BUILD_TESTS    "llama: build tests"          ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples"       ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER   "llama: build server example" ${LLAMA_STANDALONE})

@ -189,11 +188,6 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"

 add_subdirectory(common)

-if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    include(CTest)
-    add_subdirectory(tests)
-endif ()
-
 if (LLAMA_BUILD_EXAMPLES)
    add_subdirectory(examples)
    add_subdirectory(pocs)
--- a/tests/.gitignore
+++ b/tests/.gitignore
@ -1,4 +0,0 @@
-*
-!*.*
-*.o
-ggml-common.h
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -1,137 +0,0 @@
-# Registers an already-defined executable target as a CTest test.
-# Positional args:
-# - target: name of the executable target whose output file is run
-# Optional args:
-# - NAME: name of the test (defaults to the target name)
-# - LABEL: label for the test (defaults to main)
-# - ARGS: arguments to pass to the test executable
-# - WORKING_DIRECTORY: directory the test is run in (defaults to .)
-function(llama_test target)
-    # PARSE_ARGV reads the function arguments directly, so no helper module has to be included
-    cmake_parse_arguments(PARSE_ARGV 1 LLAMA_TEST "" "NAME;LABEL;WORKING_DIRECTORY" "ARGS")
-
-    # a misspelled keyword would otherwise be dropped without any diagnostic
-    if (DEFINED LLAMA_TEST_UNPARSED_ARGUMENTS)
-        message(AUTHOR_WARNING "llama_test(${target}): ignoring unknown arguments: ${LLAMA_TEST_UNPARSED_ARGUMENTS}")
-    endif()
-
-    if (NOT DEFINED LLAMA_TEST_LABEL)
-        set(LLAMA_TEST_LABEL "main")
-    endif()
-    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
-        set(LLAMA_TEST_WORKING_DIRECTORY .)
-    endif()
-    if (DEFINED LLAMA_TEST_NAME)
-        set(TEST_NAME ${LLAMA_TEST_NAME})
-    else()
-        set(TEST_NAME ${target})
-    endif()
-
-    set(TEST_TARGET ${target})
-
-    # ARGS is expanded unquoted on purpose: every list element becomes one command-line argument
-    add_test(
-        NAME ${TEST_NAME}
-        WORKING_DIRECTORY "${LLAMA_TEST_WORKING_DIRECTORY}"
-        COMMAND $<TARGET_FILE:${TEST_TARGET}>
-        ${LLAMA_TEST_ARGS})
-
-    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
-endfunction()
-
-# Builds a test source file (together with get-model.cpp), installs the executable
-# and registers it as a CTest test.
-# Positional args:
-# - source: the test source file
-# Optional args:
-# - NAME: name of the executable & test target (defaults to the source file name without extension)
-# - LABEL: label for the test (defaults to main)
-# - ARGS: arguments to pass to the test executable
-# - WORKING_DIRECTORY: directory the test is run in (defaults to .)
-function(llama_target_and_test source)
-    # PARSE_ARGV reads the function arguments directly, so no helper module has to be included
-    cmake_parse_arguments(PARSE_ARGV 1 LLAMA_TEST "" "NAME;LABEL;WORKING_DIRECTORY" "ARGS")
-
-    # a misspelled keyword would otherwise be dropped without any diagnostic
-    if (DEFINED LLAMA_TEST_UNPARSED_ARGUMENTS)
-        message(AUTHOR_WARNING "llama_target_and_test(${source}): ignoring unknown arguments: ${LLAMA_TEST_UNPARSED_ARGUMENTS}")
-    endif()
-
-    if (NOT DEFINED LLAMA_TEST_LABEL)
-        set(LLAMA_TEST_LABEL "main")
-    endif()
-    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
-        set(LLAMA_TEST_WORKING_DIRECTORY .)
-    endif()
-    if (DEFINED LLAMA_TEST_NAME)
-        set(TEST_TARGET ${LLAMA_TEST_NAME})
-    else()
-        get_filename_component(TEST_TARGET ${source} NAME_WE)
-    endif()
-
-    add_executable(${TEST_TARGET} ${source} get-model.cpp)
-    install(TARGETS ${TEST_TARGET} RUNTIME)
-    target_link_libraries(${TEST_TARGET} PRIVATE common)
-
-    # ARGS is expanded unquoted on purpose: every list element becomes one command-line argument
-    add_test(
-        NAME ${TEST_TARGET}
-        WORKING_DIRECTORY "${LLAMA_TEST_WORKING_DIRECTORY}"
-        COMMAND $<TARGET_FILE:${TEST_TARGET}>
-        ${LLAMA_TEST_ARGS})
-
-    set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
-endfunction()
-
-# build test-tokenizer-0 target once and register one test per vocabulary file
-add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
-target_link_libraries(test-tokenizer-0 PRIVATE common)
-install(TARGETS test-tokenizer-0 RUNTIME)
-
-foreach (vocab IN ITEMS
-        bert-bge
-        command-r
-        deepseek-coder
-        deepseek-llm
-        falcon
-        gpt-2
-        llama-bpe
-        llama-spm
-        mpt
-        phi-3
-        qwen2
-        refact
-        starcoder)
-    llama_test(test-tokenizer-0 NAME test-tokenizer-0-${vocab} ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-${vocab}.gguf)
-endforeach()
-
-# build test-tokenizer-1-bpe target once and add many tests
-add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
-target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
-install(TARGETS test-tokenizer-1-bpe RUNTIME)
-
-# TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-
-# build test-tokenizer-1-spm target once and add many tests
-add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
-target_link_libraries(test-tokenizer-1-spm PRIVATE common)
-install(TARGETS test-tokenizer-1-spm RUNTIME)
-
-llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
-#llama_test(test-tokenizer-1-spm  NAME test-tokenizer-1-baichuan  ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
-
-# single-source tests with the default "main" label, registered in this order
-# (test-double-float.cpp and test-opt.cpp are left out because they are SLOW)
-foreach (test_source IN ITEMS
-        test-quantize-fns.cpp
-        test-quantize-perf.cpp
-        test-sampling.cpp
-        test-chat-template.cpp
-        test-grammar-parser.cpp
-        test-llama-grammar.cpp
-        test-grammar-integration.cpp
-        test-grad0.cpp
-        test-backend-ops.cpp
-        test-rope.cpp)
-    llama_target_and_test(${test_source})
-endforeach()
-
-# tests carrying the "model" label
-llama_target_and_test(test-model-load-cancel.cpp  LABEL "model")
-llama_target_and_test(test-autorelease.cpp        LABEL "model")
-
-# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
-# the variable is quoted so that an empty or variable-like processor name cannot break the condition
-if (NOT "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "loongarch64")
-    llama_target_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
-    target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
-endif()
-
-# dummy executable - not installed
-get_filename_component(TEST_TARGET test-c.c NAME_WE)
-add_executable(${TEST_TARGET} test-c.c)
-target_link_libraries(${TEST_TARGET} PRIVATE llama)
--- a/tests/get-model.cpp
+++ b/tests/get-model.cpp
@ -1,21 +0,0 @@
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-
-#include "get-model.h"
-
-// Picks the model file a test should use: argv[1] when an argument was passed, otherwise the
-// value of the LLAMACPP_TEST_MODELFILE environment variable. When the variable is unset or empty,
-// a warning is printed to stderr and the process exits with EXIT_SUCCESS.
-char * get_model_or_exit(int argc, char *argv[]) {
-    // an explicit command-line argument always wins
-    if (argc > 1) {
-        return argv[1];
-    }
-
-    char * env_path = getenv("LLAMACPP_TEST_MODELFILE");
-    if (env_path != nullptr && env_path[0] != '\0') {
-        return env_path;
-    }
-
-    fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test. Set LLAMACPP_TEST_MODELFILE=<gguf_model_path> to silence this warning and run this test.\n\033[0m");
-    exit(EXIT_SUCCESS);
-}
--- a/tests/get-model.h
+++ b/tests/get-model.h
@ -1,2 +0,0 @@
-#pragma once
-char * get_model_or_exit(int, char*[]);
--- a/tests/run-json-schema-to-grammar.mjs
+++ b/tests/run-json-schema-to-grammar.mjs
@ -1,10 +0,0 @@
-import { readFileSync } from "fs"
-import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs"
-
-const [, , file] = process.argv
-const url = `file://${file}`
-let schema = JSON.parse(readFileSync(file, "utf8"));
-const converter = new SchemaConverter({})
-schema = await converter.resolveRefs(schema, url)
-converter.visit(schema, '')
-console.log(converter.formatGrammar())
--- a/tests/test-autorelease.cpp
+++ b/tests/test-autorelease.cpp
@ -1,24 +0,0 @@
-// ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763
-
-#include <cstdio>
-#include <string>
-#include <thread>
-
-#include "llama.h"
-#include "get-model.h"
-
-// This creates a new context inside a separate thread and then tries to exit cleanly.
-// Returns 1 (instead of handing a null model/context to the llama API) when the model
-// cannot be loaded or the context cannot be created.
-int main(int argc, char ** argv) {
-    auto * model_path = get_model_or_exit(argc, argv);
-    bool ok = false;
-
-    std::thread([&model_path, &ok]() {
-        llama_backend_init();
-        auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
-        if (model != nullptr) {
-            auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
-            if (ctx != nullptr) {
-                ok = true;
-                llama_free(ctx);
-            }
-            llama_free_model(model);
-        }
-        llama_backend_free();
-    }).join();
-
-    // join() has completed, so reading the flag written by the worker is safe here
-    if (!ok) {
-        fprintf(stderr, "%s: failed to load the model or create a context from '%s'\n", __func__, model_path);
-        return 1;
-    }
-
-    return 0;
-}
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
--- a/tests/test-c.c
+++ b/tests/test-c.c
@ -1,7 +0,0 @@
-#include "llama.h"
-
-#ifdef GGML_USE_KOMPUTE
-#include "ggml-kompute.h"
-#endif
-
-// Empty entry point: the program does nothing at run time.
-int main(void) {
-    return 0;
-}
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@ -1,177 +0,0 @@
-#include <string>
-#include <vector>
-#include <sstream>
-
-#undef NDEBUG
-#include <cassert>
-
-#include "llama.h"
-#include "common.h"
-
-// Applies every chat template in `templates` to a fixed six-message conversation and asserts
-// that the formatted prompt equals the entry at the same index in `expected_output`, then checks
-// llama_chat_format_single for a system message and for a follow-up user message.
-// Returns 0 when all assertions hold (NDEBUG is undefined above, so assert is always active).
-int main(void) {
-    llama_chat_message conversation[] = {
-        {"system", "You are a helpful assistant"},
-        {"user", "Hello"},
-        {"assistant", "Hi there"},
-        {"user", "Who are you"},
-        {"assistant", "   I am an assistant   "},
-        {"user", "Another question"},
-    };
-    // derived from the array so that it cannot drift from the conversation above
-    const size_t message_count = sizeof(conversation) / sizeof(conversation[0]);
-    std::vector<std::string> templates = {
-        // teknium/OpenHermes-2.5-Mistral-7B
-        "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
-        // mistralai/Mistral-7B-Instruct-v0.2
-        "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
-        // TheBloke/FusionNet_34Bx2_MoE-AWQ
-        "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
-        // bofenghuang/vigogne-2-70b-chat
-        "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
-        // mlabonne/AlphaMonarch-7B
-        "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
-        // google/gemma-7b-it
-        "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
-        // OrionStarAI/Orion-14B-Chat
-        "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
-        // openchat/openchat-3.5-0106
-        // The included chat_template differs from the author's suggestions here: https://huggingface.co/openchat/openchat_3.5/discussions/5#65448109b4a3f3a2f486fd9d
-        // So we match against the included template but implement the suggested version.
-        "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
-        // deepseek-ai/deepseek-coder-33b-instruct
-        "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n    {%- if message['role'] == 'system' -%}\n        {%- set ns.found = true -%}\n    {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n    {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n    {%- else %}\n        {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n        {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
-        // eachadea/vicuna-13b-1.1
-        // No template included in tokenizer_config.json, so this template likely needs to be manually set.
-        "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '' + message['content'] + '\n\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
-        // Orca-Vicuna
-        // No template included in tokenizer_config.json, so this template likely needs to be manually set.
-        "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
-        // CohereForAI/c4ai-command-r-plus
-        "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
-        // Llama-3
-        "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
-        //Phi-3-mini
-        "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
-        //Phi-3-small
-        "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
-        //Phi-3-medium
-        "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
-        //Phi-3-vision
-        "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
-        // ChatGLM3
-        "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
-        // ChatGLM4
-        u8"[gMASK]<sop>{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的，你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
-        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
-        u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
-        // DeepSeek-V2
-        "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
-    };
-    std::vector<std::string> expected_output = {
-        // teknium/OpenHermes-2.5-Mistral-7B
-        "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n   I am an assistant   <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
-        // mistralai/Mistral-7B-Instruct-v0.2
-        "[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST]   I am an assistant   </s>[INST] Another question [/INST]",
-        // TheBloke/FusionNet_34Bx2_MoE-AWQ
-        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST]    I am an assistant    </s><s>[INST] Another question [/INST]",
-        // bofenghuang/vigogne-2-70b-chat
-        "[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
-        // mlabonne/AlphaMonarch-7B
-        "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n   I am an assistant   </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
-        // google/gemma-7b-it
-        "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
-        // OrionStarAI/Orion-14B-Chat
-        "Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s>   I am an assistant   </s>Human: Another question\n\nAssistant: </s>",
-        // openchat/openchat-3.5-0106
-        "You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant:    I am an assistant   <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:",
-        // deepseek-ai/deepseek-coder-33b-instruct
-        "You are a helpful assistant### Instruction:\nHello\n### Response:\nHi there\n<|EOT|>\n### Instruction:\nWho are you\n### Response:\n   I am an assistant   \n<|EOT|>\n### Instruction:\nAnother question\n### Response:\n",
-        // eachadea/vicuna-13b-1.1
-        "You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT:    I am an assistant   </s>\nUSER: Another question\nASSISTANT:",
-        // Orca-Vicuna
-        "SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT:    I am an assistant   </s>\nUSER: Another question\nASSISTANT:",
-        // CohereForAI/c4ai-command-r-plus
-        "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
-        // Llama 3
-        "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
-        //Phi-3-mini
-        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
-        //Phi-3-small
-        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
-        //Phi-3-medium
-        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
-        //Phi-3-vision
-        "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n   I am an assistant   <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
-        // ChatGLM3
-        "[gMASK]sop<|system|>\n You are a helpful assistant<|user|>\n Hello<|assistant|>\n Hi there<|user|>\n Who are you<|assistant|>\n    I am an assistant   <|user|>\n Another question<|assistant|>",
-        // ChatGLM4
-        "[gMASK]<sop><|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n   I am an assistant   <|user|>\nAnother question<|assistant|>",
-        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
-        u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
-        // DeepSeek-V2
-        u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<｜end▁of▁sentence｜>User: Who are you\n\nAssistant:    I am an assistant   <｜end▁of▁sentence｜>User: Another question\n\nAssistant:",
-    };
-    // the two tables are indexed in lockstep below, so they must have the same length
-    assert(templates.size() == expected_output.size());
-
-    std::vector<char> formatted_chat(1024);
-    int32_t res;
-
-    // test invalid chat template
-    res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
-    assert(res < 0);
-
-    for (size_t i = 0; i < templates.size(); i++) {
-        std::string custom_template = templates[i];
-        std::string expected = expected_output[i];
-        formatted_chat.resize(1024);
-        res = llama_chat_apply_template(
-            nullptr,
-            custom_template.c_str(),
-            conversation,
-            message_count,
-            true,
-            formatted_chat.data(),
-            formatted_chat.size()
-        );
-        // validate before resizing: a negative value converted to size_t would request a huge buffer
-        assert(res > 0);
-        if ((size_t) res > formatted_chat.size()) {
-            // the buffer was too small for the whole prompt: grow it to the reported size and re-apply
-            formatted_chat.resize(res);
-            res = llama_chat_apply_template(
-                nullptr,
-                custom_template.c_str(),
-                conversation,
-                message_count,
-                true,
-                formatted_chat.data(),
-                formatted_chat.size()
-            );
-            assert(res > 0);
-        }
-        formatted_chat.resize(res);
-        std::string output(formatted_chat.data(), formatted_chat.size());
-        printf("%s\n", output.c_str());
-        printf("-------------------------\n");
-        assert(output == expected);
-    }
-
-
-    // test llama_chat_format_single for system message
-    printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
-    std::vector<llama_chat_msg> chat2;
-    llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
-
-    // formats sys_msg on top of the (still empty) history chat2 with the named template
-    auto fmt_sys = [&](std::string tmpl) {
-        auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
-        printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
-        printf("-------------------------\n");
-        return output;
-    };
-    assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
-    assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
-    assert(fmt_sys("gemma")  == ""); // for gemma, system message is merged with user message
-    assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
-
-
-    // test llama_chat_format_single for user message
-    printf("\n\n=== llama_chat_format_single (user message) ===\n\n");
-    chat2.push_back({"system", "You are a helpful assistant"});
-    chat2.push_back({"user", "Hello"});
-    chat2.push_back({"assistant", "I am assistant"});
-    llama_chat_msg new_msg{"user", "How are you"};
-
-    // formats new_msg on top of the three-message history chat2 with the named template
-    auto fmt_single = [&](std::string tmpl) {
-        auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
-        printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
-        printf("-------------------------\n");
-        return output;
-    };
-    assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
-    assert(fmt_single("llama2") == "[INST] How are you [/INST]");
-    assert(fmt_single("gemma")  == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
-    assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
-
-    return 0;
-}
--- a/tests/test-double-float.cpp
+++ b/tests/test-double-float.cpp
@ -1,57 +0,0 @@
-// These tests may take a long time!
-// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result.
-// This is done by checking all finite (non-NaN, non-infinite) floats.
-
-#undef NDEBUG
-#include <cassert>
-#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
-#include <immintrin.h>
-#endif
-#include <cmath>
-#include <cstdint>
-#include <cstring>
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-
-// ggml.c::quantize_row_q4_0_ref
-inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
-
-// ggml.c::ggml_silu_f32
-inline static float silu_orig(float x) {
-    return x/(1.0 + exp(-x));
-}
-
-#pragma GCC diagnostic pop
-
-// ggml.c::quantize_row_q4_0_ref
-inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
-
-// ggml.c::ggml_silu_f32
-inline static float silu_float(float x) {
-    return x/(1.0f + expf(-x));
-}
-
-int main(void) {
-    uint32_t x = UINT32_MAX;
-    do {
-        float f;
-        memcpy(&f, &x, sizeof(x));
-        assert(!std::isfinite(f) || (round_orig(f) == round_float(f)));
-    } while (x--);
-
-#ifdef __F16C__
-    // GELU and SILU implementations are used with a FP16 lookup table.
-    // The original and float-only results are not equal for all inputs after converting to FP16.
-    // GELU is an approximation anyway (tanh), not tested here.
-    // For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match.
-    for (x = 0; x <= UINT16_MAX; x++) {
-        float f = _cvtsh_ss(x);
-        const float so = silu_orig(f);
-        const float sf = silu_float(f);
-        assert(   (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0))
-               || (nextafterf(so, sf) == sf)
-               || (nextafterf(sf, so) == so));
-    }
-#endif
-}
--- a/tests/test-grad0.cpp
+++ b/tests/test-grad0.cpp
--- a/tests/test-grammar-integration.cpp
+++ b/tests/test-grammar-integration.cpp
--- a/tests/test-grammar-parser.cpp
+++ b/tests/test-grammar-parser.cpp
@ -1,515 +0,0 @@
-#ifdef NDEBUG
-#undef NDEBUG
-#endif
-
-#include "llama.h"
-#include "grammar-parser.h"
-
-#include <cassert>
-
-static const char * type_str(llama_gretype type) {
-    switch (type) {
-        case LLAMA_GRETYPE_CHAR: return "LLAMA_GRETYPE_CHAR";
-        case LLAMA_GRETYPE_CHAR_NOT: return "LLAMA_GRETYPE_CHAR_NOT";
-        case LLAMA_GRETYPE_CHAR_ALT: return "LLAMA_GRETYPE_CHAR_ALT";
-        case LLAMA_GRETYPE_CHAR_RNG_UPPER: return "LLAMA_GRETYPE_CHAR_RNG_UPPER";
-        case LLAMA_GRETYPE_RULE_REF: return "LLAMA_GRETYPE_RULE_REF";
-        case LLAMA_GRETYPE_ALT: return "LLAMA_GRETYPE_ALT";
-        case LLAMA_GRETYPE_END: return "LLAMA_GRETYPE_END";
-        default: return "?";
-    }
-}
-
-static void verify_parsing(const char *grammar_bytes, const std::vector<std::pair<std::string, uint32_t>> expected, const std::vector<llama_grammar_element> &expected_rules) {
-    uint32_t index = 0;
-    grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_bytes);
-
-    std::map<uint32_t, std::string> symbol_names;
-    for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) {
-        symbol_names[it->second] = it->first;
-    }
-
-    auto print_all = [&]() {
-        fprintf(stderr, "    verify_parsing(R\"\"\"(%s)\"\"\", {\n", grammar_bytes);
-        for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) {
-            fprintf(stderr, "        {\"%s\", %u},\n", it->first.c_str(), it->second);
-        }
-        fprintf(stderr, "    }, {\n");
-        for (size_t i_rule = 0; i_rule < parsed_grammar.rules.size(); i_rule++) {
-            fprintf(stderr, "        // %s (index %zu)\n", symbol_names[i_rule].c_str(), i_rule);
-            auto & rule = parsed_grammar.rules[i_rule];
-            for (uint32_t i = 0; i < rule.size(); i++) {
-                std::string rule_str;
-                fprintf(stderr, "        {%s, ", type_str(rule[i].type));
-                if (rule[i].type == LLAMA_GRETYPE_CHAR || rule[i].type == LLAMA_GRETYPE_CHAR_ALT ||
-                    rule[i].type == LLAMA_GRETYPE_CHAR_NOT || rule[i].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
-                    char c = rule[i].value;
-                    if (c == '\n') {
-                        fprintf(stderr, "'\\n'");
-                    } else if (c == '\t') {
-                        fprintf(stderr, "'\\t'");
-                    } else if (c == '\r') {
-                        fprintf(stderr, "'\\r'");
-                    } else if (c == '\0') {
-                        fprintf(stderr, "'\\0'");
-                    } else {
-                        fprintf(stderr, "'%c'", c);
-                    }
-                } else if (rule[i].type == LLAMA_GRETYPE_RULE_REF) {
-                    fprintf(stderr, "/* %s */ %u", symbol_names[rule[i].value].c_str(), rule[i].value);
-                } else {
-                    fprintf(stderr, "%u", rule[i].value);
-                }
-                fprintf(stderr, "},\n");
-            }
-        }
-        fprintf(stderr, "    });\n");
-    };
-
-    if (getenv("TEST_GRAMMAR_PARSER_PRINT_ALL")) {
-        print_all();
-        fprintf(stderr, "\n");
-        return;
-    }
-
-    fprintf(stderr, "Testing grammar:%s\n", grammar_bytes);
-
-    if (parsed_grammar.symbol_ids.size() != expected.size()) {
-        fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
-        print_all();
-        assert(parsed_grammar.symbol_ids.size() == expected.size());
-    }
-
-    for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it)
-    {
-        std::string key = it->first;
-        uint32_t value = it->second;
-        std::pair<std::string, uint32_t> expected_pair = expected[index];
-
-        // pretty print error message before asserting
-        if (expected_pair.first != key || expected_pair.second != value)
-        {
-            fprintf(stderr, "index: %u\n", index);
-            fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
-            fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
-            fprintf(stderr, "expected_pair != actual_pair\n");
-            fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
-            print_all();
-        }
-
-        assert(expected_pair.first == key && expected_pair.second == value);
-
-        index++;
-    }
-
-    index = 0;
-    for (auto rule : parsed_grammar.rules)
-    {
-        // compare rule to expected rule
-        for (uint32_t i = 0; i < rule.size(); i++)
-        {
-            llama_grammar_element element = rule[i];
-            llama_grammar_element expected_element = expected_rules[index];
-
-            // pretty print error message before asserting
-            if (expected_element.type != element.type || expected_element.value != element.value)
-            {
-                fprintf(stderr, "index: %u\n", index);
-                fprintf(stderr, "expected_element: %s, %u\n", type_str(expected_element.type), expected_element.value);
-                fprintf(stderr, "actual_element: %s, %u\n", type_str(element.type), element.value);
-                fprintf(stderr, "expected_element != actual_element\n");
-                fprintf(stderr, "all elements:\n");
-                fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
-                print_all();
-            }
-
-            assert(expected_element.type == element.type && expected_element.value == element.value);
-            index++;
-        }
-    }
-}
-
-static void verify_failure(const char *grammar_bytes) {
-    fprintf(stderr, "Testing expected failure:%s\n", grammar_bytes);
-    auto result = grammar_parser::parse(grammar_bytes);
-    assert(result.rules.empty() && "should have failed");
-}
-
-int main()
-{
-    verify_failure(R"""(
-        root ::= "a"{,}"
-    )""");
-
-    verify_failure(R"""(
-        root ::= "a"{,10}"
-    )""");
-
-    verify_parsing(R"""(
-        root  ::= "a"
-    )""", {
-        {"root", 0},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= "a" | [bdx-z] | [^1-3]
-    )""", {
-        {"root", 0},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_CHAR, 'b'},
-        {LLAMA_GRETYPE_CHAR_ALT, 'd'},
-        {LLAMA_GRETYPE_CHAR_ALT, 'x'},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_CHAR_NOT, '1'},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '3'},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= a+
-        a     ::= "a"
-    )""", {
-        {"a", 1},
-        {"root", 0},
-        {"root_2", 2},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
-        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
-        {LLAMA_GRETYPE_END, 0},
-        // a (index 1)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_END, 0},
-        // root_2 (index 2)
-        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
-        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= "a"+
-    )""", {
-        {"root", 0},
-        {"root_1", 1},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_END, 0},
-        // root_1 (index 1)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= a?
-        a     ::= "a"
-    )""", {
-        {"a", 1},
-        {"root", 0},
-        {"root_2", 2},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
-        {LLAMA_GRETYPE_END, 0},
-        // a (index 1)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_END, 0},
-        // root_2 (index 2)
-        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= "a"?
-    )""", {
-        {"root", 0},
-        {"root_1", 1},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_END, 0},
-        // root_1 (index 1)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= a*
-        a     ::= "a"
-    )""", {
-        {"a", 1},
-        {"root", 0},
-        {"root_2", 2},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
-        {LLAMA_GRETYPE_END, 0},
-        // a (index 1)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_END, 0},
-        // root_2 (index 2)
-        {LLAMA_GRETYPE_RULE_REF, /* a */ 1},
-        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= "a"*
-    )""", {
-        {"root", 0},
-        {"root_1", 1},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_END, 0},
-        // root_1 (index 1)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= "a"{2}
-    )""", {
-        {"root", 0},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= "a"{2,}
-    )""", {
-        {"root", 0},
-        {"root_1", 1},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_END, 0},
-        // root_1 (index 1)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= "a"{ 4}
-    )""", {
-        {"root", 0},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= "a"{2,4}
-    )""", {
-        {"root", 0},
-        {"root_1", 1},
-        {"root_2", 2},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
-        {LLAMA_GRETYPE_END, 0},
-        // root_1 (index 1)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-        // root_2 (index 2)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= (expr "=" term "\n")+
-        expr  ::= term ([-+*/] term)*
-        term  ::= [0-9]+
-    )""", {
-        {"expr", 2},
-        {"expr_5", 5},
-        {"expr_6", 6},
-        {"root", 0},
-        {"root_1", 1},
-        {"root_4", 4},
-        {"term", 3},
-        {"term_7", 7},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4},
-        {LLAMA_GRETYPE_END, 0},
-        // root_1 (index 1)
-        {LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
-        {LLAMA_GRETYPE_CHAR, '='},
-        {LLAMA_GRETYPE_RULE_REF, /* term */ 3},
-        {LLAMA_GRETYPE_CHAR, '\n'},
-        {LLAMA_GRETYPE_END, 0},
-        // expr (index 2)
-        {LLAMA_GRETYPE_RULE_REF, /* term */ 3},
-        {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
-        {LLAMA_GRETYPE_END, 0},
-        // term (index 3)
-        {LLAMA_GRETYPE_CHAR, '0'},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
-        {LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7},
-        {LLAMA_GRETYPE_END, 0},
-        // root_4 (index 4)
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-        // expr_5 (index 5)
-        {LLAMA_GRETYPE_CHAR, '-'},
-        {LLAMA_GRETYPE_CHAR_ALT, '+'},
-        {LLAMA_GRETYPE_CHAR_ALT, '*'},
-        {LLAMA_GRETYPE_CHAR_ALT, '/'},
-        {LLAMA_GRETYPE_RULE_REF, /* term */ 3},
-        {LLAMA_GRETYPE_END, 0},
-        // expr_6 (index 6)
-        {LLAMA_GRETYPE_RULE_REF, /* expr_5 */ 5},
-        {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-        // term_7 (index 7)
-        {LLAMA_GRETYPE_CHAR, '0'},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
-        {LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    verify_parsing(R"""(
-        root  ::= (expr "=" ws term "\n")+
-        expr  ::= term ([-+*/] term)*
-        term  ::= ident | num | "(" ws expr ")" ws
-        ident ::= [a-z] [a-z0-9_]* ws
-        num   ::= [0-9]+ ws
-        ws    ::= [ \t\n]*
-    )""", {
-        {"expr", 2},
-        {"expr_6", 6},
-        {"expr_7", 7},
-        {"ident", 8},
-        {"ident_10", 10},
-        {"num", 9},
-        {"num_11", 11},
-        {"root", 0},
-        {"root_1", 1},
-        {"root_5", 5},
-        {"term", 4},
-        {"ws", 3},
-        {"ws_12", 12},
-    }, {
-        // root (index 0)
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5},
-        {LLAMA_GRETYPE_END, 0},
-        // root_1 (index 1)
-        {LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
-        {LLAMA_GRETYPE_CHAR, '='},
-        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
-        {LLAMA_GRETYPE_RULE_REF, /* term */ 4},
-        {LLAMA_GRETYPE_CHAR, '\n'},
-        {LLAMA_GRETYPE_END, 0},
-        // expr (index 2)
-        {LLAMA_GRETYPE_RULE_REF, /* term */ 4},
-        {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7},
-        {LLAMA_GRETYPE_END, 0},
-        // ws (index 3)
-        {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12},
-        {LLAMA_GRETYPE_END, 0},
-        // term (index 4)
-        {LLAMA_GRETYPE_RULE_REF, /* ident */ 8},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_RULE_REF, /* num */ 9},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_CHAR, '('},
-        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
-        {LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
-        {LLAMA_GRETYPE_CHAR, ')'},
-        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
-        {LLAMA_GRETYPE_END, 0},
-        // root_5 (index 5)
-        {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
-        {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-        // expr_6 (index 6)
-        {LLAMA_GRETYPE_CHAR, '-'},
-        {LLAMA_GRETYPE_CHAR_ALT, '+'},
-        {LLAMA_GRETYPE_CHAR_ALT, '*'},
-        {LLAMA_GRETYPE_CHAR_ALT, '/'},
-        {LLAMA_GRETYPE_RULE_REF, /* term */ 4},
-        {LLAMA_GRETYPE_END, 0},
-        // expr_7 (index 7)
-        {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
-        {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-        // ident (index 8)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
-        {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10},
-        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
-        {LLAMA_GRETYPE_END, 0},
-        // num (index 9)
-        {LLAMA_GRETYPE_CHAR, '0'},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
-        {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11},
-        {LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
-        {LLAMA_GRETYPE_END, 0},
-        // ident_10 (index 10)
-        {LLAMA_GRETYPE_CHAR, 'a'},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
-        {LLAMA_GRETYPE_CHAR_ALT, '0'},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
-        {LLAMA_GRETYPE_CHAR_ALT, '_'},
-        {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-        // num_11 (index 11)
-        {LLAMA_GRETYPE_CHAR, '0'},
-        {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
-        {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-        // ws_12 (index 12)
-        {LLAMA_GRETYPE_CHAR, ' '},
-        {LLAMA_GRETYPE_CHAR_ALT, '\t'},
-        {LLAMA_GRETYPE_CHAR_ALT, '\n'},
-        {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12},
-        {LLAMA_GRETYPE_ALT, 0},
-        {LLAMA_GRETYPE_END, 0},
-    });
-
-    return 0;
-}
--- a/tests/test-json-schema-to-grammar.cpp
+++ b/tests/test-json-schema-to-grammar.cpp
--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@ -1,408 +0,0 @@
-#ifdef NDEBUG
-#undef NDEBUG
-#endif
-
-#define LLAMA_API_INTERNAL
-#include "llama.h"
-#include "grammar-parser.h"
-
-#include <cassert>
-#include <stdexcept>
-
-int main()
-{
-    grammar_parser::parse_state parsed_grammar;
-
-    std::vector<std::pair<std::string, uint32_t>> expected = {
-        {"expr", 2},
-        {"expr_6", 6},
-        {"expr_7", 7},
-        {"ident", 8},
-        {"ident_10", 10},
-        {"num", 9},
-        {"num_11", 11},
-        {"root", 0},
-        {"root_1", 1},
-        {"root_5", 5},
-        {"term", 4},
-        {"ws", 3},
-        {"ws_12", 12},
-    };
-
-    std::vector<std::vector<llama_grammar_element>> expected_rules = {
-        {{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_END, 0}},
-        {
-            {LLAMA_GRETYPE_RULE_REF, 2},
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_RULE_REF, 4},
-            {LLAMA_GRETYPE_CHAR, 10},
-            {LLAMA_GRETYPE_END, 0},
-        },
-        {{LLAMA_GRETYPE_RULE_REF, 4}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_END, 0}},
-        {{LLAMA_GRETYPE_RULE_REF, 12}, {LLAMA_GRETYPE_END, 0}},
-        {
-            {LLAMA_GRETYPE_RULE_REF, 8},
-            {LLAMA_GRETYPE_ALT, 0},
-            {LLAMA_GRETYPE_RULE_REF, 9},
-            {LLAMA_GRETYPE_ALT, 0},
-            {LLAMA_GRETYPE_CHAR, 40},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_RULE_REF, 2},
-            {LLAMA_GRETYPE_CHAR, 41},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_END, 0},
-        },
-        {{LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_END, 0}},
-        {
-            {LLAMA_GRETYPE_CHAR, 45},
-            {LLAMA_GRETYPE_CHAR_ALT, 43},
-            {LLAMA_GRETYPE_CHAR_ALT, 42},
-            {LLAMA_GRETYPE_CHAR_ALT, 47},
-            {LLAMA_GRETYPE_RULE_REF, 4},
-            {LLAMA_GRETYPE_END, 0},
-        },
-        {{LLAMA_GRETYPE_RULE_REF, 6}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}},
-        {
-            {LLAMA_GRETYPE_CHAR, 97},
-            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
-            {LLAMA_GRETYPE_RULE_REF, 10},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_END, 0},
-        },
-        {{LLAMA_GRETYPE_RULE_REF, 11}, {LLAMA_GRETYPE_RULE_REF, 3}, {LLAMA_GRETYPE_END, 0}},
-        {
-            {LLAMA_GRETYPE_CHAR, 97},
-            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
-            {LLAMA_GRETYPE_CHAR_ALT, 48},
-            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
-            {LLAMA_GRETYPE_CHAR_ALT, 95},
-            {LLAMA_GRETYPE_RULE_REF, 10},
-            {LLAMA_GRETYPE_ALT, 0},
-            {LLAMA_GRETYPE_END, 0},
-        },
-        {
-            {LLAMA_GRETYPE_CHAR, 48},
-            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
-            {LLAMA_GRETYPE_RULE_REF, 11},
-            {LLAMA_GRETYPE_ALT, 0},
-            {LLAMA_GRETYPE_CHAR, 48},
-            {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
-            {LLAMA_GRETYPE_END, 0},
-        },
-        {
-            {LLAMA_GRETYPE_CHAR, 32},
-            {LLAMA_GRETYPE_CHAR_ALT, 9},
-            {LLAMA_GRETYPE_CHAR_ALT, 10},
-            {LLAMA_GRETYPE_RULE_REF, 12},
-            {LLAMA_GRETYPE_ALT, 0},
-            {LLAMA_GRETYPE_END, 0},
-        },
-    };
-
-    for (auto pair : expected)
-    {
-        parsed_grammar.symbol_ids[pair.first] = pair.second;
-    }
-
-    for (auto rule : expected_rules)
-    {
-        parsed_grammar.rules.emplace_back();
-        for (auto element : rule)
-        {
-            parsed_grammar.rules.back().push_back(element);
-        }
-    }
-
-    llama_grammar * grammar = NULL;
-    std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
-
-    grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
-    if (grammar == nullptr)
-    {
-        throw std::runtime_error("Failed to initialize llama_grammar");
-    }
-
-    std::vector<std::vector<llama_grammar_element>> expected_stacks = {
-        {
-            {LLAMA_GRETYPE_RULE_REF, 5},
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_CHAR, 97},
-        },
-        {
-            {LLAMA_GRETYPE_RULE_REF, 5},
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_CHAR, 48},
-        },
-        {
-            {LLAMA_GRETYPE_RULE_REF, 5},
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_CHAR, 48},
-        },
-        {
-            {LLAMA_GRETYPE_RULE_REF, 5},
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_CHAR, 40},
-        },
-        {
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_CHAR, 97},
-        },
-        {
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_CHAR, 48},
-        },
-        {
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_RULE_REF, 3},
-            {LLAMA_GRETYPE_CHAR, 48},
-        },
-        {
-            {LLAMA_GRETYPE_CHAR, 61},
-            {LLAMA_GRETYPE_RULE_REF, 7},
-            {LLAMA_GRETYPE_CHAR, 40},
-        }};
-
-    auto index = 0;
-    for (auto stack : llama_grammar_get_stacks(grammar))
-    {
-        // compare stack to expected_stack
-        for (uint32_t i = 0; i < stack.size(); i++)
-        {
-            auto element = stack[i];
-            auto expected_element = expected_stacks[index][i];
-
-            // pretty print error message before asserting
-            if (expected_element.type != element->type || expected_element.value != element->value)
-            {
-                fprintf(stderr, "index: %d\n", index);
-                fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
-                fprintf(stderr, "actual_element: %d, %u\n", element->type, element->value);
-                fprintf(stderr, "expected_element != actual_element\n");
-            }
-
-            assert(expected_element.type == element->type && expected_element.value == element->value);
-        }
-        index++;
-    }
-
-    std::vector<llama_grammar_candidate> next_candidates;
-    next_candidates.resize(24);
-
-    for (size_t i = 0; i < 24; ++i)
-    {
-        uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
-        cp[0] = 37 + i;
-        cp[1] = 0;
-        next_candidates[i] = {i, cp, {}};
-    }
-
-    std::vector<std::vector<std::pair<uint32_t, uint16_t>>> expected_reject = {
-        {
-            {0, 37},
-            {1, 38},
-            {2, 39},
-            {3, 40},
-            {4, 41},
-            {5, 42},
-            {6, 43},
-            {7, 44},
-            {8, 45},
-            {9, 46},
-            {10, 47},
-            {11, 48},
-            {12, 49},
-            {13, 50},
-            {14, 51},
-            {15, 52},
-            {16, 53},
-            {17, 54},
-            {18, 55},
-            {19, 56},
-            {20, 57},
-            {21, 58},
-            {22, 59},
-            {23, 60},
-        },
-        {
-            {0, 37},
-            {1, 38},
-            {2, 39},
-            {3, 40},
-            {4, 41},
-            {5, 42},
-            {6, 43},
-            {7, 44},
-            {8, 45},
-            {9, 46},
-            {10, 47},
-            {21, 58},
-            {22, 59},
-            {23, 60},
-        },
-        {
-            {0, 37},
-            {1, 38},
-            {2, 39},
-            {3, 40},
-            {4, 41},
-            {5, 42},
-            {6, 43},
-            {7, 44},
-            {8, 45},
-            {9, 46},
-            {10, 47},
-            {21, 58},
-            {22, 59},
-            {23, 60},
-        },
-        {
-            {0, 37},
-            {1, 38},
-            {2, 39},
-            {4, 41},
-            {5, 42},
-            {6, 43},
-            {7, 44},
-            {8, 45},
-            {9, 46},
-            {10, 47},
-            {11, 48},
-            {12, 49},
-            {13, 50},
-            {14, 51},
-            {15, 52},
-            {16, 53},
-            {17, 54},
-            {18, 55},
-            {19, 56},
-            {20, 57},
-            {21, 58},
-            {22, 59},
-            {23, 60},
-        },
-        {
-            {0, 37},
-            {1, 38},
-            {2, 39},
-            {3, 40},
-            {4, 41},
-            {5, 42},
-            {6, 43},
-            {7, 44},
-            {8, 45},
-            {9, 46},
-            {10, 47},
-            {11, 48},
-            {12, 49},
-            {13, 50},
-            {14, 51},
-            {15, 52},
-            {16, 53},
-            {17, 54},
-            {18, 55},
-            {19, 56},
-            {20, 57},
-            {21, 58},
-            {22, 59},
-            {23, 60},
-        },
-        {
-            {0, 37},
-            {1, 38},
-            {2, 39},
-            {3, 40},
-            {4, 41},
-            {5, 42},
-            {6, 43},
-            {7, 44},
-            {8, 45},
-            {9, 46},
-            {10, 47},
-            {21, 58},
-            {22, 59},
-            {23, 60},
-        },
-        {
-            {0, 37},
-            {1, 38},
-            {2, 39},
-            {3, 40},
-            {4, 41},
-            {5, 42},
-            {6, 43},
-            {7, 44},
-            {8, 45},
-            {9, 46},
-            {10, 47},
-            {21, 58},
-            {22, 59},
-            {23, 60},
-        },
-        {
-            {0, 37},
-            {1, 38},
-            {2, 39},
-            {4, 41},
-            {5, 42},
-            {6, 43},
-            {7, 44},
-            {8, 45},
-            {9, 46},
-            {10, 47},
-            {11, 48},
-            {12, 49},
-            {13, 50},
-            {14, 51},
-            {15, 52},
-            {16, 53},
-            {17, 54},
-            {18, 55},
-            {19, 56},
-            {20, 57},
-            {21, 58},
-            {22, 59},
-            {23, 60},
-        },
-    };
-
-    std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);
-
-    std::vector<std::vector<llama_grammar_candidate>> all_rejects;
-
-    for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
-    {
-        rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
-        all_rejects.push_back(rejects);
-    }
-
-    index = 0;
-    for (auto rej : all_rejects)
-    {
-        for (uint32_t i = 0; i < rej.size(); i++)
-        {
-            auto element = rej[i];
-            auto expected_element = expected_reject[index][i];
-            assert(element.index == expected_element.first && *element.code_points == expected_element.second);
-        }
-        index++;
-    }
-
-    for (auto &candidate : next_candidates)
-    {
-        delete[] candidate.code_points;
-        candidate.code_points = nullptr;
-    }
-    llama_grammar_free(grammar);
-    return 0;
-}
--- a/tests/test-model-load-cancel.cpp
+++ b/tests/test-model-load-cancel.cpp
@ -1,27 +0,0 @@
-#include "llama.h"
-#include "get-model.h"
-
-#include <cstdlib>
-
-int main(int argc, char *argv[] ) {
-    auto * model_path = get_model_or_exit(argc, argv);
-    auto * file = fopen(model_path, "r");
-    if (file == nullptr) {
-        fprintf(stderr, "no model at '%s' found\n", model_path);
-        return EXIT_FAILURE;
-    }
-
-    fprintf(stderr, "using '%s'\n", model_path);
-    fclose(file);
-
-    llama_backend_init();
-    auto params = llama_model_params{};
-    params.use_mmap = false;
-    params.progress_callback = [](float progress, void * ctx){
-        (void) ctx;
-        return progress > 0.50;
-    };
-    auto * model = llama_load_model_from_file(model_path, params);
-    llama_backend_free();
-    return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
-}
--- a/tests/test-opt.cpp
+++ b/tests/test-opt.cpp
@ -1,181 +0,0 @@
-#include "ggml.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-
-#define MAX_NARGS 2
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif
-
-//
-// logging
-//
-// GGML_DEBUG selects the verbosity: the GGML_PRINT_DEBUG* macros forward to
-// printf when the level is at least 1, 5 or 10 respectively and expand to
-// nothing otherwise. GGML_PRINT always forwards to printf.
-//
-#define GGML_DEBUG 0
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-
-// Returns rand() scaled by RAND_MAX, i.e. a pseudo-random float in [0, 1].
-static float frand(void) {
-    const float sample = (float)rand();
-    return sample/(float)RAND_MAX;
-}
-
-// Creates an F32 tensor of shape ne[0..ndims) in ctx0 and fills it with
-// frand()-based values scaled into [fmin, fmax]. ndims must be 1..4; any other
-// value trips the assert and leaves the tensor data untouched.
-static struct ggml_tensor * get_random_tensor(
-    struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
-) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
-
-    if (ndims < 1 || ndims > 4) {
-        assert(false);
-        return result;
-    }
-
-    // Element k = i0 + ne[0]*(i1 + ne[1]*(i2 + ne[2]*i3)), so walking the
-    // dimensions with i0 innermost is the same as one linear pass over the data.
-    int64_t count = 1;
-    for (int d = 0; d < ndims; d++) {
-        count = ne[d] > 0 ? count*ne[d] : 0;
-    }
-
-    float * values = (float *)result->data;
-    for (int64_t k = 0; k < count; k++) {
-        values[k] = frand()*(fmax - fmin) + fmin;
-    }
-
-    return result;
-}
-
-// Builds the loss e = sum((c - mul_mat(a, b))^2) with a and b as trainable
-// parameters, runs ggml_opt() with the default ADAM parameters and checks
-// that the loss did not increase. Returns 0 on success, -1 otherwise.
-int main(void) {
-    struct ggml_init_params params = {
-        /* .mem_size   = */ 1024*1024*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ false,
-    };
-
-    struct ggml_context * ctx = ggml_init(params);
-
-    int64_t shape_a[4] = {4, 128, 1, 1};
-    int64_t shape_b[4] = {4, 256, 1, 1};
-    int64_t shape_c[4] = {128, 256, 1, 1};
-
-    // trainable inputs
-    struct ggml_tensor * a = get_random_tensor(ctx, 2, shape_a, -1, +1);
-    struct ggml_tensor * b = get_random_tensor(ctx, 2, shape_b, -1, +1);
-    ggml_set_param(ctx, a);
-    ggml_set_param(ctx, b);
-
-    // fixed random target
-    struct ggml_tensor * c = get_random_tensor(ctx, 2, shape_c, -1, +1);
-
-    // loss: sum of squared differences between the target and a*b
-    struct ggml_tensor * ab       = ggml_mul_mat(ctx, a, b);
-    struct ggml_tensor * residual = ggml_sub(ctx, c, ab);
-    struct ggml_tensor * e        = ggml_sum(ctx, ggml_sqr(ctx, residual));
-
-    struct ggml_cgraph * ge = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
-    ggml_build_forward_expand(ge, e);
-    ggml_graph_reset(ge);
-
-    // loss before optimization
-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
-
-    const float fe = ggml_get_f32_1d(e, 0);
-    printf("%s: e = %.4f\n", __func__, fe);
-
-    struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
-
-    ggml_opt(ctx, opt_params, e);
-
-    // loss after optimization
-    ggml_graph_reset(ge);
-
-    ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
-
-    const float fe_opt = ggml_get_f32_1d(e, 0);
-    printf("%s: original  e = %.4f\n", __func__, fe);
-    printf("%s: optimized e = %.4f\n", __func__, fe_opt);
-
-    const bool success = (fe_opt <= fe);
-    assert(success);
-
-    ggml_free(ctx);
-
-    const int exit_code = success ? 0 : -1;
-    return exit_code;
-}
-// Recorded results of previous runs for different tensor shapes:
-//
-// int64_t ne1[4] = {4, 128, 1, 1};
-// int64_t ne2[4] = {4, 256, 1, 1};
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 25890.9375
-// main: optimized e = 10094.7031
-
-// int64_t ne1[4] = {8, 128, 1, 1};
-// int64_t ne2[4] = {8, 256, 1, 1};
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 39429.5078
-// main: optimized e = 9275.8936
-
-// int64_t ne1[4] = {16, 128, 1, 1};
-// int64_t ne2[4] = {16, 256, 1, 1};
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 68371.1328
-// main: optimized e = 7854.4502
-
-
-// int64_t ne1[4] = {32, 128, 1, 1};
-// int64_t ne2[4] = {32, 256, 1, 1};
-// int64_t ne3[4] = {128, 256, 1, 1};
-// main: original  e = 126061.1953
-// main: optimized e = 5451.0166
-
-// int64_t ne1[4] = {4, 1024, 1, 1};
-// int64_t ne2[4] = {4, 2048, 1, 1};
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 1620817.8750
-// main: optimized e = 698387.6875
-
-// another run on M1
-// int64_t ne1[4] = {4, 1024, 1, 1};
-// int64_t ne2[4] = {4, 2048, 1, 1};
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 1629595.6250
-// main: optimized e = 698169.1250
-
-// int64_t ne1[4] = {32, 1024, 1, 1};
-// int64_t ne2[4] = {32, 2048, 1, 1};
-// int64_t ne3[4] = {1024, 2048, 1, 1};
-// main: original  e = 8146770.5000
-// main: optimized e = 651119.1250
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@ -1,185 +0,0 @@
-// Unit tests for quantization specific functions - quantize, dequantize and dot product
-
-#include "ggml.h"
-
-#undef NDEBUG
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-// Pass/fail thresholds used by main(); a check passes when its error is strictly below the limit.
-constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f; // from_float vs from_float_ref round trips
-constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f; // round-trip error, default for all other types
-constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f; // round-trip error for Q2_K and IQ2_S
-constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f; // round-trip error for Q3_K and IQ3_S
-constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f; // round-trip error for IQ3_XXS
-constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f; // dot product error, default
-constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f; // dot product error for Q2_K, IQ2_XS, IQ2_XXS, IQ2_S, IQ3_XXS, IQ3_S
-
-// Indexed by the boolean "failed" flag when printing a result line.
-static const char* RESULT_STR[] = {"ok", "FAILED"};
-
-
-// Fill dst[0..n) with the deterministic signal 0.1 + 2*cos(i + offset)
-static void generate_data(float offset, size_t n, float * dst) {
-    size_t i = 0;
-    while (i < n) {
-        dst[i] = 0.1 + 2*cosf(i + offset);
-        i++;
-    }
-}
-
-// Error metric between two float arrays: sqrt(sum of squared differences) / n.
-// NOTE(review): despite the name this is not the textbook RMSE, sqrt(sum / n);
-// the thresholds in this file are presumably tuned to this formula - confirm
-// before changing it.
-static float array_rmse(const float * a1, const float * a2, size_t n) {
-    double sq_sum = 0;
-    for (size_t i = 0; i != n; ++i) {
-        const double delta = a1[i] - a2[i];
-        sq_sum += delta * delta;
-    }
-    const float root = sqrtf(sq_sum);
-    return root / n;
-}
-
-// Round-trip error: quantize test_data with from_float, dequantize it with
-// to_float and compare the result against the input using array_rmse
-static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
-    std::vector<uint8_t> quantized(2*test_size);
-    std::vector<float> restored(test_size);
-
-    uint8_t * q_buf = quantized.data();
-    float * out_buf = restored.data();
-
-    qfns.from_float(test_data, q_buf, test_size);
-    qfns.to_float(q_buf, out_buf, test_size);
-    return array_rmse(test_data, out_buf, test_size);
-}
-
-// Difference between the from_float and from_float_ref quantizers: both
-// results are dequantized with to_float and compared using array_rmse
-static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
-    std::vector<uint8_t> quantized(2*test_size);
-    std::vector<float> restored(test_size);
-    std::vector<float> restored_ref(test_size);
-
-    uint8_t * q_buf = quantized.data();
-
-    // regular quantizer
-    qfns.from_float(test_data, q_buf, test_size);
-    qfns.to_float(q_buf, restored.data(), test_size);
-
-    // reference quantizer, reusing the same scratch buffer
-    qfns.from_float_ref(test_data, q_buf, test_size);
-    qfns.to_float(q_buf, restored_ref.data(), test_size);
-
-    return array_rmse(restored.data(), restored_ref.data(), test_size);
-}
-
-// Plain float dot product of two arrays, accumulated in double precision
-static float dot_product(const float * a1, const float * a2, size_t test_size) {
-    double acc = 0;
-    size_t i = 0;
-    while (i < test_size) {
-        acc += a1[i] * a2[i];
-        ++i;
-    }
-    return acc;
-}
-
-// Absolute difference between the quantized vec_dot result and the float
-// dot product of the same inputs, divided by the number of elements
-static float dot_product_error(
-    ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
-) {
-    std::vector<uint8_t> quantized_lhs(2*test_size);
-    std::vector<uint8_t> quantized_rhs(2*test_size);
-
-    // the second operand is quantized with the traits of qfns.vec_dot_type
-    auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
-
-    qfns.from_float(test_data1, quantized_lhs.data(), test_size);
-    vdot.from_float(test_data2, quantized_rhs.data(), test_size);
-
-    float quantized_dot = INFINITY;
-    qfns.vec_dot(test_size, &quantized_dot, 0, quantized_lhs.data(), 0, quantized_rhs.data(), 0, 1);
-
-    const float exact_dot = dot_product(test_data1, test_data2, test_size);
-    const float abs_error = fabsf(quantized_dot - exact_dot);
-
-    return abs_error / test_size;
-}
-
-// Runs the round-trip, reference and dot product checks for every ggml type
-// that provides both from_float and to_float. Pass -v to also print the
-// checks that succeed. Returns 1 if any check failed or an argument is
-// unknown, 0 otherwise.
-int main(int argc, char * argv[]) {
-    bool verbose = false;
-    const size_t test_size = 32 * 128;
-
-    std::string arg;
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-
-        if (arg != "-v") {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            return 1;
-        }
-        verbose = true;
-    }
-
-    std::vector<float> test_data(test_size);
-    std::vector<float> test_data2(test_size);
-
-    generate_data(0.0, test_data.size(), test_data.data());
-    generate_data(1.0, test_data2.size(), test_data2.data());
-
-    // Initialize GGML, ensures float conversion tables are initialized
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ 1*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ true,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
-
-    int num_failed = 0;
-    bool failed = false;
-
-    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-        const ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
-
-        // deprecated - skip
-        if (qfns.blck_size == 0) {
-            continue;
-        }
-
-        printf("Testing %s\n", ggml_type_name(type));
-        ggml_quantize_init(type);
-
-        if (!qfns.from_float || !qfns.to_float) {
-            continue;
-        }
-
-        // per-type limits; low-bit types get looser bounds
-        float max_quantization_error = MAX_QUANTIZATION_TOTAL_ERROR;
-        float max_allowed_error      = MAX_DOT_PRODUCT_ERROR;
-        switch (type) {
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_IQ2_S:
-                max_quantization_error = MAX_QUANTIZATION_TOTAL_ERROR_2BITS;
-                max_allowed_error      = MAX_DOT_PRODUCT_ERROR_LOWBIT;
-                break;
-            case GGML_TYPE_Q3_K:
-                max_quantization_error = MAX_QUANTIZATION_TOTAL_ERROR_3BITS;
-                break;
-            case GGML_TYPE_IQ3_S:
-                max_quantization_error = MAX_QUANTIZATION_TOTAL_ERROR_3BITS;
-                max_allowed_error      = MAX_DOT_PRODUCT_ERROR_LOWBIT;
-                break;
-            case GGML_TYPE_IQ3_XXS:
-                max_quantization_error = MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS;
-                max_allowed_error      = MAX_DOT_PRODUCT_ERROR_LOWBIT;
-                break;
-            case GGML_TYPE_IQ2_XS:
-            case GGML_TYPE_IQ2_XXS:
-                max_allowed_error      = MAX_DOT_PRODUCT_ERROR_LOWBIT;
-                break;
-            default:
-                break;
-        }
-
-        const float total_error = total_quantization_error(qfns, test_size, test_data.data());
-        failed = !(total_error < max_quantization_error);
-        num_failed += failed;
-        if (failed || verbose) {
-            printf("%5s absolute quantization error:    %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
-        }
-
-        const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
-        failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
-        num_failed += failed;
-        if (failed || verbose) {
-            printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
-        }
-
-        const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
-        failed = !(vec_dot_error < max_allowed_error);
-        num_failed += failed;
-        if (failed || verbose) {
-            printf("%5s dot product error:              %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
-        }
-    }
-
-    if (num_failed || verbose) {
-        printf("%d tests failed\n", num_failed);
-    }
-
-    ggml_free(ctx);
-
-    return num_failed > 0;
-}
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@ -1,363 +0,0 @@
-// Benchmark quantization specific functions on synthetic data
-
-#include "ggml.h"
-
-#undef NDEBUG
-#include <algorithm>
-#include <assert.h>
-#include <functional>
-#include <inttypes.h>
-#include <math.h>
-#include <memory>
-#include <stdio.h>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#define MAX_ALIGNMENT 64
-#define QK 32
-#define WARMUP 5
-#define ITERATIONS 10
-#define MAX_ITERATIONS 100000000
-
-// Preset test sizes in number of float values; parenthesized so the macros
-// expand safely inside larger expressions (e.g. x / L1_SIZE).
-#define L1_SIZE  (32*128)
-#define L2_SIZE  (32*2048)
-#define L3_SIZE  (32*20480)
-#define MEM_SIZE (32*2048000)
-
-// Command-line configuration of the benchmark, filled in by main().
-struct quantize_perf_params {
-    std::vector<std::string> include_types; // names given with --type; empty means all types
-    std::vector<size_t> test_sizes; // sizes from --size, -3 and -4; main() falls back to L1_SIZE when empty
-    size_t alignment_offset = 0; // byte offset added to the aligned buffers (--alignment-offset)
-    // operations selected with --op; main() enables all of them when none is selected
-    bool op_quantize_row_q_reference = false;
-    bool op_quantize_row_q = false;
-    bool op_dequantize_row_q = false;
-    bool op_quantize_row_q_dot = false;
-    bool op_vec_dot_q = false;
-    int64_t iterations = ITERATIONS; // timed repetitions per measurement (-i, --iterations)
-};
-
-#if defined(__x86_64__) || defined(__i386__)
-
-#include <x86intrin.h>
-// Returns the value of the __rdtscp/__rdtsc intrinsic as int64_t.
-// NOTE(review): presumably the CPU time-stamp counter, so differences between
-// two calls approximate elapsed cycles - confirm against the intrinsics guide.
-inline int64_t cpu_cycles() {
-// Rough way to detect new-ish CPUs
-#ifdef __POPCNT__
-    unsigned int dummy; // receives the auxiliary output of __rdtscp; unused here
-    return __rdtscp(&dummy);
-#else
-    return __rdtsc();
-#endif
-}
-
-#else
-
-#define cpu_cycles() 0
-
-#endif
-
-
-// Fill dst[0..n) with the deterministic signal 0.1 + 2*cos(i + offset)
-static void generate_data(float offset, size_t n, float * dst) {
-    size_t i = 0;
-    while (i < n) {
-        dst[i] = 0.1 + 2*cosf(i + offset);
-        i++;
-    }
-}
-
-// Converts bytes processed in usecs microseconds into GiB per second
-static float gigabytes_per_second(size_t bytes, int64_t usecs) {
-    const float bytes_per_usec = bytes / (float) usecs;
-    return bytes_per_usec * 1000000 / (1024*1024*1024);
-}
-
-// Aligns ptr to MAX_ALIGNMENT with std::align and returns the aligned
-// address advanced by offset bytes. The caller's buffer must be large enough
-// for the alignment padding plus offset.
-static void * align_with_offset(void * ptr, int offset) {
-    size_t space = MAX_ALIGNMENT * 4; // capacity handed to std::align
-    void * aligned = std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, space);
-    return (char *) aligned + offset;
-}
-
-// Calls func WARMUP times untimed, then iterations times while measuring wall
-// time (ggml_time_us) and cpu_cycles(), and prints min/avg cycles per QK
-// values plus the float32 and quantized throughput.
-//   size   - number of values processed by one call of func
-//   q_size - size in bytes of the quantized data for one call
-static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
-    int64_t best_us      = INT64_MAX;
-    int64_t sum_us       = 0;
-    int64_t best_cycles  = INT64_MAX;
-    int64_t sum_cycles   = 0;
-
-    for (int warm = WARMUP; warm > 0; warm--) {
-        func();
-    }
-
-    for (int i = 0; i < iterations; i++) {
-        // the cycle counter is sampled innermost, the wall clock outermost
-        const int64_t start_time = ggml_time_us();
-        const int64_t start_cycles = cpu_cycles();
-
-        func();
-
-        const int64_t end_cycles = cpu_cycles();
-        const int64_t end_time = ggml_time_us();
-
-        const int64_t elapsed_cycles = end_cycles - start_cycles;
-        const int64_t elapsed_us     = end_time - start_time;
-
-        sum_cycles += elapsed_cycles;
-        best_cycles = std::min(best_cycles, elapsed_cycles);
-        sum_us += elapsed_us;
-        best_us = std::min(best_us, elapsed_us);
-    }
-
-    printf("      min cycles/%d vals   : %9.2f\n",  QK, QK * best_cycles / (float) size);
-    printf("      avg cycles/%d vals   : %9.2f\n",  QK, QK * sum_cycles / (float) (size * iterations));
-    printf("      float32 throughput   : %9.2f GB/s\n",  gigabytes_per_second(4 * size * iterations, sum_us));
-    printf("      quantized throughput : %9.2f GB/s\n",  gigabytes_per_second(q_size * iterations, sum_us));
-}
-
-// Prints the command-line help, including the names of all ggml types that
-// provide both from_float and to_float.
-static void usage(char * argv[]) {
-    printf(
-        "Benchmark quantization specific functions on synthetic data\n"
-        "\n"
-        "usage: %s [options]\n"
-        "\n"
-        "options: (default)\n"
-        "  -h, --help            show this help message and exit\n",
-        argv[0]);
-    printf("  --size SIZE           set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
-    printf("  -3                    use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
-    printf("  -4                    use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
-    printf(
-        "  --op OP               set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n"
-        "                        quantize_row_q_dot, vec_dot_q (all)\n"
-        "  --type TYPE           set test type as");
-    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-        const ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
-        const char * type_name = ggml_type_name(type);
-        if (type_name == NULL || !qfns.from_float || !qfns.to_float) {
-            continue;
-        }
-        printf(" %s", type_name);
-    }
-    printf(
-        " (all)\n"
-        "  --alignment-offset OFFSET\n"
-        "                        set alignment offset as OFFSET (0)\n"
-        "  -i NUM, --iterations NUM\n");
-    printf("                        set test iteration number (%d)\n", ITERATIONS);
-}
-
-// Parses text as a decimal int into out. Returns false instead of throwing
-// when std::stoi rejects the text (not a number, or outside the int range).
-static bool parse_int_arg(const char * text, int & out) {
-    try {
-        out = std::stoi(text);
-        return true;
-    } catch (...) {
-        return false;
-    }
-}
-
-// Prints the section header for op_name and benchmarks op once per entry of
-// sizes. op(size) is the timed call; it returns a float taken from its output.
-template <typename Op>
-static void benchmark_sizes(const char * op_name, ggml_type type, const std::vector<size_t> & sizes, int64_t iterations, Op op) {
-    printf("  %s\n", op_name);
-    for (size_t size : sizes) {
-        printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
-        auto timed_fn = [&](void) -> float {
-            return op(size);
-        };
-        size_t quantized_size = ggml_row_size(type, size);
-        benchmark_function(size, quantized_size, iterations, timed_fn);
-    }
-    printf("\n");
-}
-
-// Benchmarks the quantization functions of the selected ggml types on
-// synthetic data. See usage() for the accepted options. Returns 0 on success
-// and 1 after printing help or on invalid arguments.
-int main(int argc, char * argv[]) {
-    quantize_perf_params params {};
-
-    // read command line
-
-    bool invalid_param = false;
-    std::string arg;
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-
-        if (arg == "--size") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            int size = 0;
-            // reject non-numeric and non-positive sizes; a negative value
-            // would otherwise wrap around to a huge size_t
-            if (!parse_int_arg(argv[i], size) || size <= 0) {
-                invalid_param = true;
-                break;
-            }
-            if (size % 32 != 0) {
-                fprintf(stderr, "error: size %zu not divisible by 32\n", (size_t) size);
-                invalid_param = true;
-                break;
-            }
-            params.test_sizes.push_back(size);
-        } else if (arg == "-3") {
-            // quick select sizes that probably fit in CPU caches
-            params.test_sizes.push_back(L1_SIZE);
-            params.test_sizes.push_back(L2_SIZE);
-            params.test_sizes.push_back(L3_SIZE);
-        } else if (arg == "-4") {
-            // quick select cache sizes + memory
-            params.test_sizes.push_back(L1_SIZE);
-            params.test_sizes.push_back(L2_SIZE);
-            params.test_sizes.push_back(L3_SIZE);
-            params.test_sizes.push_back(MEM_SIZE);
-        } else if (arg == "--op") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            std::string op {argv[i]};
-            if (op == "quantize_row_q_reference") {
-                params.op_quantize_row_q_reference = true;
-            } else if (op == "quantize_row_q") {
-                params.op_quantize_row_q = true;
-            } else if (op == "dequantize_row_q") {
-                params.op_dequantize_row_q = true;
-            } else if (op == "quantize_row_q_dot") {
-                params.op_quantize_row_q_dot = true;
-            } else if (op == "vec_dot_q") {
-                params.op_vec_dot_q = true;
-            } else {
-                invalid_param = true;
-                break;
-            }
-        } else if (arg == "--type") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            params.include_types.push_back(argv[i]);
-        } else if (arg == "--alignment-offset") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            int alignment = 0;
-            if (!parse_int_arg(argv[i], alignment)) {
-                invalid_param = true;
-                break;
-            }
-            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
-                fprintf(stderr, "error: alignment-offset must be between 0 and %d\n", MAX_ALIGNMENT);
-                invalid_param = true;
-                break;
-            }
-            params.alignment_offset = alignment;
-        } else if ((arg == "-i") || (arg == "--iterations")) {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            int number = 0;
-            if (!parse_int_arg(argv[i], number)) {
-                invalid_param = true;
-                break;
-            }
-            // at least one timed iteration is required: with none the minimum
-            // stays at INT64_MAX and the report overflows
-            if (number < 1 || number > MAX_ITERATIONS) {
-                fprintf(stderr, "error: iterations must be between 1 and %d\n", MAX_ITERATIONS);
-                invalid_param = true;
-                break;
-            }
-            params.iterations = number;
-        } else if ((arg == "-h") || (arg == "--help")) {
-            usage(argv);
-            return 1;
-        } else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            return 1;
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        return 1;
-    }
-
-    // defaults: L1-sized input and all operations
-    if (params.test_sizes.empty()) {
-        params.test_sizes.push_back(L1_SIZE);
-    }
-    if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
-        params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
-    }
-
-    std::sort(params.test_sizes.begin(), params.test_sizes.end());
-    size_t largest = params.test_sizes.back();
-
-    // 4 bytes per value plus slack for the alignment padding and offset
-    std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
-    std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
-    std::vector<uint8_t> test_q1_v   (largest*4 + MAX_ALIGNMENT*2);
-    std::vector<uint8_t> test_q2_v   (largest*4 + MAX_ALIGNMENT*2);
-    std::vector<uint8_t> test_out_v  (largest*4 + MAX_ALIGNMENT*2);
-
-    float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
-    float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
-    float * test_q1    = (float *) align_with_offset(test_q1_v.data(),    params.alignment_offset);
-    float * test_q2    = (float *) align_with_offset(test_q2_v.data(),    params.alignment_offset);
-    float * test_out   = (float *) align_with_offset(test_out_v.data(),   params.alignment_offset);
-
-    generate_data(0, largest, test_data1);
-    generate_data(1, largest, test_data2);
-
-    int64_t iterations = params.iterations;
-
-
-    // Initialize GGML, ensures float conversion tables are initialized
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ 1*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ true,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
-
-    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
-        ggml_type type = (ggml_type) i;
-        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
-        if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
-            continue;
-        }
-
-        if (!qfns.from_float || !qfns.to_float) {
-            continue;
-        }
-
-        printf("%s\n", ggml_type_name(type));
-
-        ggml_quantize_init(type);
-
-        if (params.op_quantize_row_q_reference) {
-            benchmark_sizes("quantize_row_q_reference", type, params.test_sizes, iterations, [&](size_t size) -> float {
-                qfns.from_float_ref(test_data1, test_q1, size);
-                return test_q1[0];
-            });
-        }
-
-        if (params.op_quantize_row_q) {
-            benchmark_sizes("quantize_row_q", type, params.test_sizes, iterations, [&](size_t size) -> float {
-                qfns.from_float(test_data1, test_q1, size);
-                return test_q1[0];
-            });
-        }
-
-        if (params.op_dequantize_row_q) {
-            // prepare quantized input once, outside the timed region
-            qfns.from_float(test_data1, test_q1, largest);
-            benchmark_sizes("dequantize_row_q", type, params.test_sizes, iterations, [&](size_t size) -> float {
-                qfns.to_float(test_q1, test_out, size);
-                return test_out[0];
-            });
-        }
-
-        if (params.op_quantize_row_q_dot) {
-            benchmark_sizes("quantize_row_q_dot", type, params.test_sizes, iterations, [&](size_t size) -> float {
-                // the traits lookup stays inside the timed call, as before
-                auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
-                vdot.from_float(test_data1, test_q1, size);
-                return test_q1[0];
-            });
-        }
-
-        if (params.op_vec_dot_q) {
-            // prepare both quantized operands once, outside the timed region
-            qfns.from_float(test_data1, test_q1, largest);
-            qfns.from_float(test_data2, test_q2, largest);
-            benchmark_sizes("vec_dot_q", type, params.test_sizes, iterations, [&](size_t size) -> float {
-                float result;
-                qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
-                return result;
-            });
-        }
-    }
-
-    ggml_free(ctx);
-
-    return 0;
-}
--- a/tests/test-rope.cpp
+++ b/tests/test-rope.cpp
@ -1,220 +0,0 @@
-#include "ggml.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#if defined(__GNUC__)
-#pragma GCC diagnostic ignored "-Wdouble-promotion"
-#endif
-
-#define MAX_NARGS 3
-
-#undef MIN
-#undef MAX
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
-#define GGML_SILU_FP16
-
-//
-// logging
-//
-// The GGML_PRINT_DEBUG* macros forward to printf when GGML_DEBUG is at least
-// 1, 5 or 10 respectively and expand to nothing otherwise. GGML_PRINT always
-// forwards to printf.
-// NOTE(review): GGML_DEBUG is not defined in the visible part of this file; an
-// undefined name evaluates to 0 in #if, which would disable all debug output
-// unless an included header defines it - confirm.
-//
-
-#if (GGML_DEBUG >= 1)
-#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG(...)
-#endif
-
-#if (GGML_DEBUG >= 5)
-#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_5(...)
-#endif
-
-#if (GGML_DEBUG >= 10)
-#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
-#else
-#define GGML_PRINT_DEBUG_10(...)
-#endif
-
-#define GGML_PRINT(...) printf(__VA_ARGS__)
-
-// Returns rand() scaled by RAND_MAX, i.e. a pseudo-random float in [0, 1].
-static float frand(void) {
-    const float sample = (float)rand();
-    return sample/(float)RAND_MAX;
-}
-
-// Returns rand() % n, or 0 without calling rand() when n is 0.
-static int irand(int n) {
-    return n == 0 ? 0 : rand()%n;
-}
-
-// Sets all four entries of dims to 1, then overwrites the first ndims entries
-// with random sizes in 1..4.
-static void get_random_dims(int64_t * dims, int ndims) {
-    for (int d = 0; d < 4; d++) {
-        dims[d] = 1;
-    }
-
-    for (int d = 0; d < ndims; d++) {
-        dims[d] = 1 + irand(4);
-    }
-}
-
-// Creates an F32 tensor of shape ne[0..ndims) in ctx0 and fills it with
-// frand()-based values scaled into [fmin, fmax]. ndims must be 1..4; any other
-// value trips the assert and leaves the tensor data untouched.
-static struct ggml_tensor * get_random_tensor_f32(
-        struct ggml_context * ctx0,
-        int ndims,
-        const int64_t ne[],
-        float fmin,
-        float fmax) {
-    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);
-
-    if (ndims < 1 || ndims > 4) {
-        assert(false);
-        return result;
-    }
-
-    // Element k = i0 + ne[0]*(i1 + ne[1]*(i2 + ne[2]*i3)), so walking the
-    // dimensions with i0 innermost is the same as one linear pass over the data.
-    int64_t count = 1;
-    for (int d = 0; d < ndims; d++) {
-        count = ne[d] > 0 ? count*ne[d] : 0;
-    }
-
-    float * values = (float *)result->data;
-    for (int64_t k = 0; k < count; k++) {
-        values[k] = frand()*(fmax - fmin) + fmin;
-    }
-
-    return result;
-}
-
-// Plans graph for n_threads, backs the plan's work buffer with buf (resized
-// to plan.work_size when that is non-zero) and computes the graph.
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    const bool needs_work_buffer = plan.work_size > 0;
-    if (needs_work_buffer) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
-// Checks that RoPE rotations compose: applying ggml_rope at positions
-// n_past_0 + i followed by the constant shift (n_past_2 - n_past_0) must match
-// applying it directly at positions n_past_2 + i, for modes 0, 2 and 4.
-int main(int /*argc*/, const char ** /*argv*/) {
-    struct ggml_init_params params = {
-        /* .mem_size   = */ 128*1024*1024,
-        /* .mem_buffer = */ NULL,
-        /* .no_alloc   = */ false,
-    };
-
-    std::vector<uint8_t> work_buffer;
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
-    const int modes[3] = { 0, 2, 4 };
-
-    // rope f32
-    for (int m = 0; m < 3; ++m) {
-        const int ndims = 4;
-
-        const int64_t n_rot = 128;
-        const int64_t ne[4] = { 2*n_rot, 32, 73, 1 };
-
-        const int n_past_0 = 100;
-        const int n_past_2 = 33;
-
-        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
-
-        int32_t * p0_data = (int32_t *) p0->data;
-        int32_t * p1_data = (int32_t *) p1->data;
-        int32_t * p2_data = (int32_t *) p2->data;
-
-        for (int i = 0; i < ne[2]; ++i) {
-            p0_data[i] = n_past_0 + i;        // 100, 101, 102, ..., 172
-            p1_data[i] = n_past_2 - n_past_0; // -67, -67, -67, ..., -67
-            p2_data[i] = n_past_2 + i;        //  33,  34,  35, ..., 105
-        }
-
-        const int mode = modes[m];
-
-        struct ggml_tensor * x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-
-        struct ggml_tensor * r0 = ggml_rope(ctx0, x,  p0, n_rot, mode);
-        // "context swap", i.e. forget n_past_0 - n_past_2 tokens
-        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode);
-
-        // direct rotation to the shifted positions
-        struct ggml_tensor * r2 = ggml_rope(ctx0, x,  p2, n_rot, mode);
-
-        ggml_cgraph * gf = ggml_new_graph(ctx0);
-
-        ggml_build_forward_expand(gf, r0);
-        ggml_build_forward_expand(gf, r1);
-        ggml_build_forward_expand(gf, r2);
-
-        ggml_graph_compute_helper(work_buffer, gf, 4);
-
-        // check that r1 and r2 are the same
-        {
-            double sum0 = 0.0f;
-            double sum1 = 0.0f;
-            double diff = 0.0f;
-
-            const float * r1_data = (float *) r1->data;
-            const float * r2_data = (float *) r2->data;
-
-            const int n_elements = ggml_nelements(r1);
-
-            for (int i = 0; i < n_elements; ++i) {
-                const float lhs = r1_data[i];
-                const float rhs = r2_data[i];
-
-                sum0 += fabs(lhs);
-                sum1 += fabs(rhs);
-                diff += fabs(lhs - rhs);
-            }
-
-            printf("mode: %d\n", mode);
-            printf("sum0: %f\n", sum0);
-            printf("sum1: %f\n", sum1);
-            printf("diff: %f\n", diff);
-            printf("rel err: %f\n", diff / sum0);
-            printf("rel err: %f\n", diff / sum1);
-
-            GGML_ASSERT(diff / sum0 < 0.0001f);
-            GGML_ASSERT(diff / sum1 < 0.0001f);
-        }
-    }
-
-    ggml_free(ctx0);
-
-    return 0;
-}
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@ -1,301 +0,0 @@
-#include "ggml.h"
-#include "llama.h"
-
-#ifdef NDEBUG
-#undef NDEBUG
-#endif
-
-#include <algorithm>
-#include <cmath>
-#include <string>
-#include <vector>
-
-// Write every candidate to stdout as "<id>: <p> (<logit>)", one candidate per line.
-static void dump(const llama_token_data_array * candidates) {
-    const llama_token_data * cur = candidates->data;
-    const llama_token_data * const last = cur + candidates->size;
-    for (; cur != last; ++cur) {
-        printf("%d: %f (%f)\n", cur->id, cur->p, cur->logit);
-    }
-}
-
-// Print a "file:line (function)" header, dump the candidates, then print a "-" separator line.
-#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
-
-// Exercise llama_sample_top_k.
-//
-// One candidate is created per entry of `probs` (token id = index, logit = log(prob)). The
-// candidates go through softmax and then top-k with the given `k`; what survives must have the
-// same length as `expected_probs` and match it element-wise to within 1e-5.
-static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(probs.size());
-    for (llama_token id = 0; id < (llama_token) probs.size(); ++id) {
-        candidates.push_back(llama_token_data{id, logf(probs[id]), 0.0f});
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
-
-    llama_sample_top_k(nullptr, &candidates_p, k, 1);
-    DUMP(&candidates_p);
-
-    // the number of survivors is checked first so the indexed comparison below stays in range
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
-    }
-}
-
-// Exercise llama_sample_top_p.
-//
-// Candidates are built from `probs` (token id = index, logit = log(prob)), normalised with
-// softmax and filtered with top-p threshold `p`. The remaining probabilities must match
-// `expected_probs` in count and, element-wise, to within 1e-3.
-static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(probs.size());
-    for (llama_token id = 0; id < (llama_token) probs.size(); ++id) {
-        candidates.push_back(llama_token_data{id, logf(probs[id]), 0.0f});
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
-
-    llama_sample_top_p(nullptr, &candidates_p, p, 1);
-    DUMP(&candidates_p);
-
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
-}
-
-// Exercise llama_sample_tail_free.
-//
-// Candidates are built from `probs` (token id = index, logit = log(prob)) and passed straight to
-// the tail-free sampler with parameter `z`; this test does not call softmax itself. The
-// resulting probabilities must match `expected_probs` in count and to within 1e-3.
-static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(probs.size());
-    for (llama_token id = 0; id < (llama_token) probs.size(); ++id) {
-        candidates.push_back(llama_token_data{id, logf(probs[id]), 0.0f});
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-
-    llama_sample_tail_free(nullptr, &candidates_p, z, 1);
-    DUMP(&candidates_p);
-
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
-}
-
-// Exercise llama_sample_min_p.
-//
-// Candidates are built from `probs` (token id = index, logit = log(prob)) and filtered with
-// min-p threshold `p`. Softmax is applied only afterwards, so `expected_probs` holds the
-// probabilities renormalised over the surviving tokens; they are compared to within 1e-3.
-static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(probs.size());
-    for (llama_token id = 0; id < (llama_token) probs.size(); ++id) {
-        candidates.push_back(llama_token_data{id, logf(probs[id]), 0.0f});
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-
-    llama_sample_min_p(nullptr, &candidates_p, p, 1);
-    DUMP(&candidates_p);
-
-    // renormalise the survivors before comparing against the expected values
-    llama_sample_softmax(nullptr, &candidates_p);
-
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
-}
-
-// Exercise llama_sample_typical.
-//
-// Candidates are built from `probs` (token id = index, logit = log(prob)) and passed to the
-// typical sampler with parameter `p`; this test does not call softmax itself. The resulting
-// probabilities must match `expected_probs` in count and to within 1e-3.
-static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(probs.size());
-    for (llama_token id = 0; id < (llama_token) probs.size(); ++id) {
-        candidates.push_back(llama_token_data{id, logf(probs[id]), 0.0f});
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-    DUMP(&candidates_p);
-
-    llama_sample_typical(nullptr, &candidates_p, p, 1);
-    DUMP(&candidates_p);
-
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
-}
-
-// Exercise llama_sample_repetition_penalties.
-//
-// Candidates are built from `probs` (token id = index, logit = log(prob)) and normalised with
-// softmax. The penalties (`repeat_penalty`, `alpha_frequency`, `alpha_presence`) are then applied
-// using `last_tokens` as the token history, followed by a second softmax. The final
-// probabilities must match `expected_probs` to within 1e-3. `probs` and `expected_probs` must
-// have the same length.
-static void test_repetition_penalties(
-    const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
-    const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
-) {
-    GGML_ASSERT(probs.size() == expected_probs.size());
-
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(probs.size());
-    for (llama_token id = 0; id < (llama_token) probs.size(); ++id) {
-        candidates.push_back(llama_token_data{id, logf(probs[id]), 0.0f});
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
-
-    const llama_token * history = (const llama_token *) last_tokens.data();
-    llama_sample_repetition_penalties(nullptr, &candidates_p, history, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
-
-    // renormalise after the penalties have been applied
-    llama_sample_softmax(nullptr, &candidates_p);
-    DUMP(&candidates_p);
-
-    GGML_ASSERT(candidates_p.size == expected_probs.size());
-    for (size_t i = 0; i < candidates_p.size; i++) {
-        GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
-    }
-}
-
-// Run a sequence of samplers over a synthetic vocabulary and check the surviving token range
-// after every step.
-//
-// Token i gets logit log(i), so its unnormalised probability is i itself: token n_vocab-1 is the
-// most likely and token 0 has probability 0. After each sampler the candidates are re-sorted via
-// softmax and the test asserts the number of survivors plus the ids of the first (most likely)
-// and last (least likely) survivor.
-//
-//   n_vocab            number of synthetic tokens
-//   samplers_sequence  one character per sampler, applied left to right:
-//                      'k' = top-k, 'p' = top-p, 'm' = min-p (other letters abort the test)
-//   top_k/top_p/min_p  parameters passed to the respective samplers
-static void test_sampler_queue(
-    const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p
-) {
-    std::vector<llama_token_data> candidates;
-    candidates.reserve(n_vocab);
-    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
-        const float logit = logf(token_id);
-        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
-    }
-
-    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-    // smallest token id expected to survive so far; carried across samplers in the sequence
-          llama_token min_token_id = 0;
-    const llama_token max_token_id = n_vocab-1;
-
-    for (auto s : samplers_sequence) {
-        switch (s){
-            case 'k': llama_sample_top_k    (nullptr, &candidates_p, top_k, 1); break;
-            case 'f': GGML_ASSERT(false && "tail_free test not implemented");   break;
-            case 'y': GGML_ASSERT(false && "typical test not implemented");     break;
-            case 'p': llama_sample_top_p    (nullptr, &candidates_p, top_p, 1); break;
-            case 'm': llama_sample_min_p    (nullptr, &candidates_p, min_p, 1); break;
-            case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
-            default : GGML_ASSERT(false && "Unknown sampler");                  break;
-        }
-
-        llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
-
-        const int size = candidates_p.size;
-
-        if (s == 'k') {
-            const int expected_size = std::min(size, top_k);
-            min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));
-
-            GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
-        } else if (s == 'p') {
-            // sum of the unnormalised probabilities (= token ids) of the tokens still in play
-            const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
-            const int softmax_numerator_target = ceilf(top_p * softmax_divisor);
-
-                min_token_id  = n_vocab;
-            int expected_size = 0;
-            int cumsum        = 0;
-            do { // do-while because always at least one token is sampled
-                min_token_id--;
-                expected_size++;
-
-                cumsum += min_token_id;
-            } while (cumsum < softmax_numerator_target);
-
-            // token 0 has p == 0, need special consideration for cumsum because top_p immediately returns
-            if (min_token_id == 1) {
-                min_token_id--;
-                expected_size += 1;
-            }
-
-            GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
-        } else if (s == 'm') {
-            int expected_size = ceilf((1.0f-min_p) * n_vocab);
-            expected_size = std::max(expected_size, 1);
-            expected_size = std::min(expected_size, size);
-
-            min_token_id = floorf(min_p * n_vocab);
-            min_token_id = std::max(min_token_id, 1);
-            min_token_id = std::max(min_token_id, (llama_token)(n_vocab - size));
-            min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
-
-            GGML_ASSERT(size == expected_size);
-            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
-            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-
-    // n_vocab is a size_t, so it must be printed with %zu (%ld is wrong where long is 32-bit)
-    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
-           samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
-}
-
-// Entry point of the sampling test: runs every sampler check below in order. Each helper aborts
-// via GGML_ASSERT on a mismatch, so reaching the final "OK" means every check passed.
-int main(void) {
-    ggml_time_init();
-
-    // top-k: the last case (k = 0) expects all four candidates to be kept
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
-    test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
-
-    // top-p: p = 0 still expects the single most likely candidate
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
-    test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
-
-    // min-p: expected values are written as p / (sum of surviving p), i.e. renormalised
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f},            0.26f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f},            0.49f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f},                       0.51f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f},                       0.74f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  0.76f);
-    test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f},                                  1.00f);
-
-    // tail-free sampling
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
-    test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
-
-    // typical sampling
-    test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
-    test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
-
-    // repetition penalties: repeat_penalty = 50 with no frequency/presence penalty
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0},   50.0f, 0.0f, 0.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0},       50.0f, 0.0f, 0.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
-
-    // repetition penalties: repeat_penalty = 1 with frequency and presence penalties of 5
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0},             {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2},       {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
-    test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
-
-    // sampler queue: single samplers at their extreme parameter values
-    test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
-    test_sampler_queue(10000, "k",     1, 1.0f, 1.0f);
-    test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f);
-    test_sampler_queue(10000, "p", 10000, 0.0f, 1.0f);
-    test_sampler_queue(10000, "m", 10000, 1.0f, 1.0f);
-    test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12);
-
-    // sampler queue: single samplers with intermediate parameter values
-    test_sampler_queue(10000, "k",   100, 1.0000f, 1.0f);
-    test_sampler_queue(10000, "p", 10000, 0.0002f, 1.0f);
-    test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f);
-    test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f);
-    test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f);
-
-    // sampler queue: pairs of samplers in both orders
-    test_sampler_queue(10000, "kp", 100, 0.8f, 0.1f);
-    test_sampler_queue(10000, "km", 100, 0.8f, 0.1f);
-    test_sampler_queue(10000, "pk", 100, 0.8f, 0.1f);
-    test_sampler_queue(10000, "pm", 100, 0.8f, 0.1f);
-    test_sampler_queue(10000, "mk", 100, 0.8f, 0.1f);
-    test_sampler_queue(10000, "mp", 100, 0.8f, 9997.9f/9999.0f);
-    test_sampler_queue(10000, "mp", 100, 0.8f, 0.1f);
-
-    // sampler queue: all six orderings of the three samplers
-    test_sampler_queue(10000, "kpm", 100, 0.8f, 0.1f);
-    test_sampler_queue(10000, "kmp", 100, 0.8f, 0.1f);
-    test_sampler_queue(10000, "pkm", 100, 0.8f, 0.1f);
-    test_sampler_queue(10000, "pmk", 100, 0.8f, 0.1f);
-    test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f);
-    test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f);
-
-    printf("OK\n");
-
-    return 0;
-}
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@ -1,292 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "console.h"
-
-#include <cstdio>
-#include <string>
-#include <map>
-#include <vector>
-#include <fstream>
-
-//static const std::map<std::string, std::vector<llama_token>> & k_tests() {
-//    static std::map<std::string, std::vector<llama_token>> _k_tests = {
-//        { ""                      , {  }, },
-//        { " "                     , {     220, }, },
-//        { "  "                    , {     256, }, },
-//        { "   "                   , {     262, }, },
-//        { "\t"                    , {     197, }, },
-//        { "\n"                    , {     198, }, },
-//        { "\n\n"                  , {     271, }, },
-//        { "\n\n\n"                , {    1432, }, },
-//        { "\t\n"                  , {    1602, }, },
-//        { "Hello world"           , {    9906,   1917, }, },
-//        { " Hello world"          , {   22691,   1917, }, },
-//        { "Hello World"           , {    9906,   4435, }, },
-//        { " Hello World"          , {   22691,   4435, }, },
-//        { " Hello World!"         , {   22691,   4435,      0, }, },
-//        { "Hello, world!"         , {    9906,     11,   1917,      0, }, },
-//        { " Hello, world!"        , {   22691,     11,   1917,      0, }, },
-//        { " this is 🦙.cpp"        , {     420,    374,  11410,     99,    247,     13,  11055, }, },
-//        { "w048 7tuijk dsdfhu"    , {      86,  23904,    220,     22,     83,   2005,  42908,  11729,   3013,  17156, }, },
-//        { "нещо на Български"     , {   79862, 102118,  13373,  64571,  34694,   3114, 112203,  80112, }, },
-//        { "កាន់តែពិសេសអាចខលចេញ"   , {   21549,    222,  98629,    241,  45358,    233,  21549,    237,  45358,    224,  21549,    244,  21549,    115,  21549,    253,  45358,    223,  21549,    253,  21549,     95,  98629,    227,  21549,    223,  21549,    249,  21549,    227,  45358,    223,  21549,    231, }, },
-//        { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", {    9468,    248,    222,    320,   8416,      8,  27623,    114, 102470,   9468,    234,    104,  31643,    320,  36773, 100166,  98634,      8,  26602,    227,    320,   3323,  43465,    430,    706,   1202,   1866,   4037,      8, }, },
-//        { "Hello"                 , {    9906, }, },
-//        { " Hello"                , {   22691, }, },
-//        { "  Hello"               , {     220,  22691, }, },
-//        { "   Hello"              , {     256,  22691, }, },
-//        { "    Hello"             , {     262,  22691, }, },
-//        { "    Hello\n    Hello"  , {     262,  22691,    198,    262,  22691, }, },
-//        { " ("                    , {     320, }, },
-//        { "\n ="                  , {     198,    284, }, },
-//        { "' era"                 , {       6,  11639, }, },
-//        { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～", {    9906,     11,    379,  65948,      0,   2650,    527,    499,  27623,    223,    949,  37046, 101067,  19000,  23182, 102301,   9263,  18136,     16,  36827,  21909, }, },
-//        { "3"                     , {      18, }, },
-//        { "33"                    , {    1644, }, },
-//        { "333"                   , {    8765, }, },
-//        { "3333"                  , {    8765,     18, }, },
-//        { "33333"                 , {    8765,   1644, }, },
-//        { "333333"                , {    8765,   8765, }, },
-//        { "3333333"               , {    8765,   8765,     18, }, },
-//        { "33333333"              , {    8765,   8765,   1644, }, },
-//        { "333333333"             , {    8765,   8765,   8765, }, },
-//    };
-//
-//    return _k_tests;
-//}
-
-using llama_tests = std::map<std::string, std::vector<llama_token>>;
-
-// Load tokenizer test cases from a pair of files.
-//
-// `fname_inp` holds the input texts, separated by the line "__ggml_vocab_test__". `fname_out`
-// holds one line per input with the expected token ids separated by single spaces. Returns a map
-// from input text to expected tokens. If either file cannot be opened, or the number of inputs
-// differs from the number of output lines, an error is printed to stderr and the map is
-// returned empty.
-static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) {
-    llama_tests result;
-
-    // slurp the whole input file; the individual texts may span several lines
-    std::ifstream inp_stream(fname_inp);
-    if (!inp_stream) {
-        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str());
-        return result;
-    }
-
-    std::string raw;
-    raw.assign(std::istreambuf_iterator<char>(inp_stream), std::istreambuf_iterator<char>());
-
-    // expected outputs: one line per test
-    std::ifstream out_stream(fname_out);
-    if (!out_stream) {
-        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
-        return result;
-    }
-
-    std::vector<std::string> expected_lines;
-    {
-        std::string line;
-        while (std::getline(out_stream, line)) {
-            expected_lines.push_back(line);
-        }
-    }
-
-    // split the raw input on the separator line
-    const std::string sep = "\n__ggml_vocab_test__\n";
-
-    std::vector<std::string> inputs;
-    for (size_t start = 0; start < raw.size(); ) {
-        const size_t hit = raw.find(sep, start);
-        if (hit == std::string::npos) {
-            // no further separator: the remainder is the last test
-            inputs.push_back(raw.substr(start));
-            break;
-        }
-        inputs.push_back(raw.substr(start, hit - start));
-        start = hit + sep.size();
-    }
-
-    if (inputs.size() != expected_lines.size()) {
-        fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__);
-        return result;
-    }
-
-    for (size_t idx = 0; idx < inputs.size(); ++idx) {
-        const std::string stripped = string_strip(expected_lines[idx]);
-
-        // parse the space-separated token ids
-        std::vector<llama_token> ids;
-        for (size_t beg = 0; beg < stripped.size(); ) {
-            const size_t space = stripped.find(' ', beg);
-            const size_t end   = space == std::string::npos ? stripped.size() : space;
-
-            ids.push_back(std::stoi(stripped.substr(beg, end - beg)));
-            beg = end + 1;
-        }
-
-        result[inputs[idx]] = ids;
-    }
-
-    return result;
-}
-
-// Tokenizer test driver.
-//
-// Usage: <prog> vocab-file [text-file]
-//
-// Without a text file, the test cases in "<vocab-file>.inp" / "<vocab-file>.out" are tokenized
-// and compared against the expected token ids. With a text file, that file is tokenized instead,
-// the time taken is reported, and the token ids are written to "<text-file>.tokcpp".
-//
-// Returns 0 on success, 1 on usage/load/file errors and 3 if any test case failed.
-//
-// NOTE(review): the early `return 1` paths after the context has been created (text file or
-// output file cannot be opened) and the exit(1) in the k_tests lambda skip the
-// llama_free*/llama_backend_free calls at the end - confirm this is acceptable for a test binary.
-int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    // the test fixtures live next to the vocab file
-    const std::string fname_inp = fname + ".inp";
-    const std::string fname_out = fname + ".out";
-
-    std::string fname_text;
-    if (argc > 2) {
-        fname_text = argv[2];
-    }
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    bool success = true;
-
-    // the fixture-based tests are only loaded when no text file was given
-    const auto k_tests = [&]() -> llama_tests {
-        if (!fname_text.empty()) {
-            return {};
-        }
-
-        const auto res = read_tests(fname_inp, fname_out);
-
-        if (res.empty()) {
-            fprintf(stderr, "%s : error: no tests found\n", __func__);
-            exit(1);
-        }
-
-        return res;
-    }();
-
-    const bool add_special = false;
-
-    // tokenize every test input and compare against the expected token ids
-    for (const auto & test_kv : k_tests) {
-        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
-
-        printf("\n");
-        printf("src: '%s'\n", test_kv.first.c_str());
-        printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
-        printf("tok: ");
-        for (const auto & tok : res) {
-            printf("%d ", tok);
-        }
-        printf("\n");
-
-        // sizes must match before the element-wise comparison is meaningful
-        bool correct = res.size() == test_kv.second.size();
-        for (int i = 0; i < (int) res.size() && correct; ++i) {
-            if (test_kv.second[i] != res[i]) {
-                correct = false;
-            }
-        }
-
-        if (!correct) {
-            fprintf(stderr, "%s : failed test:    '%s'\n", __func__, test_kv.first.c_str());
-            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
-                llama_detokenize(ctx, res).c_str(),
-                llama_detokenize(ctx, test_kv.second).c_str());
-            fprintf(stderr, "%s : expected tokens: ", __func__);
-            for (const auto & t : test_kv.second) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s : got tokens:      ", __func__);
-            for (const auto & t : res) {
-                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
-            }
-            fprintf(stderr, "\n");
-
-            // keep going so every failing case is reported
-            success = false;
-        }
-    }
-
-    // optional mode: tokenize a whole text file and write the token ids next to it
-    if (!fname_text.empty()) {
-        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
-
-        std::string text;
-        {
-            std::ifstream ifs(fname_text);
-            if (!ifs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
-                return 1;
-            }
-            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
-        }
-
-        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
-
-        std::vector<llama_token> res;
-
-        {
-            const auto t_start = ggml_time_us();
-
-            res = llama_tokenize(ctx, text, add_special, false);
-
-            const auto t_end = ggml_time_us();
-
-            fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0);
-        }
-
-        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
-
-        {
-            // note: this shadows the outer fname_out (the ".out" fixture path)
-            const std::string fname_out = fname_text + ".tokcpp";
-
-            std::ofstream ofs(fname_out);
-            if (!ofs) {
-                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
-                return 1;
-            }
-
-            // one token id per line
-            for (const auto & tok : res) {
-                //ofs << tok << " '" << string_strip(llama_detokenize(ctx, std::vector<int>{tok})) << "'" << std::endl;
-                ofs << tok << "\n";
-            }
-        }
-
-        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
-    }
-
-    // NOTE(review): the model is freed before the context that was created from it - confirm
-    // that this order is safe for the llama API version in use.
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    printf("\n");
-    printf("Tests %s\n", success ? "passed" : "failed");
-
-    return success ? 0 : 3;
-}
--- a/tests/test-tokenizer-0.py
+++ b/tests/test-tokenizer-0.py
@ -1,46 +0,0 @@
-import time
-import argparse
-
-from transformers import AutoTokenizer
-
-parser = argparse.ArgumentParser()
-parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
-parser.add_argument("--fname-tok",   help="path to a text file to tokenize", required=True)
-args = parser.parse_args()
-
-dir_tokenizer = args.dir_tokenizer
-fname_tok = args.fname_tok
-
-tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
-
-print('tokenizing file: ', fname_tok) # noqa: NP100
-fname_out = fname_tok + '.tok'
-with open(fname_tok, 'r', encoding='utf-8') as f:
-    lines = f.readlines()
-    s = ''.join(lines)
-    t_start = time.time()
-    res = tokenizer.encode(s, add_special_tokens=False)
-    t_end = time.time()
-    print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100
-    with open(fname_out, 'w', encoding='utf-8') as f:
-        for x in res:
-            # LLaMA v3 for some reason strips the space for these tokens (and others)
-            # if x == 662:
-            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-            # elif x == 1174:
-            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-            # elif x == 2564:
-            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-            # elif x == 758:
-            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-            # elif x == 949:
-            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-            # elif x == 5354:
-            #     f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
-            # else:
-            #     f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
-            # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
-            f.write(str(x) + '\n')
-    print('len(res): ', len(res)) # noqa: NP100
-    print('len(lines): ', len(lines)) # noqa: NP100
-print('results written to: ', fname_out) # noqa: NP100
--- a/tests/test-tokenizer-0.sh
+++ b/tests/test-tokenizer-0.sh
@ -1,41 +0,0 @@
-#!/bin/bash
-#
-# Tokenize a text file with both the Python AutoTokenizer and llama.cpp, then diff the results.
-#
-# Usage:
-#
-#   test-tokenizer-0.sh <name> <input>
-#
-#   <name>   tokenizer name; selects ./models/tokenizers/<name> and ./models/ggml-vocab-<name>.gguf
-#   <input>  text file to tokenize; the results are written to <input>.tok and <input>.tokcpp
-#
-
-if [ $# -ne 2 ]; then
-    # pass $0 as an argument so a '%' in the script path is not treated as a format directive
-    printf "Usage: %s <name> <input>\n" "$0"
-    exit 1
-fi
-
-name=$1
-input=$2
-
-make -j tests/test-tokenizer-0
-
-# all expansions below are quoted so names/paths with spaces or glob characters stay one word
-printf "Testing %s on %s ...\n" "$name" "$input"
-
-# abort if either tokenizer run (or the timing grep) fails
-set -e
-
-printf "Tokenizing using (py)  Python AutoTokenizer ...\n"
-python3 ./tests/test-tokenizer-0.py "./models/tokenizers/$name" --fname-tok "$input" > "/tmp/test-tokenizer-0-$name-py.log" 2>&1
-
-printf "Tokenizing using (cpp) llama.cpp ...\n"
-./tests/test-tokenizer-0 "./models/ggml-vocab-$name.gguf" "$input" > "/tmp/test-tokenizer-0-$name-cpp.log" 2>&1
-
-grep "tokenized in" "/tmp/test-tokenizer-0-$name-py.log"
-grep "tokenized in" "/tmp/test-tokenizer-0-$name-cpp.log"
-
-# a non-zero diff status is an expected outcome, not a script error
-set +e
-
-if diff "$input.tok" "$input.tokcpp" > /dev/null 2>&1; then
-    printf "Tokenization is correct!\n"
-else
-    # show only the beginning of the difference
-    diff "$input.tok" "$input.tokcpp" | head -n 32
-
-    printf "Tokenization differs!\n"
-fi
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@ -1,152 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "unicode.h"
-#include "console.h"
-
-#include <cassert>
-#include <codecvt>
-#include <cstdio>
-#include <cstring>
-#include <locale>
-#include <string>
-#include <thread>
-#include <vector>
-#include <atomic>
-
-// BPE tokenizer round-trip test.
-//
-// Usage: <prog> <vocab-file> [--ignore-merges]
-//
-// 1. For every token id in the vocab: detokenize it, tokenize the text again and check that
-//    detokenizing the result reproduces the same text. With --ignore-merges the re-tokenization
-//    must additionally yield a single token.
-// 2. For every Unicode code point (surrogates and the 0x40000..0xE0000 range excluded): encode
-//    it as UTF-8, tokenize, detokenize and check that the text is unchanged. The work is spread
-//    over several threads.
-//
-// Returns 0 on success, 1 on usage/load errors, 2 on a token round-trip failure, 3 on a code
-// point round-trip failure and 99 if the vocab is not of BPE type.
-int main(int argc, char **argv) {
-    if (argc < 2 || argc > 3) {
-        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-    bool ignore_merges = false;
-    if (argc == 3) {
-        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
-            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
-            return 1;
-        }
-        ignore_merges = true;
-    }
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    if (ignore_merges) {
-        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
-    }
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    // a non-BPE vocab is reported with a dedicated exit code instead of an assertion failure
-    //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
-    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
-        return 99;
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    const int n_vocab = llama_n_vocab(model);
-
-    // token round-trip: id -> text -> tokens -> text
-    for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize(ctx, std::vector<int>(1, i));
-        try {
-            // only called for its side effect: it throws std::invalid_argument (caught below)
-            // for text that cannot be decoded, in which case the token is skipped
-            auto cps = unicode_cpts_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-            if (ignore_merges && tokens.size() > 1) {
-                fprintf(stderr,
-                        "%s : error: token %d detokenizes to '%s'(%zu) but "
-                        "tokenization of this to multiple tokens: [",
-                        __func__, i, str.c_str(), str.length());
-                fprintf(stderr, "%d", tokens[0]);
-                for (size_t j = 1; j < tokens.size(); j++) {
-                    fprintf(stderr, ", %d", tokens[j]);
-                }
-                fprintf(stderr, "]\n");
-                return 2;
-            }
-            std::string check = llama_detokenize(ctx, tokens);
-            if (check != str) {
-                fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
-                    __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
-                return 2;
-            }
-        }
-        catch (const std::invalid_argument &) {
-            //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
-        }
-    }
-
-    // unicode
-    {
-        // hardware_concurrency() may return 0 when the value cannot be determined; fall back to
-        // one thread so the code point check is never silently skipped
-        const unsigned int n_hw = std::thread::hardware_concurrency();
-        const int nthread = n_hw > 0 ? (int) n_hw : 1;
-
-        std::vector<std::thread> threads(nthread);
-
-        // first error code reported by any worker; non-zero also makes the other workers stop
-        std::atomic_int errcode = {};
-
-        for (int i = 0; i < nthread; ++i) {
-            // worker i handles code points i, i + nthread, i + 2*nthread, ...
-            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
-                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
-                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) ||  // surrogates \p{Cs}
-                        (0x00040000 <= cp && cp <= 0x000E0000)) {  // undefined  \p{Cn}
-                        continue;
-                    }
-
-                    std::string str = unicode_cpt_to_utf8(cp);
-                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-                    std::string check = llama_detokenize(ctx, tokens);
-                    // code point 9601 (U+2581) is exempt from the round-trip requirement
-                    if (cp != 9601 && str != check) {
-                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                                cp, check.c_str(), check.length(), str.c_str(), str.length());
-                        errcode = 3;
-                    }
-                }
-            });
-        }
-
-        for (auto & t : threads) {
-            t.join();
-        }
-
-        if (errcode) {
-            return errcode;
-        }
-    }
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return 0;
-}
--- a/tests/test-tokenizer-1-spm.cpp
+++ b/tests/test-tokenizer-1-spm.cpp
@ -1,122 +0,0 @@
-#include "llama.h"
-#include "common.h"
-#include "unicode.h"
-#include "console.h"
-
-#include <cassert>
-#include <codecvt>
-#include <cstdio>
-#include <cstring>
-#include <locale>
-#include <string>
-#include <thread>
-#include <vector>
-#include <atomic>
-
-int main(int argc, char ** argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
-        return 1;
-    }
-
-    const std::string fname = argv[1];
-
-    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
-
-    llama_model * model;
-    llama_context * ctx;
-
-    llama_backend_init();
-
-    // load the vocab
-    {
-        auto mparams = llama_model_default_params();
-
-        mparams.vocab_only = true;
-
-        model = llama_load_model_from_file(fname.c_str(), mparams);
-
-        if (model == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            return 1;
-        }
-
-        auto cparams = llama_context_default_params();
-
-        ctx = llama_new_context_with_model(model, cparams);
-
-        if (ctx == NULL) {
-            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
-            llama_free_model(model);
-            return 1;
-        }
-    }
-
-    //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
-    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) {
-        return 99;
-    }
-
-#ifdef _WIN32
-    // We need this for unicode console support
-    console::init(false, false);
-    atexit([]() { console::cleanup(); });
-#endif
-
-    const int n_vocab = llama_n_vocab(model);
-
-    for (int i = 0; i < n_vocab; ++i) {
-        std::string str = llama_detokenize(ctx, std::vector<int>(1, i), true);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-        std::string check = llama_detokenize(ctx, tokens);
-        if (check != str) {
-            fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
-                __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
-            return 2;
-        }
-    }
-
-    // unicode
-    {
-        const int nthread = std::thread::hardware_concurrency();
-
-        std::vector<std::thread> threads(nthread);
-
-        std::atomic_int errcode = {};
-
-        for (int i = 0; i < nthread; ++i) {
-            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
-                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
-                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) ||  // surrogates \p{Cs}
-                        (0x00040000 <= cp && cp <= 0x000E0000)) {  // undefined \p{Cn}
-                        continue;
-                    }
-
-                    std::string str = unicode_cpt_to_utf8(cp);
-                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
-                    std::string check = llama_detokenize(ctx, tokens);
-                    if (cp != 9601 && str != check) {
-                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                                cp, check.c_str(), check.length(), str.c_str(), str.length());
-                        errcode = 3;
-                    }
-                }
-            });
-        }
-
-        for (auto & t : threads) {
-            t.join();
-        }
-
-        if(errcode) {
-            return errcode;
-        }
-    }
-
-    llama_free_model(model);
-    llama_free(ctx);
-
-    llama_backend_free();
-
-    return 0;
-}
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@ -1,566 +0,0 @@
-# Test libllama tokenizer == AutoTokenizer.
-# Brute force random words/text generation.
-#
-# Sample usage:
-#
-#   python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
-#
-
-from __future__ import annotations
-
-import time
-import logging
-import argparse
-import subprocess
-import random
-import unicodedata
-
-from pathlib import Path
-from typing import Any, Iterator, cast
-from typing_extensions import Buffer
-
-import cffi
-from transformers import AutoTokenizer, PreTrainedTokenizer
-
-
-logger = logging.getLogger("test-tokenizer-random")
-
-
-class LibLlama:
-
-    DEFAULT_PATH_LLAMA_H = "./include/llama.h"
-    DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"]
-    DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON
-
-    def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None):
-        path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
-        path_includes = path_includes or self.DEFAULT_PATH_INCLUDES
-        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
-        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_includes, path_libllama)
-        self.lib.llama_backend_init()
-
-    def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str) -> tuple[cffi.FFI, Any]:
-        cmd = ["gcc", "-O0", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="]
-        cmd += ["-I" + path for path in path_includes] + [path_llama_h]
-        res = subprocess.run(cmd, stdout=subprocess.PIPE)
-        assert (res.returncode == 0)
-        source = res.stdout.decode()
-        ffi = cffi.FFI()
-        if True:  # workarounds for pycparser
-            source = "typedef struct { } __builtin_va_list;" + "\n" + source
-            source = source.replace("sizeof (int)",    str(ffi.sizeof("int")))
-            source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
-            source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
-            source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
-        ffi.cdef(source, override=True)
-        lib = ffi.dlopen(path_libllama)
-        return (ffi, lib)
-
-    def model_default_params(self, **kwargs):
-        mparams = self.lib.llama_model_default_params()
-        for k, v in kwargs.items():
-            setattr(mparams, k, v)
-        return mparams
-
-    def context_default_params(self, **kwargs):
-        cparams = self.lib.llama_context_default_params()
-        for k, v in kwargs.items():
-            setattr(cparams, k, v)
-        return cparams
-
-
-class LibLlamaModel:
-
-    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
-        self.lib: Any = libllama.lib
-        self.ffi = libllama.ffi
-        if isinstance(mparams, dict):
-            mparams = libllama.model_default_params(**mparams)
-        self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
-        if not self.model:
-            raise RuntimeError("error: failed to load model '%s'" % path_model)
-        if isinstance(cparams, dict):
-            cparams = libllama.context_default_params(**cparams)
-        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
-        if not self.ctx:
-            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
-        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
-        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
-        self.text_buff = self.ffi.new("uint8_t[]", 1024)
-
-    def free(self):
-        if self.ctx:
-            self.lib.llama_free(self.ctx)
-        if self.model:
-            self.lib.llama_free_model(self.model)
-        self.ctx = None
-        self.model = None
-        self.lib = None
-
-    def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]:
-        encoded_text: bytes = text.encode("utf-8")
-        num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
-        while num < 0 and len(self.token_ids) < (16 << 20):
-            self.token_ids = self.ffi.new("llama_token[]", -2 * num)
-            num = self.lib.llama_tokenize(self.model, encoded_text, len(encoded_text), self.token_ids, len(self.token_ids), add_special, parse_special)
-        return list(self.token_ids[0:num])
-
-    def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
-        if len(self.token_ids) < len(ids):
-            self.token_ids = self.ffi.new("llama_token[]", 2 * len(ids))
-        for i, id in enumerate(ids):
-            self.token_ids[i] = id
-        num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
-        while num < 0 and len(self.text_buff) < (16 << 20):
-            self.text_buff = self.ffi.new("uint8_t[]", -2 * num)
-            num = self.lib.llama_detokenize(self.model, self.token_ids, len(ids), self.text_buff, len(self.text_buff), remove_special, unparse_special)
-        return str(cast(Buffer, self.ffi.buffer(self.text_buff, num)), encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
-
-
-class Tokenizer:
-
-    def encode(self, text: str) -> list[int]:
-        raise NotImplementedError
-
-    def decode(self, ids: list[int]) -> str:
-        raise NotImplementedError
-
-
-class TokenizerGroundtruth (Tokenizer):
-
-    def __init__(self, dir_tokenizer: str):
-        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)
-        # guess BOS and EOS
-        ids = self.encode("a")
-        assert 1 <= len(ids) <= 3
-        add_bos_token = len(ids) > 1 and self.model.bos_token_id == ids[0]
-        add_eos_token = len(ids) > 1 and self.model.eos_token_id == ids[-1]
-        self.add_bos_token = getattr(self.model, "add_bos_token", add_bos_token)
-        self.add_eos_token = getattr(self.model, "add_eos_token", add_eos_token)
-        # build vocab
-        tokens = list(self.model.get_vocab().values())
-        self.vocab = self.model.batch_decode(tokens, skip_special_tokens=True)
-        self.vocab = list(sorted(self.vocab))
-        # tokens and lists
-        self.special_tokens = list(self.model.all_special_tokens)
-        self.added_tokens   = self.model.batch_decode(self.model.added_tokens_encoder.values(), skip_special_tokens=False)
-        self.bos_token = self.model.bos_token
-        self.eos_token = self.model.eos_token
-
-    def encode(self, text: str) -> list[int]:
-        return self.model.encode(text, add_special_tokens=True)
-
-    def decode(self, ids: list[int]) -> str:
-        return self.model.decode(ids, skip_special_tokens=False)
-
-
-class TokenizerLlamaCpp (Tokenizer):
-
-    libllama: LibLlama | None = None
-
-    def __init__(self, vocab_file: str):
-        if not self.libllama:
-            self.libllama = LibLlama()
-        self.model = LibLlamaModel(self.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
-
-    def encode(self, text: str) -> list[int]:
-        return self.model.tokenize(text, add_special=True, parse_special=True)
-
-    def decode(self, ids: list[int]) -> str:
-        return self.model.detokenize(ids, remove_special=False, unparse_special=True)
-
-
-def generator_custom_text() -> Iterator[str]:
-    """General tests"""
-    yield from [
-        "",
-        " ",
-        "  ",
-        "   ",
-        "\t",
-        "\n",
-        "\n\n",
-        "\n\n\n",
-        "\t\n",
-        "Hello world",
-        " Hello world",
-        "Hello World",
-        " Hello World",
-        " Hello World!",
-        "Hello, world!",
-        " Hello, world!",
-        " this is 🦙.cpp",
-        "w048 7tuijk dsdfhu",
-        "нещо на Български",
-        "កាន់តែពិសេសអាចខលចេញ",
-        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
-        "Hello",
-        " Hello",
-        "  Hello",
-        "   Hello",
-        "    Hello",
-        "    Hello\n    Hello",
-        " (",
-        "\n =",
-        "' era",
-        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
-        "3",
-        "33",
-        "333",
-        "3333",
-        "33333",
-        "333333",
-        "3333333",
-        "33333333",
-        "333333333",
-    ]
-
-
-def generator_custom_text_edge_cases() -> Iterator[str]:
-    """Edge cases found while debugging"""
-    yield from [
-        '\x1f-a',     # unicode_ranges_control, {0x00001C, 0x00001F}
-        '¼-a',        # unicode_ranges_digit, 0x00BC
-        '½-a',        # unicode_ranges_digit, 0x00BD
-        '¾-a',        # unicode_ranges_digit, 0x00BE
-        'a 〇b',      # unicode_ranges_digit, 0x3007
-        'Ⅵ-a',       # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
-        '\uFEFF//',   # unicode_ranges_control, 0xFEFF (BOM)
-        'Cửa Việt',   # llama-3, ignore_merges = true
-        '<s>a',       # Phi-3 fail
-        '<unk><|endoftext|><s>',  # Phi-3 fail
-        'a\na',            # bert fail
-        '"`',              # falcon
-        ' \u2e4e',         # falcon
-        '\n\x0b  ',        # falcon
-        'a\xa0\xa0\x00b',  # jina-v2-es
-        'one <mask>',      # jina-v2-es  <mask> lstrip=true
-        'a </s> b',        # rstrip phi-3
-        'a <mask> b',      # lstrip jina-v2
-        '\xa0aC',          # deepseek
-        '\u2029 \uA3E4',   # deepseek-llm
-        "a ?",
-        'å',               # mpt
-        '\U000ac517',      # utf-8 encode error, falcon
-        '\U000522f4',      # utf-8 encode error, starcoder
-        "<s><s><unk><s>a<s>b<s>c<unk>d<unk></s>",
-        "<s> <s> <unk><s>a<s>b<s>c<unk>d<unk></s>",
-    ]
-
-
-def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
-    """Brute force check all vocab words"""
-    yield from tokenizer.vocab
-
-
-def generator_ascii_lr_strip() -> Iterator[str]:
-    WHITESPACES = ["", " ", "  "]
-    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
-    for char1 in CHARACTERS:
-        for char2 in CHARACTERS:
-            for lstrip in WHITESPACES:
-                for rstrip in WHITESPACES:
-                    yield lstrip + char1 + char2 + rstrip
-                    yield lstrip + char1 + rstrip + char2
-                    yield char1 + lstrip + char2 + rstrip
-
-
-def generator_apostrophe() -> Iterator[str]:
-    WHITESPACES = ["", " ", "  "]
-    CHARACTERS = list(chr(i) for i in range(1, 0x80)) + [""]
-    for char1 in CHARACTERS:
-        for char2 in CHARACTERS:
-            for lstrip in WHITESPACES:
-                for rstrip in WHITESPACES:
-                    yield char1 + lstrip + "'" + rstrip + char2
-                    yield char1 + char2 + lstrip + "'" + rstrip + "z"
-                    yield "a" + lstrip + "'" + rstrip + char1 + char2
-
-
-def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
-    WHITESPACES = ["", " ", "  ", "\n", "\r\n", "\n\n", "\t", "\t\t"]
-    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens)))
-    for token in all_tokens:
-        for lstrip in WHITESPACES:
-            for rstrip in WHITESPACES:
-                yield lstrip + token + rstrip
-                yield "a" + lstrip + token + rstrip
-                yield lstrip + token + rstrip + "z"
-                yield "a" + lstrip + token + rstrip + "z"
-
-
-def generator_random_added_tokens(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
-    separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
-    all_tokens  = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens + separations)))
-    rand = random.Random()
-    for m in range(iterations):
-        rand.seed(m)
-        words = rand.choices(all_tokens, k=500)
-        if words and words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
-            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
-                words.pop(0)
-            if tokenizer.add_bos_token:  # drop all starting BOS
-                words.pop(0)
-        if words and words[-1] == tokenizer.eos_token:  # skip spam warning of double EOS
-            while len(words) > 1 and words[-2] == tokenizer.eos_token:  # leave one trailing EOS
-                words.pop(-1)
-            if tokenizer.add_bos_token:  # drop all trailing EOS
-                words.pop(-1)
-        yield "".join(words)
-
-
-def generator_random_chars(iterations=100) -> Iterator[str]:
-    """Brute force random text with simple characters"""
-
-    NUM_WORDS = 400
-    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
-    CHARS = list(sorted(set("""
-        ABCDEFGHIJKLMNOPQRSTUVWXYZ
-        abcdefghijklmnopqrstuvwxyz
-        ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
-        áéíóúàèìòùâêîôûäëïöü
-        .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
-    """)))
-
-    rand = random.Random()
-    for m in range(iterations):
-        rand.seed(m)
-        text = []
-        for _ in range(NUM_WORDS):
-            k = rand.randint(1, 7)
-            word = rand.choices(CHARS, k=k)
-            word.append(rand.choice(WHITESPACES))
-            text.append("".join(word))
-        yield "".join(text)
-
-
-def generator_unicodes() -> Iterator[str]:
-    """Iterate unicode characters"""
-
-    MAX_CODEPOINTS = 0x30000  # 0x110000
-
-    def _valid(cpt):
-        if cpt >= 0x30000:  # unassigned and supplementary
-            return False
-        # if cpt == 0x2029:  # deepseek-llm
-        #    return False
-        if unicodedata.category(chr(cpt)) in ("Cn", "Cs", "Co"):  # undefined, surrogates, private
-            return False
-        return True
-
-    characters = [chr(cpt) for cpt in range(0, MAX_CODEPOINTS) if _valid(cpt)]
-
-    yield from characters
-
-
-def generator_random_unicodes(iterations=100) -> Iterator[str]:
-    """Brute force random text with unicode characters"""
-
-    NUM_WORDS = 200
-    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
-
-    characters = list(generator_unicodes())
-
-    rand = random.Random()
-    for m in range(iterations):
-        rand.seed(m)
-        text = []
-        for _ in range(NUM_WORDS):
-            k = rand.randint(1, 7)
-            word = rand.choices(characters, k=k)
-            word.append(rand.choice(WHITESPACES))
-            text.append("".join(word))
-        yield "".join(text)
-
-
-def generator_random_vocab_chars(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
-    """Brute force random text with vocab characters"""
-
-    vocab_chars = set()
-    for word in tokenizer.vocab:
-        vocab_chars.update(word)
-    vocab_chars = list(sorted(vocab_chars))
-
-    rand = random.Random()
-    for m in range(iterations):
-        rand.seed(m)
-        text = rand.choices(vocab_chars, k=1024)
-        yield "".join(text)
-
-
-def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
-    """Brute force random text from vocab words"""
-
-    vocab = [w.strip() for w in tokenizer.vocab]
-    yield from vocab
-
-    rand = random.Random()
-    for m in range(iterations):
-        rand.seed(m)
-        text = []
-        num_words = rand.randint(300, 400)
-        for i in range(num_words):
-            k = rand.randint(1, 3)
-            words = rand.choices(vocab, k=k)
-            sep = rand.choice("     \n\r\t")
-            text.append("".join(words) + sep)
-        yield "".join(text)
-
-
-def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]):
-
-    def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
-        for i, (a, b) in enumerate(zip(ids1, ids2)):
-            if a != b:
-                return i
-        if len(ids1) == len(ids2):
-            return -1
-        return min(len(ids1), len(ids2))
-
-    def check_detokenizer(text: str, text1: str, text2: str) -> bool:
-        if text1 == text2:  # equal to TokenizerGroundtruth?
-            return True
-        # equal to source text?
-        if tokenizer1.add_bos_token:  # remove BOS
-            if text2.startswith(tokenizer1.bos_token):
-                text2 = text2[len(tokenizer1.bos_token):]
-        if tokenizer1.add_eos_token:  # remove EOS
-            if text2.endswith(tokenizer1.eos_token):
-                text2 = text2[:-len(tokenizer1.eos_token)]
-        return text == text2
-
-    t_encode1 = 0
-    t_encode2 = 0
-    t_decode1 = 0
-    t_decode2 = 0
-    t_start = time.perf_counter()
-    encode_errors = 0
-    decode_errors = 0
-    MAX_ERRORS = 10
-
-    logger.info("%s: %s" % (generator.__qualname__, "ini"))
-    for text in generator:
-        # print(repr(text), text.encode())
-        # print(repr(text), hex(ord(text[0])), text.encode())
-        t0 = time.perf_counter()
-        ids1 = tokenizer1.encode(text)
-        t1 = time.perf_counter()
-        ids2 = tokenizer2.encode(text)
-        t2 = time.perf_counter()
-        text1 = tokenizer1.decode(ids1)
-        t3 = time.perf_counter()
-        text2 = tokenizer2.decode(ids1)
-        t4 = time.perf_counter()
-        t_encode1 += t1 - t0
-        t_encode2 += t2 - t1
-        t_decode1 += t3 - t2
-        t_decode2 += t4 - t3
-        if encode_errors < MAX_ERRORS and ids1 != ids2:
-            i = find_first_mismatch(ids1, ids2)
-            ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
-            ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
-            logger.error(" Expected: " + str(ids1))
-            logger.error("   Result: " + str(ids2))
-            encode_errors += 1
-            logger.error(f" {encode_errors=}")
-        if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
-            i = find_first_mismatch(text1, text2)
-            text1 = list(text1[max(0, i - 2) : i + 5 + 1])
-            text2 = list(text2[max(0, i - 2) : i + 5 + 1])
-            logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1))
-            logger.error("   Result: " + " ".join(hex(ord(x)) for x in text2))
-            decode_errors += 1
-            logger.error(f" {decode_errors=}")
-        if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
-            logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
-            # raise Exception()
-            break
-
-    t_total = time.perf_counter() - t_start
-    logger.info(f"{generator.__qualname__}: end,  {t_encode1=:.3f} {t_encode2=:.3f}  {t_decode1=:.3f} {t_decode2=:.3f}  {t_total=:.3f}")
-
-
-def main(argv: list[str] | None = None):
-    parser = argparse.ArgumentParser()
-    parser.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file")
-    parser.add_argument("dir_tokenizer", type=str, help="directory containing 'tokenizer.model' file")
-    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
-    args = parser.parse_args(argv)
-
-    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.INFO)
-    logger.info(f"VOCABFILE: '{args.vocab_file}'")
-
-    tokenizer1 = TokenizerGroundtruth(args.dir_tokenizer)
-    tokenizer2 = TokenizerLlamaCpp(args.vocab_file)
-
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text())
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_custom_text_edge_cases())
-    compare_tokenizers(tokenizer1, tokenizer2, generator_ascii_lr_strip())
-    compare_tokenizers(tokenizer1, tokenizer2, generator_apostrophe())
-    compare_tokenizers(tokenizer1, tokenizer2, generator_unicodes())
-    compare_tokenizers(tokenizer1, tokenizer2, generator_vocab_words(tokenizer1))
-    compare_tokenizers(tokenizer1, tokenizer2, generator_added_lr_strip(tokenizer1))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_added_tokens(tokenizer1, 10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_chars(10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_unicodes(10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_chars(tokenizer1, 10_000))
-    # compare_tokenizers(tokenizer1, tokenizer2, generator_random_vocab_words(tokenizer1, 5_000))
-
-    tokenizer2.model.free()
-
-
-if __name__ == "__main__":
-    # main()
-
-    if True:
-        logging.basicConfig(
-            level    = logging.DEBUG,
-            format   = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
-            datefmt  = "%Y-%m-%d %H:%M:%S",
-            filename = logger.name + ".log",
-            filemode = "a"
-        )
-    logging.basicConfig(
-        level    = logging.DEBUG,
-        format   = "%(levelname)s %(message)s",
-    )
-
-    path_tokenizers   = Path("./models/tokenizers/")
-    path_vocab_format = "./models/ggml-vocab-%s.gguf"
-
-    tokenizers = [
-        "llama-spm",      # SPM
-        "phi-3",          # SPM
-        "gemma",          # SPM
-        "gemma-2",        # SPM
-        "baichuan",       # SPM
-        "bert-bge",       # WPM
-        "jina-v2-en",     # WPM
-        "llama-bpe",      # BPE
-        "phi-2",          # BPE
-        "deepseek-llm",   # BPE
-        "deepseek-coder", # BPE
-        "falcon",         # BPE
-        "mpt",            # BPE
-        "starcoder",      # BPE
-        "gpt-2",          # BPE
-        "stablelm2",      # BPE
-        "refact",         # BPE
-        "qwen2",          # BPE
-        "olmo",           # BPE
-        "jina-v2-es",     # BPE
-        "jina-v2-de",     # BPE
-        "smaug-bpe",      # BPE
-        "poro-chat",      # BPE
-        "jina-v2-code",   # BPE
-        "viking",         # BPE
-        "jais",           # BPE
-    ]
-
-    logger.info("=" * 50)
-    for tokenizer in tokenizers:
-        logger.info("-" * 50)
-        logger.info(f"TOKENIZER: '{tokenizer}'")
-        vocab_file = Path(path_vocab_format % tokenizer)
-        dir_tokenizer = path_tokenizers / tokenizer
-        main([str(vocab_file), str(dir_tokenizer), "--verbose"])