Merge branch 'master' into gg/flash-attn

2024-02-19 12:58:18 +02:00 · 2024-02-19 12:58:18 +02:00 · 31109ca00a
commit 31109ca00a
parent 6875997fd6 13e2c771aa
87 changed files with 5115 additions and 1531 deletions
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -28,6 +28,7 @@ endfunction()
 llama_build_and_test_executable(test-quantize-fns.cpp)
 llama_build_and_test_executable(test-quantize-perf.cpp)
 llama_build_and_test_executable(test-sampling.cpp)
+llama_build_and_test_executable(test-chat-template.cpp)

 llama_build_executable(test-tokenizer-0-llama.cpp)
 llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
--- a/tests/test-autorelease.cpp
+++ b/tests/test-autorelease.cpp
@ -12,7 +12,7 @@ int main(int argc, char ** argv) {
    auto * model_path = get_model_or_exit(argc, argv);

    std::thread([&model_path]() {
-        llama_backend_init(false);
+        llama_backend_init();
        auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
        auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
        llama_free(ctx);
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -1095,24 +1095,32 @@ struct test_diag_mask_inf : public test_case {
 struct test_soft_max : public test_case {
    const ggml_type type;
    const std::array<int64_t, 4> ne;
-    const float scale;
    const bool mask;
+    const float scale;
+    const float max_bias;

    std::string vars() override {
-        return VARS_TO_STR4(type, ne, scale, mask);
+        return VARS_TO_STR5(type, ne, mask, scale, max_bias);
    }

    test_soft_max(ggml_type type = GGML_TYPE_F32,
            std::array<int64_t, 4> ne = {10, 10, 10, 10},
+            bool mask = false,
            float scale = 1.0f,
-            bool mask = false)
-        : type(type), ne(ne), scale(scale), mask(mask) {}
+            float max_bias = 0.0f)
+        : type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * b = nullptr;
-        if (mask) { b = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, ne[0], ne[1]); }
-        ggml_tensor * out = ggml_soft_max_ext(ctx, a, b, scale);
+        ggml_tensor * mask = nullptr;
+        if (this->mask) {
+            mask = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]);
+        }
+        ggml_tensor * pos = nullptr;
+        if (max_bias > 0.0f) {
+            pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
+        }
+        ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias);
        return out;
    }
 };
@ -1157,30 +1165,6 @@ struct test_rope : public test_case {
    }
 };

-// GGML_OP_ALIBI
-struct test_alibi : public test_case {
-    const ggml_type type;
-    const std::array<int64_t, 4> ne;
-    int n_past;
-    int n_head;
-    float bias_max;
-
-    std::string vars() override {
-        return VARS_TO_STR5(type, ne, n_past, n_head, bias_max);
-    }
-
-    test_alibi(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 10},
-            int n_past = 512, int n_head = 10, float bias_max = 0.5f)
-        : type(type), ne(ne), n_past(n_past), n_head(n_head), bias_max(bias_max) {}
-
-    ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * out = ggml_alibi(ctx, a, n_past, n_head, bias_max);
-        return out;
-    }
-};
-
 // GGML_OP_POOL2D
 struct test_pool2d : public test_case {
    enum ggml_op_pool pool_type;
@ -1568,7 +1552,7 @@ struct test_moe : public test_case {
        ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);

        ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur);
-        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, 1.0f/sqrtf(n_embd));
+        ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f);

        // select experts
        ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok);
@ -1697,7 +1681,6 @@ public:
        ggml_cpy(ctx, v_cur_t, v_cache_view);
    }

-    // if max_alibi_bias > 0 then apply ALiBi
    struct ggml_tensor * llm_build_kqv(
            struct ggml_context * ctx,
             struct ggml_tensor * k_l,
@ -1716,7 +1699,7 @@ public:

        struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);

-        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, nullptr, kq_scale, 0.0f);

        // split cached v into n_head heads
        struct ggml_tensor * v =
@ -2014,7 +1997,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
        GGML_TYPE_Q6_K,
        GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
-        GGML_TYPE_IQ3_XXS,
+        GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S,
    };

    // unary ops
@ -2163,6 +2146,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10,  1}, 5));
    test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 10, 10}, 5));

+#if 0
    std::uniform_int_distribution<> dist_ne1(1, 50);
    int exponent = 1;
    while (exponent < (1 << 17)) {
@ -2171,14 +2155,29 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        for (int n = 0; n < 10; ++n) {
            int64_t ne0 = dist_ne0(rng);
            int64_t ne1 = dist_ne1(rng);
-            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}));
+            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, n/2 == 0, 0.1f, ne0 < 1000 ? 4.0f : 0.0f));
        }

        exponent <<= 1;
    }
+#endif
+    for (bool mask : {false, true}) {
+        for (float max_bias : {0.0f, 8.0f}) {
+            for (float scale : {1.0f, 0.1f}) {
+                for (int64_t ne0 : {16, 1024}) {
+                    for (int64_t ne1 : {16, 1024}) {
+                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
+                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
+                    }
+                }
+            }
+        }
+    }

-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, 0.1f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, 0.1f, true));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true,  0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 8.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true,  0.1f, 8.0f));

    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
        test_cases.emplace_back(new test_rope(type, {128,  32, 10, 1}, 128, 0, 512)); // llama 7B
@ -2193,7 +2192,6 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
        test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  32, 2, 512)); // neox (phi-2)
    }

-    test_cases.emplace_back(new test_alibi());
    test_cases.emplace_back(new test_concat(GGML_TYPE_F32));
    test_cases.emplace_back(new test_concat(GGML_TYPE_I32));

@ -2233,14 +2231,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    }
 #endif

+    // these tests are disabled to save execution time, but they can be handy for debugging
+#if 0
 #if !defined(__SANITIZE_THREAD__)
    // FIXME: these tests use too much memory with thread sanitizer
    test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024));
    //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336));
 #endif
-
-    // these tests are disabled to save execution time, but they can be handy for debugging
-#if 0
    test_cases.emplace_back(new test_llama(1));
    test_cases.emplace_back(new test_llama(2));
    test_cases.emplace_back(new test_falcon(1));
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@ -0,0 +1,64 @@
+#include <iostream>
+#include <string>
+#include <vector>
+#include <sstream>
+
+#undef NDEBUG
+#include <cassert>
+
+#include "llama.h"
+
+int main(void) {
+    llama_chat_message conversation[] = {
+        {"system", "You are a helpful assistant"},
+        {"user", "Hello"},
+        {"assistant", "Hi there"},
+        {"user", "Who are you"},
+        {"assistant", "   I am an assistant   "},
+        {"user", "Another question"},
+    };
+    size_t message_count = 6;
+    std::vector<std::string> templates = {
+        // teknium/OpenHermes-2.5-Mistral-7B
+        "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
+        // mistralai/Mistral-7B-Instruct-v0.2
+        "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+        // TheBloke/FusionNet_34Bx2_MoE-AWQ
+        "{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' '  + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
+        // bofenghuang/vigogne-2-70b-chat
+        "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' '  + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
+    };
+    std::vector<std::string> expected_substr = {
+        "<|im_start|>assistant\n   I am an assistant   <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant",
+        "[/INST]Hi there</s>[INST] Who are you [/INST]   I am an assistant   </s>[INST] Another question [/INST]",
+        "</s><s>[INST] Who are you [/INST]    I am an assistant    </s><s>[INST] Another question [/INST]",
+        "[/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
+    };
+    std::vector<char> formatted_chat(1024);
+    int32_t res;
+
+    // test invalid chat template
+    res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
+    assert(res < 0);
+
+    for (size_t i = 0; i < templates.size(); i++) {
+        std::string custom_template = templates[i];
+        std::string substr = expected_substr[i];
+        formatted_chat.resize(1024);
+        res = llama_chat_apply_template(
+            nullptr,
+            custom_template.c_str(),
+            conversation,
+            message_count,
+            true,
+            formatted_chat.data(),
+            formatted_chat.size()
+        );
+        formatted_chat.resize(res);
+        std::string output(formatted_chat.data(), formatted_chat.size());
+        std::cout << output << "\n-------------------------\n";
+        // expect the "formatted_chat" to contain pre-defined strings
+        assert(output.find(substr) != std::string::npos);
+    }
+    return 0;
+}
--- a/tests/test-grammar-parser.cpp
+++ b/tests/test-grammar-parser.cpp
@ -38,8 +38,8 @@ term  ::= [0-9]+)""";
        // pretty print error message before asserting
        if (expected_pair.first != key || expected_pair.second != value)
        {
-            fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
-            fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
+            fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
+            fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
            fprintf(stderr, "expected_pair != actual_pair\n");
        }

@ -96,9 +96,9 @@ term  ::= [0-9]+)""";
            // pretty print error message before asserting
            if (expected_element.type != element.type || expected_element.value != element.value)
            {
-                fprintf(stderr, "index: %d\n", index);
-                fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
-                fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
+                fprintf(stderr, "index: %u\n", index);
+                fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
+                fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value);
                fprintf(stderr, "expected_element != actual_element\n");
            }

@ -144,8 +144,8 @@ term  ::= [0-9]+)""";
        // pretty print error message before asserting
        if (expected_pair.first != key || expected_pair.second != value)
        {
-            fprintf(stderr, "expected_pair: %s, %d\n", expected_pair.first.c_str(), expected_pair.second);
-            fprintf(stderr, "actual_pair: %s, %d\n", key.c_str(), value);
+            fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
+            fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
            fprintf(stderr, "expected_pair != actual_pair\n");
        }

@ -235,9 +235,9 @@ term  ::= [0-9]+)""";
            // pretty print error message before asserting
            if (expected_element.type != element.type || expected_element.value != element.value)
            {
-                fprintf(stderr, "index: %d\n", index);
-                fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
-                fprintf(stderr, "actual_element: %d, %d\n", element.type, element.value);
+                fprintf(stderr, "index: %u\n", index);
+                fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
+                fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value);
                fprintf(stderr, "expected_element != actual_element\n");
            }

--- a/tests/test-llama-grammar.cpp
+++ b/tests/test-llama-grammar.cpp
@ -180,8 +180,8 @@ int main()
            if (expected_element.type != element->type || expected_element.value != element->value)
            {
                fprintf(stderr, "index: %d\n", index);
-                fprintf(stderr, "expected_element: %d, %d\n", expected_element.type, expected_element.value);
-                fprintf(stderr, "actual_element: %d, %d\n", element->type, element->value);
+                fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
+                fprintf(stderr, "actual_element: %d, %u\n", element->type, element->value);
                fprintf(stderr, "expected_element != actual_element\n");
            }

--- a/tests/test-model-load-cancel.cpp
+++ b/tests/test-model-load-cancel.cpp
@ -14,7 +14,7 @@ int main(int argc, char *argv[] ) {
    fprintf(stderr, "using '%s'\n", model_path);
    fclose(file);

-    llama_backend_init(false);
+    llama_backend_init();
    auto params = llama_model_params{};
    params.use_mmap = false;
    params.progress_callback = [](float progress, void * ctx){
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@ -61,7 +61,7 @@ int main(int argc, char **argv) {
    llama_model * model;
    llama_context * ctx;

-    llama_backend_init(false);
+    llama_backend_init();

    // load the vocab
    {
--- a/tests/test-tokenizer-0-llama.cpp
+++ b/tests/test-tokenizer-0-llama.cpp
@ -60,7 +60,7 @@ int main(int argc, char **argv) {
    llama_model * model;
    llama_context * ctx;

-    llama_backend_init(false);
+    llama_backend_init();

    // load the vocab
    {
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@ -4,13 +4,13 @@
 #include "console.h"

 #include <cassert>
+#include <codecvt>
 #include <cstdio>
 #include <cstring>
-#include <string>
-#include <codecvt>
-#include <map>
-#include <vector>
 #include <locale>
+#include <string>
+#include <thread>
+#include <vector>

 int main(int argc, char **argv) {
    if (argc < 2) {
@ -25,7 +25,7 @@ int main(int argc, char **argv) {
    llama_model * model;
    llama_context * ctx;

-    llama_backend_init(false);
+    llama_backend_init();

    // load the vocab
    {
@ -74,45 +74,46 @@ int main(int argc, char **argv) {
            }
        }
        catch (const std::invalid_argument &) {
-            fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
+            //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
        }
    }

-    for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
-        // NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters
-        if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) {
-            std::string str = " " + codepoint_to_utf8(cp);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-            std::string check = llama_detokenize_bpe(ctx, tokens);
-            if (str != check) {
-                fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                    __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-                return 3;
-            }
-        }
-    }
-    // Restrict to assigned unicode planes
-    // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
-    for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
-        std::string str = codepoint_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_bpe(ctx, tokens);
-        if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-            return 4;
-        }
-    }
-    for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
-        std::string str = codepoint_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_bpe(ctx, tokens);
-        if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-            return 4;
+    // unicode
+    {
+        const int nthread = std::thread::hardware_concurrency();
+
+        std::vector<std::thread> threads(nthread);
+
+        for (int i = 0; i < nthread; ++i) {
+            threads[i] = std::thread([i, nthread, ctx]() {
+                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
+                    if (!( // NOLINT
+                                (cp < 0x03       || cp >  0x05)   && cp != 0x0b && cp != 0x11 &&
+                                (cp < 0x13       || cp >  0x17)   && cp != 0x19 &&
+                                (cp < 0x1c       || cp >  0x1e)   &&
+                                (cp < 0xd800     || cp >  0xdfff) &&
+                                (cp < 0x00040000 || cp >= 0x000e0000)
+                        )) {
+                        continue;
+                    }
+
+                    std::string str = codepoint_to_utf8(cp);
+                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+                    std::string check = llama_detokenize_bpe(ctx, tokens);
+                    if (cp != 9601 && str != check) {
+                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                                cp, check.c_str(), check.length(), str.c_str(), str.length());
+                        std::exit(3);
+                    }
+                }
+            });
+        }
+
+        for (auto & t : threads) {
+            t.join();
        }
    }
+
    llama_free_model(model);
    llama_free(ctx);

--- a/tests/test-tokenizer-1-llama.cpp
+++ b/tests/test-tokenizer-1-llama.cpp
@ -4,13 +4,13 @@
 #include "console.h"

 #include <cassert>
+#include <codecvt>
 #include <cstdio>
 #include <cstring>
-#include <string>
-#include <codecvt>
-#include <map>
-#include <vector>
 #include <locale>
+#include <string>
+#include <thread>
+#include <vector>

 int main(int argc, char **argv) {
    if (argc < 2) {
@ -25,7 +25,7 @@ int main(int argc, char **argv) {
    llama_model * model;
    llama_context * ctx;

-    llama_backend_init(false);
+    llama_backend_init();

    // load the vocab
    {
@ -72,26 +72,33 @@ int main(int argc, char **argv) {
        }
    }

-    for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
-        if (cp < 0xd800 || cp > 0xdfff) {
-            std::string str = codepoint_to_utf8(cp);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-            std::string check = llama_detokenize_spm(ctx, tokens);
-            if (cp != 9601 && str != check) {
-                fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                    __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-                return 3;
-            }
+    // unicode
+    {
+        const int nthread = std::thread::hardware_concurrency();
+
+        std::vector<std::thread> threads(nthread);
+
+        for (int i = 0; i < nthread; ++i) {
+            threads[i] = std::thread([i, nthread, ctx]() {
+                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
+                    if (cp >= 0xd800 && cp <= 0xdfff) {
+                        continue;
+                    }
+
+                    std::string str = codepoint_to_utf8(cp);
+                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+                    std::string check = llama_detokenize_spm(ctx, tokens);
+                    if (cp != 9601 && str != check) {
+                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                                cp, check.c_str(), check.length(), str.c_str(), str.length());
+                        std::exit(3);
+                    }
+                }
+            });
        }
-    }
-    for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
-        std::string str = codepoint_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_spm(ctx, tokens);
-        if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-            return 4;
+
+        for (auto & t : threads) {
+            t.join();
        }
    }