This commit is contained in:
joshcarp 2024-05-06 15:14:54 -04:00
parent 8d2dead681
commit 98ba54e5ec
7 changed files with 5930 additions and 47 deletions

5412
debug.openelm-2.txt Normal file

File diff suppressed because it is too large


@@ -0,0 +1,5 @@
set(TARGET baby-llama)
add_executable(${TARGET} baby-llama.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -0,0 +1,40 @@
#include <cstdio>
#include <iostream>
#include "ggml.h"
int main() {
printf("split_test\n");
// Initialization
struct ggml_init_params params = {/*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false}; // reserve enough context memory for the 18*7*64 F32 tensor below
ggml_context *ctx = ggml_init(params);
// Tensor Creation (Analogous to the PyTorch code)
int64_t size = 18 * 7 * 64;
int64_t dims[4] = {1, 18, 7, 64};
ggml_tensor *tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, dims);
// Initialize tensor data (Note: Simplified for this example)
float* tensor_data = (float*) tensor->data;
for (int i = 0; i < size; i++) {
tensor_data[i] = (float) i;
printf("%f", tensor_data[i]);
}
printf("\n");
// Reshaping and Transpose
// ... (You'll need ggml equivalents of reshape and transpose)
// Splitting (We'll focus on this part)
int64_t num_q_heads = 12;
int64_t num_k_heads = 3;
int64_t num_v_heads = 3;
ggml_tensor *a = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
ggml_tensor *b = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
ggml_tensor *c = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
// Accessing elements (assuming ggml provides similar access)
float *a_data = (float*) a->data;
std::cout << a_data[0] << std::endl;
ggml_free(ctx);
return 0;
}
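
For reference, a minimal sketch of what the three ggml_view_3d calls above could look like once their placeholder ne/nb/offset arguments are filled in. It assumes ggml dimension order (ne0 = head_dim, ne1 = heads, ne2 = tokens) and the 12/3/3 query/key/value head split used in this file; the shape and variable names are assumptions for illustration, not part of the commit.

    #include <cstdio>
    #include <cstdint>
    #include "ggml.h"

    int main() {
        // enough context memory for one small F32 tensor plus view metadata
        struct ggml_init_params params = {/*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false};
        struct ggml_context * ctx = ggml_init(params);

        // fused QKV activation in ggml order: ne0 = head_dim, ne1 = total heads, ne2 = tokens
        const int64_t head_dim = 64, n_q = 12, n_k = 3, n_v = 3, n_tokens = 7;
        struct ggml_tensor * qkv = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_q + n_k + n_v, n_tokens);

        // split along ne1 (the head dimension): each view keeps the parent's strides,
        // and the last argument is a byte offset into the parent's data
        struct ggml_tensor * q = ggml_view_3d(ctx, qkv, head_dim, n_q, n_tokens, qkv->nb[1], qkv->nb[2], 0);
        struct ggml_tensor * k = ggml_view_3d(ctx, qkv, head_dim, n_k, n_tokens, qkv->nb[1], qkv->nb[2],  n_q        * qkv->nb[1]);
        struct ggml_tensor * v = ggml_view_3d(ctx, qkv, head_dim, n_v, n_tokens, qkv->nb[1], qkv->nb[2], (n_q + n_k) * qkv->nb[1]);

        printf("q heads: %lld, k heads: %lld, v heads: %lld (head_dim %lld, tokens %lld)\n",
               (long long) q->ne[1], (long long) k->ne[1], (long long) v->ne[1],
               (long long) q->ne[0], (long long) q->ne[2]);

        ggml_free(ctx);
        return 0;
    }

A ggml view shares the parent's buffer, so the byte offset must land on an element boundary of the parent tensor.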

203
llama.cpp

@@ -6320,6 +6320,42 @@ static void llm_build_kv_store(
(kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il);
// important: storing RoPE-ed version of K in the KV cache!
// ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
// ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
}
static void llm_build_kv_store2(
struct ggml_context * ctx,
const llama_hparams & hparams,
const llama_kv_cache & kv,
struct ggml_cgraph * graph,
struct ggml_tensor * k_cur,
struct ggml_tensor * v_cur,
int64_t n_ctx,
int32_t n_tokens,
int32_t kv_head,
const llm_build_cb & cb,
int64_t il) {
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa()/hparams.n_head_kv;
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa()/hparams.n_head_kv;
GGML_ASSERT(kv.size == n_ctx);
// compute the transposed [n_tokens, n_embd] V matrix
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
cb(v_cur_t, "v_cur_t", il);
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], ggml_nbytes(k_cur)/4,
(ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
cb(k_cache_view, "k_cache_view", il);
struct ggml_tensor * v_cache_view = ggml_view_1d(ctx, kv.v_l[il], ggml_nbytes(v_cur)/4,
// ( n_ctx)*ggml_element_size(kv.v_l[il]),
(kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il);
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
@@ -6409,12 +6445,16 @@ static struct ggml_tensor * llm_build_ffn(
} break;
case LLM_FFN_SILU2:
{
struct ggml_tensor * one = ggml_view_2d(ctx, cur, cur->ne[0]/2, cur->ne[1], cur->nb[1], 0);
int offset = sizeof(float) * (cur->ne[0]/2) * (cur->ne[1]);
struct ggml_tensor * two = ggml_view_2d(ctx, cur, cur->ne[0]/2, cur->ne[1], cur->nb[1], offset);
cur = ggml_mul(ctx, ggml_silu(ctx, one), two);
// Project to 4h. If using SwiGLU, double the output width; see https://arxiv.org/pdf/2002.05202.pdf
int64_t split_point = cur->ne[0] / 2;
struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
x0 = ggml_silu(ctx, x0);
cb(cur, "ffn_silu", il);
cur = ggml_mul(ctx, x0, x1);
cb(cur, "ffn_mul", il);
} break;
case LLM_FFN_GELU:
{
@@ -6589,7 +6629,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il);
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_OPENELM) {
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
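
For context, the LLM_FFN_SILU2 branch added above implements the SwiGLU gating from the referenced paper: the up-projection emits an activation of width 2*d, it is split in half, one half goes through SiLU, and the two halves are multiplied elementwise. A self-contained sketch of the same split-and-gate pattern (the width 2*d, token count, and variable names are assumptions for illustration):

    #include <cstdio>
    #include <cstdint>
    #include "ggml.h"

    int main() {
        struct ggml_init_params params = {/*.mem_size =*/ 64*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false};
        struct ggml_context * ctx = ggml_init(params);

        const int64_t d = 128, n_tokens = 7;
        // activation after the up projection: 2*d values per token
        struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2*d, n_tokens);

        // first and second halves of each row; the second half starts d elements into the row
        struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, d, n_tokens, cur->nb[1], 0));
        struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, d, n_tokens, cur->nb[1], d * ggml_element_size(cur)));

        // SwiGLU: silu(x0) * x1
        struct ggml_tensor * out = ggml_mul(ctx, ggml_silu(ctx, x0), x1);
        printf("gated output: %lld x %lld\n", (long long) out->ne[0], (long long) out->ne[1]);

        ggml_free(ctx);
        return 0;
    }
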
@@ -6706,6 +6746,44 @@ static struct ggml_tensor * llm_build_kv(
return cur;
}
static struct ggml_tensor * llm_build_kv2(
struct ggml_context * ctx,
const llama_model & model,
const llama_hparams & hparams,
const llama_kv_cache & kv,
struct ggml_cgraph * graph,
struct ggml_tensor * wo,
struct ggml_tensor * wo_b,
struct ggml_tensor * k_cur,
struct ggml_tensor * v_cur,
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
struct ggml_tensor * kq_pos,
int64_t n_ctx,
int32_t n_tokens,
int32_t kv_head,
int32_t n_kv,
float kq_scale,
const llm_build_cb & cb,
int il) {
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand(graph, q_cur);
ggml_build_forward_expand(graph, k_cur);
ggml_build_forward_expand(graph, v_cur);
llm_build_kv_store2(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
struct ggml_tensor * cur;
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il);
return cur;
}
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
@@ -10735,7 +10813,6 @@ struct llm_build_context {
llama_hparams modified_hparams(hparams);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
// struct ggml_tensor * KQ_mask = build_inp_KQ_mask2(n_kv);
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -10745,6 +10822,7 @@ struct llm_build_context {
// This doesn't work at the moment, comment out to test
const int64_t n_head_k = num_kv_heads[il];
const int64_t n_head_v = num_kv_heads[il];
const int64_t n_head_q = num_query_heads[il];
const int64_t n_head_kv = n_head_k+n_head_v;
const int64_t n_head = n_head_kv+ num_query_heads[il];
// const int64_t n_kv = (num_kv_heads[il]+num_kv_heads[il])*n_embd_head; // This makes asserts fail
@@ -10752,11 +10830,21 @@ struct llm_build_context {
modified_hparams.n_head = 4*n_head_k; // somehow this works. Some places expect this to be groups*n_head_kv instead of n_head; maybe that is the definition somewhere.
modified_hparams.n_head_kv = n_head_kv;
const int64_t n_embd_gqa = n_embd_head * n_head;
const int64_t n_embd_k_gqa = modified_hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = modified_hparams.n_embd_v_gqa();
struct ggml_tensor * attn_q_norm = model.layers[il].attn_q_norm;
cb(attn_q_norm, "attn_q_norm", il);
struct ggml_tensor * attn_k_norm = model.layers[il].attn_k_norm;
cb(attn_k_norm, "attn_k_norm", il);
// const int64_t n_embd_k_gqa = modified_hparams.n_embd_k_gqa();
// const int64_t n_embd_v_gqa = modified_hparams.n_embd_v_gqa();
// self-attention
{
cb(model.layers[il].attn_norm, "attn_norm.weight", il);
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, modified_hparams,
model.layers[il].attn_norm,
NULL,
@@ -10766,30 +10854,61 @@ struct llm_build_context {
struct ggml_tensor * Qcur = nullptr;
struct ggml_tensor * Kcur = nullptr;
struct ggml_tensor * Vcur = nullptr;
cb(model.layers[il].wqkv, "qkv_proj_weight", il);
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); // model.layers[il].wqkv -> might not be all 3 qkv
cb(cur, "wqkv", il);
// model.layers[il].wqkv has dimensionality of
// [model_dim][(n_head_k+n_head_v+n_head_q)*head_dim]
// In most other impls, this is [model_dim][3*above]
// This matches up with the dimensions of the huggingface version
Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, num_query_heads[il], cur->nb[1], cur->nb[2], 0));
Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head,n_tokens, n_head_k, cur->nb[1], cur->nb[2], 1 * sizeof(float) * (n_embd_head)));
Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head,n_tokens, n_head_k, cur->nb[1], cur->nb[2], 2 * sizeof(float) * (n_embd_head)));
cb(cur, "qkv", il);
cur = ggml_reshape_3d(ctx0, cur, n_embd_head, n_tokens, n_head);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// TODO: these need to be calculated correctly
/*
struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
cb(tmpqkv, "tmpqkv", il);
struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
cb(tmpqkv_perm, "tmpqkv", il);
struct ggml_tensor * tmpq = ggml_view_3d(
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
ggml_element_size(tmpqkv_perm) * n_embd_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
0
);
cb(tmpq, "tmpq", il);
struct ggml_tensor * tmpk = ggml_view_3d(
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
ggml_element_size(tmpqkv_perm) * n_embd_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
);
*/
size_t elemsize = ggml_element_size(cur);
Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, num_query_heads[il], cur->nb[1], cur->nb[2]*num_query_heads[il], 0));
cb(Qcur, "queries", il);
Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, n_head_k, cur->nb[1], cur->nb[2]*n_head_k, cur->nb[2]*num_query_heads[il]));
cb(Kcur, "keys", il);
Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, n_head_q, cur->nb[1], cur->nb[2]*n_head_v, cur->nb[2]*(num_query_heads[il]+n_head_k)));
cb(Vcur, "values", il);
// Q/K Layernorm
cb(Qcur, "queries", il);
Qcur = llm_build_norm(ctx0, Qcur, modified_hparams,
model.layers[il].attn_q_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(Qcur, "Qcur", il);
Kcur = llm_build_norm(ctx0, Kcur, modified_hparams,
model.layers[il].attn_k_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(Kcur, "Kcur", il);
cb(Kcur, "keys", il);
cb(Vcur, "Vcur", il);
// reshape, Qcur -> [64][12(first layer)][n_tokens]
// reshape, Kcur -> [64][3(first layer)][n_tokens]
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, num_query_heads[il], n_tokens);
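
To make the wqkv layout comment above concrete, here is a worked example using the first-layer head counts that appear in this diff (12 query, 3 key, 3 value heads, head_dim 64). The Q-then-K-then-V ordering mirrors the view offsets above; the exact numbers are illustrative:

    #include <cstdio>
    #include <cstdint>

    int main() {
        // one fused QKV row produced by wqkv, in F32 elements:
        //   [ Q: n_head_q*head_dim | K: n_head_k*head_dim | V: n_head_v*head_dim ]
        const int64_t head_dim = 64;
        const int64_t n_head_q = 12, n_head_k = 3, n_head_v = 3;              // first layer in this diff
        const int64_t row      = (n_head_q + n_head_k + n_head_v) * head_dim; // 1152 floats per token
        const size_t  k_off    =  n_head_q             * head_dim * sizeof(float); // 3072 bytes
        const size_t  v_off    = (n_head_q + n_head_k) * head_dim * sizeof(float); // 3840 bytes
        printf("row width: %lld floats, K byte offset: %zu, V byte offset: %zu\n", (long long) row, k_off, v_off);
        return 0;
    }
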
@@ -10799,32 +10918,22 @@ struct llm_build_context {
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Qcur, "queries", il);
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
cb(Qcur, "Qcur", il);
// Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
// cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom(
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
int64_t nev[GGML_MAX_DIMS] = {2*Vcur->ne[0], Vcur->ne[1], Vcur->ne[2], Vcur->ne[3]};
int64_t nev[GGML_MAX_DIMS] = {Vcur->ne[0], Vcur->ne[1], 4*Vcur->ne[2], Vcur->ne[3]};
struct ggml_tensor * Vcur2 = ggml_new_tensor(ctx0, Vcur->type, GGML_MAX_DIMS, nev);
Vcur2->grad = ggml_dup_tensor(ctx0, Vcur);
Vcur2 = ggml_reshape_2d(ctx0, Vcur2, modified_hparams.n_embd_k_gqa(), n_tokens);
int64_t nek[GGML_MAX_DIMS] = {2*Kcur->ne[0], Kcur->ne[1], Kcur->ne[2], Kcur->ne[3]};
struct ggml_tensor * Kcur2 = ggml_new_tensor(ctx0, Kcur->type, GGML_MAX_DIMS, nek);
Kcur2->grad = ggml_dup_tensor(ctx0, Kcur);
Kcur2 = ggml_reshape_2d(ctx0, Kcur2, modified_hparams.n_embd_k_gqa(), n_tokens);
cb(Kcur, "Kcur", il);
// Attempt at transcribing from the Python implementation:
// cur = ggml_flash_attn(ctx0, Qcur, Kcur, Vcur, true);
// cur = ggml_transpose(ctx0, cur);
// cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// cur = ggml_cont(ctx0, cur);
// cur = ggml_reshape_2d(ctx0, cur, n_embd_head_k*(2*n_head_kv), n_tokens);
// cur = ggml_mul_mat(ctx0, cur, model.layers[il].wo);
// cur = ggml_transpose(ctx0, cur);
Vcur2 = ggml_repeat(ctx0, Vcur, Vcur2);
int64_t nek[GGML_MAX_DIMS] = {Kcur->ne[0], Kcur->ne[1], 4*Kcur->ne[2], Kcur->ne[3]};
struct ggml_tensor * Kcur2 = ggml_new_tensor(ctx0, Vcur->type, GGML_MAX_DIMS, nek);
Kcur2 = ggml_repeat(ctx0, Kcur, Kcur2);
cur = llm_build_kv(ctx0, model, modified_hparams, kv_self, gf,
model.layers[il].wo, NULL,
@@ -10837,11 +10946,11 @@ struct llm_build_context {
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
}
cur = ggml_add(ctx0, cur, residual);
residual = cur;
cur = llm_build_norm(ctx0, cur, modified_hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
residual = cur;
// FF
{
@@ -10860,19 +10969,19 @@ struct llm_build_context {
LLM_FFN_SILU2, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
residual = cur;
cur = ggml_add(ctx0, residual, cur);
cb(cur, "l_out", il);
inpL = cur;
}
cur = llm_build_norm(ctx0, inpL, modified_hparams,
model.output_norm,
NULL,
LLM_NORM_RMS, cb, -1);
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);
@@ -15440,7 +15549,7 @@ struct llama_context_params llama_context_default_params() {
/*.type_v =*/ GGML_TYPE_F16,
/*.logits_all =*/ false,
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.offload_kqv =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -15606,7 +15715,7 @@ struct llama_context * llama_new_context_with_model(
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.offload_kqv = false;
cparams.pooling_type = params.pooling_type;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
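
One more note on the attention block above: the Kcur2/Vcur2 tensors materialise extra key/value heads with ggml_repeat so that 3 KV heads can serve 12 query heads (a group size of 4). A minimal standalone sketch of that repeat pattern, with shapes chosen for illustration rather than copied from the diff:

    #include <cstdio>
    #include <cstdint>
    #include "ggml.h"

    int main() {
        struct ggml_init_params params = {/*.mem_size =*/ 64*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false};
        struct ggml_context * ctx = ggml_init(params);

        const int64_t head_dim = 64, n_tokens = 7, n_head_kv = 3, n_group = 4; // 3 kv heads shared by 12 query heads

        // K with 3 heads, laid out as [head_dim, n_tokens, n_head_kv]
        struct ggml_tensor * k  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_tokens, n_head_kv);
        // target shape with the kv heads repeated 4x along ne2, matching the 4*Kcur->ne[2] above
        struct ggml_tensor * k2 = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_tokens, n_group*n_head_kv);
        // ggml_repeat tiles k until it fills k2's shape
        k2 = ggml_repeat(ctx, k, k2);

        printf("repeated K: %lld x %lld x %lld\n", (long long) k2->ne[0], (long long) k2->ne[1], (long long) k2->ne[2]);

        ggml_free(ctx);
        return 0;
    }

Because ggml_repeat tiles the source to the target shape, whether this matches the intended head grouping depends on how Kcur is laid out at that point in the graph.
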

6
scratch_21.sh Normal file

@@ -0,0 +1,6 @@
eval-callback \
--hf-repo models/openelm-small/ \
--model ggml-model-f16.gguf \
--prompt hello \
--seed 42 \
-ngl 0

40
split_test.cpp Normal file

@@ -0,0 +1,40 @@
#include <cstdio>
#include <iostream>
#include "ggml.h"
int main() {
printf("split_test\n");
// Initialization
struct ggml_init_params params = {/*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false}; // reserve enough context memory for the 18*7*64 F32 tensor below
ggml_context *ctx = ggml_init(params);
// Tensor Creation (Analogous to the PyTorch code)
int64_t size = 18 * 7 * 64;
int64_t dims[4] = {1, 18, 7, 64};
ggml_tensor *tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, dims);
// Initialize tensor data (Note: Simplified for this example)
float* tensor_data = (float*) tensor->data;
for (int i = 0; i < size; i++) {
tensor_data[i] = (float) i;
printf("%f", tensor_data[i]);
}
printf("\n");
// Reshaping and Transpose
// ... (You'll need ggml equivalents of reshape and transpose)
// Splitting (We'll focus on this part)
int64_t num_q_heads = 12;
int64_t num_k_heads = 3;
int64_t num_v_heads = 3;
ggml_tensor *a = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
ggml_tensor *b = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
ggml_tensor *c = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
// Accessing elements (assuming ggml provides similar access)
float *a_data = (float*) a->data;
std::cout << a_data[0] << std::endl;
ggml_free(ctx);
return 0;
}

271
tests/test-split.cpp Normal file

@@ -0,0 +1,271 @@
#include "llama.h"
#include "common.h"
#include "console.h"
#include <cstdio>
#include <string>
#include <map>
#include <vector>
#include <fstream>
//static const std::map<std::string, std::vector<llama_token>> & k_tests() {
// static std::map<std::string, std::vector<llama_token>> _k_tests = {
// { "" , { }, },
// { " " , { 220, }, },
// { " " , { 256, }, },
// { " " , { 262, }, },
// { "\t" , { 197, }, },
// { "\n" , { 198, }, },
// { "\n\n" , { 271, }, },
// { "\n\n\n" , { 1432, }, },
// { "\t\n" , { 1602, }, },
// { "Hello world" , { 9906, 1917, }, },
// { " Hello world" , { 22691, 1917, }, },
// { "Hello World" , { 9906, 4435, }, },
// { " Hello World" , { 22691, 4435, }, },
// { " Hello World!" , { 22691, 4435, 0, }, },
// { "Hello, world!" , { 9906, 11, 1917, 0, }, },
// { " Hello, world!" , { 22691, 11, 1917, 0, }, },
// { " this is 🦙.cpp" , { 420, 374, 11410, 99, 247, 13, 11055, }, },
// { "w048 7tuijk dsdfhu" , { 86, 23904, 220, 22, 83, 2005, 42908, 11729, 3013, 17156, }, },
// { "нещо на Български" , { 79862, 102118, 13373, 64571, 34694, 3114, 112203, 80112, }, },
// { "កាន់តែពិសេសអាចខលចេញ" , { 21549, 222, 98629, 241, 45358, 233, 21549, 237, 45358, 224, 21549, 244, 21549, 115, 21549, 253, 45358, 223, 21549, 253, 21549, 95, 98629, 227, 21549, 223, 21549, 249, 21549, 227, 45358, 223, 21549, 231, }, },
// { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 9468, 248, 222, 320, 8416, 8, 27623, 114, 102470, 9468, 234, 104, 31643, 320, 36773, 100166, 98634, 8, 26602, 227, 320, 3323, 43465, 430, 706, 1202, 1866, 4037, 8, }, },
// { "Hello" , { 9906, }, },
// { " Hello" , { 22691, }, },
// { " Hello" , { 220, 22691, }, },
// { " Hello" , { 256, 22691, }, },
// { " Hello" , { 262, 22691, }, },
// { " Hello\n Hello" , { 262, 22691, 198, 262, 22691, }, },
// { " (" , { 320, }, },
// { "\n =" , { 198, 284, }, },
// { "' era" , { 6, 11639, }, },
// { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天", { 9906, 11, 379, 65948, 0, 2650, 527, 499, 27623, 223, 949, 37046, 101067, 19000, 23182, 102301, 9263, 18136, 16, 36827, 21909, }, },
// { "3" , { 18, }, },
// { "33" , { 1644, }, },
// { "333" , { 8765, }, },
// { "3333" , { 8765, 18, }, },
// { "33333" , { 8765, 1644, }, },
// { "333333" , { 8765, 8765, }, },
// { "3333333" , { 8765, 8765, 18, }, },
// { "33333333" , { 8765, 8765, 1644, }, },
// { "333333333" , { 8765, 8765, 8765, }, },
// };
//
// return _k_tests;
//}
static std::map<std::string, std::vector<llama_token>> read_tests(const std::string & fname_inp, const std::string & fname_out) {
std::map<std::string, std::vector<llama_token>> tests;
std::ifstream ifs_inp(fname_inp);
if (!ifs_inp) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str());
return tests;
}
std::string sraw((std::istreambuf_iterator<char>(ifs_inp)), std::istreambuf_iterator<char>());
std::ifstream ifs_out(fname_out);
if (!ifs_out) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
return tests;
}
std::vector<std::string> sout;
for (std::string line; std::getline(ifs_out, line);) {
sout.push_back(line);
}
const std::string sep = "\n__ggml_vocab_test__\n";
std::vector<std::string> sinp;
size_t pos = 0;
while (pos < sraw.size()) {
const size_t next = sraw.find(sep, pos);
if (next == std::string::npos) {
sinp.push_back(sraw.substr(pos));
break;
}
sinp.push_back(sraw.substr(pos, next - pos));
pos = next + sep.size();
}
if (sinp.size() != sout.size()) {
fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__);
return tests;
}
for (size_t i = 0; i < sinp.size(); ++i) {
const std::string & s = sinp[i];
const std::string & o = string_strip(sout[i]);
std::vector<llama_token> toks;
size_t pos = 0;
while (pos < o.size()) {
size_t next = o.find(' ', pos);
if (next == std::string::npos) {
next = o.size();
}
const std::string stok = o.substr(pos, next - pos);
toks.push_back(std::stoi(stok));
pos = next + 1;
}
tests[s] = toks;
}
return tests;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
const std::string fname_inp = fname + ".inp";
const std::string fname_out = fname + ".out";
std::string fname_text;
if (argc > 2) {
fname_text = argv[2];
}
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init();
// load the vocab
{
auto mparams = llama_model_default_params();
mparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
auto cparams = llama_context_default_params();
ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
bool success = true;
const auto k_tests = read_tests(fname_inp, fname_out);
if (k_tests.empty()) {
fprintf(stderr, "%s : error: no tests found\n", __func__);
return 1;
}
const bool add_special = false;
for (const auto & test_kv : k_tests) {
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special);
printf("\n");
printf("src: '%s'\n", test_kv.first.c_str());
printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
printf("tok: ");
for (const auto & tok : res) {
printf("%d ", tok);
}
printf("\n");
bool correct = res.size() == test_kv.second.size();
for (int i = 0; i < (int) res.size() && correct; ++i) {
if (test_kv.second[i] != res[i]) {
correct = false;
}
}
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
llama_detokenize_bpe(ctx, res).c_str(),
llama_detokenize_bpe(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
}
fprintf(stderr, "\n");
success = false;
}
}
if (!fname_text.empty()) {
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
std::string text;
{
std::ifstream ifs(fname_text);
if (!ifs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
return 1;
}
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
}
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
const std::vector<llama_token> res = llama_tokenize(ctx, text, add_special);
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
{
const std::string fname_out = fname_text + ".tokcpp";
std::ofstream ofs(fname_out);
if (!ofs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
return 1;
}
for (const auto & tok : res) {
ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
}
}
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
}
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
printf("\n");
printf("Tests %s\n", success ? "passed" : "failed");
return success ? 0 : 3;
}