This commit is contained in:
joshcarp 2024-05-06 15:14:54 -04:00
parent 8d2dead681
commit 98ba54e5ec
7 changed files with 5930 additions and 47 deletions

5412
debug.openelm-2.txt Normal file

File diff suppressed because it is too large


@@ -0,0 +1,5 @@
set(TARGET baby-llama)
add_executable(${TARGET} baby-llama.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -0,0 +1,40 @@
#include <cstdio>
#include <iostream>
#include "ggml.h"
int main() {
printf("split_test\n");
// Initialization
struct ggml_init_params params = {/*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false}; // reserve enough context memory for the 18*7*64 F32 tensor below
ggml_context *ctx = ggml_init(params);
// Tensor Creation (Analogous to the PyTorch code)
int64_t size = 18 * 7 * 64;
int64_t dims[4] = {1, 18, 7, 64};
ggml_tensor *tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, dims);
// Initialize tensor data (Note: Simplified for this example)
float* tensor_data = (float*) tensor->data;
for (int i = 0; i < size; i++) {
tensor_data[i] = (float) i;
printf("%f", tensor_data[i]);
}
printf("\n");
// Reshaping and Transpose
// ... (You'll need ggml equivalents of reshape and transpose)
// Splitting (We'll focus on this part)
int64_t num_q_heads = 12;
int64_t num_k_heads = 3;
int64_t num_v_heads = 3;
ggml_tensor *a = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
ggml_tensor *b = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
ggml_tensor *c = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
// Accessing elements (assuming ggml provides similar access)
float *a_data = (float*) a->data;
std::cout << a_data[0] << std::endl;
ggml_free(ctx);
return 0;
}
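
For reference, a minimal sketch of what the three ggml_view_3d calls above could look like once their placeholder ne/nb/offset arguments are filled in. It assumes ggml dimension order (ne0 = head_dim, ne1 = heads, ne2 = tokens) and the 12/3/3 query/key/value head split used in this file; the shape and variable names are assumptions for illustration, not part of the commit.

    #include <cstdio>
    #include <cstdint>
    #include "ggml.h"

    int main() {
        // enough context memory for one small F32 tensor plus view metadata
        struct ggml_init_params params = {/*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false};
        struct ggml_context * ctx = ggml_init(params);

        // fused QKV activation in ggml order: ne0 = head_dim, ne1 = total heads, ne2 = tokens
        const int64_t head_dim = 64, n_q = 12, n_k = 3, n_v = 3, n_tokens = 7;
        struct ggml_tensor * qkv = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_q + n_k + n_v, n_tokens);

        // split along ne1 (the head dimension): each view keeps the parent's strides,
        // and the last argument is a byte offset into the parent's data
        struct ggml_tensor * q = ggml_view_3d(ctx, qkv, head_dim, n_q, n_tokens, qkv->nb[1], qkv->nb[2], 0);
        struct ggml_tensor * k = ggml_view_3d(ctx, qkv, head_dim, n_k, n_tokens, qkv->nb[1], qkv->nb[2],  n_q        * qkv->nb[1]);
        struct ggml_tensor * v = ggml_view_3d(ctx, qkv, head_dim, n_v, n_tokens, qkv->nb[1], qkv->nb[2], (n_q + n_k) * qkv->nb[1]);

        printf("q heads: %lld, k heads: %lld, v heads: %lld (head_dim %lld, tokens %lld)\n",
               (long long) q->ne[1], (long long) k->ne[1], (long long) v->ne[1],
               (long long) q->ne[0], (long long) q->ne[2]);

        ggml_free(ctx);
        return 0;
    }

A ggml view shares the parent's buffer, so the byte offset must land on an element boundary of the parent tensor.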

203
llama.cpp

@@ -6320,6 +6320,42 @@ static void llm_build_kv_store(
(kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il);
// important: storing RoPE-ed version of K in the KV cache!
// ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
// ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
}
static void llm_build_kv_store2(
struct ggml_context * ctx,
const llama_hparams & hparams,
const llama_kv_cache & kv,
struct ggml_cgraph * graph,
struct ggml_tensor * k_cur,
struct ggml_tensor * v_cur,
int64_t n_ctx,
int32_t n_tokens,
int32_t kv_head,
const llm_build_cb & cb,
int64_t il) {
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa()/hparams.n_head_kv;
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa()/hparams.n_head_kv;
GGML_ASSERT(kv.size == n_ctx);
// compute the transposed [n_tokens, n_embd] V matrix
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
cb(v_cur_t, "v_cur_t", il);
struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], ggml_nbytes(k_cur)/4,
(ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
cb(k_cache_view, "k_cache_view", il);
struct ggml_tensor * v_cache_view = ggml_view_1d(ctx, kv.v_l[il], ggml_nbytes(v_cur)/4,
// ( n_ctx)*ggml_element_size(kv.v_l[il]),
(kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il);
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
@@ -6409,12 +6445,16 @@ static struct ggml_tensor * llm_build_ffn(
} break;
case LLM_FFN_SILU2:
{
struct ggml_tensor * one = ggml_view_2d(ctx, cur, cur->ne[0]/2, cur->ne[1], cur->nb[1], 0);
int offset = sizeof(float) * (cur->ne[0]/2) * (cur->ne[1]);
struct ggml_tensor * two = ggml_view_2d(ctx, cur, cur->ne[0]/2, cur->ne[1], cur->nb[1], offset);
cur = ggml_mul(ctx, ggml_silu(ctx, one), two);
// Project to 4h. If using SwiGLU, double the output width; see https://arxiv.org/pdf/2002.05202.pdf
int64_t split_point = cur->ne[0] / 2;
struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
x0 = ggml_silu(ctx, x0);
cb(cur, "ffn_silu", il);
cur = ggml_mul(ctx, x0, x1);
cb(cur, "ffn_mul", il);
} break;
case LLM_FFN_GELU:
{
@@ -6589,7 +6629,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il);
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_OPENELM) {
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
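
For context, the LLM_FFN_SILU2 branch added above implements the SwiGLU gating from the referenced paper: the up-projection emits an activation of width 2*d, it is split in half, one half goes through SiLU, and the two halves are multiplied elementwise. A self-contained sketch of the same split-and-gate pattern (the width 2*d, token count, and variable names are assumptions for illustration):

    #include <cstdio>
    #include <cstdint>
    #include "ggml.h"

    int main() {
        struct ggml_init_params params = {/*.mem_size =*/ 64*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false};
        struct ggml_context * ctx = ggml_init(params);

        const int64_t d = 128, n_tokens = 7;
        // activation after the up projection: 2*d values per token
        struct ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2*d, n_tokens);

        // first and second halves of each row; the second half starts d elements into the row
        struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, d, n_tokens, cur->nb[1], 0));
        struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, d, n_tokens, cur->nb[1], d * ggml_element_size(cur)));

        // SwiGLU: silu(x0) * x1
        struct ggml_tensor * out = ggml_mul(ctx, ggml_silu(ctx, x0), x1);
        printf("gated output: %lld x %lld\n", (long long) out->ne[0], (long long) out->ne[1]);

        ggml_free(ctx);
        return 0;
    }
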
@@ -6706,6 +6746,44 @@ static struct ggml_tensor * llm_build_kv(
return cur;
}
static struct ggml_tensor * llm_build_kv2(
struct ggml_context * ctx,
const llama_model & model,
const llama_hparams & hparams,
const llama_kv_cache & kv,
struct ggml_cgraph * graph,
struct ggml_tensor * wo,
struct ggml_tensor * wo_b,
struct ggml_tensor * k_cur,
struct ggml_tensor * v_cur,
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
struct ggml_tensor * kq_pos,
int64_t n_ctx,
int32_t n_tokens,
int32_t kv_head,
int32_t n_kv,
float kq_scale,
const llm_build_cb & cb,
int il) {
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand(graph, q_cur);
ggml_build_forward_expand(graph, k_cur);
ggml_build_forward_expand(graph, v_cur);
llm_build_kv_store2(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);
struct ggml_tensor * cur;
cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il);
return cur;
}
struct llm_build_context {
const llama_model & model;
llama_context & lctx;
@@ -10735,7 +10813,6 @@ struct llm_build_context {
llama_hparams modified_hparams(hparams);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
// struct ggml_tensor * KQ_mask = build_inp_KQ_mask2(n_kv);
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -10745,6 +10822,7 @@ struct llm_build_context {
// This doesn't work at the moment, comment out to test
const int64_t n_head_k = num_kv_heads[il];
const int64_t n_head_v = num_kv_heads[il];
const int64_t n_head_q = num_query_heads[il];
const int64_t n_head_kv = n_head_k+n_head_v;
const int64_t n_head = n_head_kv+ num_query_heads[il];
// const int64_t n_kv = (num_kv_heads[il]+num_kv_heads[il])*n_embd_head; // This makes asserts fail
@@ -10752,11 +10830,21 @@ struct llm_build_context {
modified_hparams.n_head = 4*n_head_k; // somehow this works. Some places expect this to be groups*n_head_kv instead of n_head; maybe that is the definition somewhere.
modified_hparams.n_head_kv = n_head_kv;
const int64_t n_embd_gqa = n_embd_head * n_head;
const int64_t n_embd_k_gqa = modified_hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = modified_hparams.n_embd_v_gqa();
struct ggml_tensor * attn_q_norm = model.layers[il].attn_q_norm;
cb(attn_q_norm, "attn_q_norm", il);
struct ggml_tensor * attn_k_norm = model.layers[il].attn_k_norm;
cb(attn_k_norm, "attn_k_norm", il);
// const int64_t n_embd_k_gqa = modified_hparams.n_embd_k_gqa();
// const int64_t n_embd_v_gqa = modified_hparams.n_embd_v_gqa();
// self-attention
{
cb(model.layers[il].attn_norm, "attn_norm.weight", il);
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, modified_hparams,
model.layers[il].attn_norm,
NULL,
@@ -10766,30 +10854,61 @@ struct llm_build_context {
struct ggml_tensor * Qcur = nullptr;
struct ggml_tensor * Kcur = nullptr;
struct ggml_tensor * Vcur = nullptr;
cb(model.layers[il].wqkv, "qkv_proj_weight", il);
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); // model.layers[il].wqkv -> might not be all 3 qkv
cb(cur, "wqkv", il);
// model.layers[il].wqkv has dimensionality of
// [model_dim][(n_head_k+n_head_v+n_head_q)*head_dim]
// In most other impls, this is [model_dim][3*above]
// This matches up with the dimensions of the huggingface version
Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, num_query_heads[il], cur->nb[1], cur->nb[2], 0));
Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head,n_tokens, n_head_k, cur->nb[1], cur->nb[2], 1 * sizeof(float) * (n_embd_head)));
Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head,n_tokens, n_head_k, cur->nb[1], cur->nb[2], 2 * sizeof(float) * (n_embd_head)));
cb(cur, "qkv", il);
cur = ggml_reshape_3d(ctx0, cur, n_embd_head, n_tokens, n_head);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// TODO: these need to be calculated correctly
/*
struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
cb(tmpqkv, "tmpqkv", il);
struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
cb(tmpqkv_perm, "tmpqkv", il);
struct ggml_tensor * tmpq = ggml_view_3d(
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
ggml_element_size(tmpqkv_perm) * n_embd_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
0
);
cb(tmpq, "tmpq", il);
struct ggml_tensor * tmpk = ggml_view_3d(
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
ggml_element_size(tmpqkv_perm) * n_embd_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
);
*/
size_t elemsize = ggml_element_size(cur);
Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, num_query_heads[il], cur->nb[1], cur->nb[2]*num_query_heads[il], 0));
cb(Qcur, "queries", il);
Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, n_head_k, cur->nb[1], cur->nb[2]*n_head_k, cur->nb[2]*num_query_heads[il]));
cb(Kcur, "keys", il);
Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, n_head_q, cur->nb[1], cur->nb[2]*n_head_v, cur->nb[2]*(num_query_heads[il]+n_head_k)));
cb(Vcur, "values", il);
// Q/K Layernorm
cb(Qcur, "queries", il);
Qcur = llm_build_norm(ctx0, Qcur, modified_hparams,
model.layers[il].attn_q_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(Qcur, "Qcur", il);
Kcur = llm_build_norm(ctx0, Kcur, modified_hparams,
model.layers[il].attn_k_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(Kcur, "Kcur", il);
cb(Kcur, "keys", il);
cb(Vcur, "Vcur", il);
// reshape, Qcur -> [64][12(first layer)][n_tokens]
// reshape, Kcur -> [64][3(first layer)][n_tokens]
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, num_query_heads[il], n_tokens);
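
To make the wqkv layout comment above concrete, here is a worked example using the first-layer head counts that appear in this diff (12 query, 3 key, 3 value heads, head_dim 64). The Q-then-K-then-V ordering mirrors the view offsets above; the exact numbers are illustrative:

    #include <cstdio>
    #include <cstdint>

    int main() {
        // one fused QKV row produced by wqkv, in F32 elements:
        //   [ Q: n_head_q*head_dim | K: n_head_k*head_dim | V: n_head_v*head_dim ]
        const int64_t head_dim = 64;
        const int64_t n_head_q = 12, n_head_k = 3, n_head_v = 3;              // first layer in this diff
        const int64_t row      = (n_head_q + n_head_k + n_head_v) * head_dim; // 1152 floats per token
        const size_t  k_off    =  n_head_q             * head_dim * sizeof(float); // 3072 bytes
        const size_t  v_off    = (n_head_q + n_head_k) * head_dim * sizeof(float); // 3840 bytes
        printf("row width: %lld floats, K byte offset: %zu, V byte offset: %zu\n", (long long) row, k_off, v_off);
        return 0;
    }
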
@@ -10799,32 +10918,22 @@ struct llm_build_context {
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Qcur, "queries", il);
Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
cb(Qcur, "Qcur", il);
// Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
// cb(Qcur, "Qcur", il);
Kcur = ggml_rope_custom(
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
int64_t nev[GGML_MAX_DIMS] = {2*Vcur->ne[0], Vcur->ne[1], Vcur->ne[2], Vcur->ne[3]};
int64_t nev[GGML_MAX_DIMS] = {Vcur->ne[0], Vcur->ne[1], 4*Vcur->ne[2], Vcur->ne[3]};
struct ggml_tensor * Vcur2 = ggml_new_tensor(ctx0, Vcur->type, GGML_MAX_DIMS, nev);
Vcur2->grad = ggml_dup_tensor(ctx0, Vcur);
Vcur2 = ggml_reshape_2d(ctx0, Vcur2, modified_hparams.n_embd_k_gqa(), n_tokens);
int64_t nek[GGML_MAX_DIMS] = {2*Kcur->ne[0], Kcur->ne[1], Kcur->ne[2], Kcur->ne[3]};
struct ggml_tensor * Kcur2 = ggml_new_tensor(ctx0, Kcur->type, GGML_MAX_DIMS, nek);
Kcur2->grad = ggml_dup_tensor(ctx0, Kcur);
Kcur2 = ggml_reshape_2d(ctx0, Kcur2, modified_hparams.n_embd_k_gqa(), n_tokens);
cb(Kcur, "Kcur", il);
// Attempt at transcribing from the Python implementation:
// cur = ggml_flash_attn(ctx0, Qcur, Kcur, Vcur, true);
// cur = ggml_transpose(ctx0, cur);
// cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// cur = ggml_cont(ctx0, cur);
// cur = ggml_reshape_2d(ctx0, cur, n_embd_head_k*(2*n_head_kv), n_tokens);
// cur = ggml_mul_mat(ctx0, cur, model.layers[il].wo);
// cur = ggml_transpose(ctx0, cur);
Vcur2 = ggml_repeat(ctx0, Vcur, Vcur2);
int64_t nek[GGML_MAX_DIMS] = {Kcur->ne[0], Kcur->ne[1], 4*Kcur->ne[2], Kcur->ne[3]};
struct ggml_tensor * Kcur2 = ggml_new_tensor(ctx0, Vcur->type, GGML_MAX_DIMS, nek);
Kcur2 = ggml_repeat(ctx0, Kcur, Kcur2);
cur = llm_build_kv(ctx0, model, modified_hparams, kv_self, gf,
model.layers[il].wo, NULL,
@@ -10837,11 +10946,11 @@ struct llm_build_context {
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
}
cur = ggml_add(ctx0, cur, residual);
residual = cur;
cur = llm_build_norm(ctx0, cur, modified_hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
residual = cur;
// FF
{
@@ -10860,19 +10969,19 @@ struct llm_build_context {
LLM_FFN_SILU2, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}
residual = cur;
cur = ggml_add(ctx0, residual, cur);
cb(cur, "l_out", il);
inpL = cur;
}
cur = llm_build_norm(ctx0, inpL, modified_hparams,
model.output_norm,
NULL,
LLM_NORM_RMS, cb, -1);
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);
@@ -15440,7 +15549,7 @@ struct llama_context_params llama_context_default_params() {
/*.type_v =*/ GGML_TYPE_F16,
/*.logits_all =*/ false,
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.offload_kqv =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};
@@ -15606,7 +15715,7 @@ struct llama_context * llama_new_context_with_model(
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.offload_kqv = false;
cparams.pooling_type = params.pooling_type;
cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
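
One more note on the attention block above: the Kcur2/Vcur2 tensors materialise extra key/value heads with ggml_repeat so that 3 KV heads can serve 12 query heads (a group size of 4). A minimal standalone sketch of that repeat pattern, with shapes chosen for illustration rather than copied from the diff:

    #include <cstdio>
    #include <cstdint>
    #include "ggml.h"

    int main() {
        struct ggml_init_params params = {/*.mem_size =*/ 64*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false};
        struct ggml_context * ctx = ggml_init(params);

        const int64_t head_dim = 64, n_tokens = 7, n_head_kv = 3, n_group = 4; // 3 kv heads shared by 12 query heads

        // K with 3 heads, laid out as [head_dim, n_tokens, n_head_kv]
        struct ggml_tensor * k  = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_tokens, n_head_kv);
        // target shape with the kv heads repeated 4x along ne2, matching the 4*Kcur->ne[2] above
        struct ggml_tensor * k2 = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, head_dim, n_tokens, n_group*n_head_kv);
        // ggml_repeat tiles k until it fills k2's shape
        k2 = ggml_repeat(ctx, k, k2);

        printf("repeated K: %lld x %lld x %lld\n", (long long) k2->ne[0], (long long) k2->ne[1], (long long) k2->ne[2]);

        ggml_free(ctx);
        return 0;
    }

Because ggml_repeat tiles the source to the target shape, whether this matches the intended head grouping depends on how Kcur is laid out at that point in the graph.
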

6
scratch_21.sh Normal file

@@ -0,0 +1,6 @@
eval-callback \
--hf-repo models/openelm-small/ \
--model ggml-model-f16.gguf \
--prompt hello \
--seed 42 \
-ngl 0

40
split_test.cpp Normal file

@@ -0,0 +1,40 @@
#include <cstdio>
#include <iostream>
#include "ggml.h"
int main() {
printf("split_test\n");
// Initialization
struct ggml_init_params params = {/*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false}; // reserve enough context memory for the 18*7*64 F32 tensor below
ggml_context *ctx = ggml_init(params);
// Tensor Creation (Analogous to the PyTorch code)
int64_t size = 18 * 7 * 64;
int64_t dims[4] = {1, 18, 7, 64};
ggml_tensor *tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, dims);
// Initialize tensor data (Note: Simplified for this example)
float* tensor_data = (float*) tensor->data;
for (int i = 0; i < size; i++) {
tensor_data[i] = (float) i;
printf("%f", tensor_data[i]);
}
printf("\n");
// Reshaping and Transpose
// ... (You'll need ggml equivalents of reshape and transpose)
// Splitting (We'll focus on this part)
int64_t num_q_heads = 12;
int64_t num_k_heads = 3;
int64_t num_v_heads = 3;
ggml_tensor *a = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
ggml_tensor *b = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
ggml_tensor *c = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
// Accessing elements (assuming ggml provides similar access)
float *a_data = (float*) a->data;
std::cout << a_data[0] << std::endl;
ggml_free(ctx);
return 0;
}

271
tests/test-split.cpp Normal file

@@ -0,0 +1,271 @@
#include "llama.h"
#include "common.h"
#include "console.h"
#include <cstdio>
#include <string>
#include <map>
#include <vector>
#include <fstream>
//static const std::map<std::string, std::vector<llama_token>> & k_tests() {
// static std::map<std::string, std::vector<llama_token>> _k_tests = {
// { "" , { }, },
// { " " , { 220, }, },
// { " " , { 256, }, },
// { " " , { 262, }, },
// { "\t" , { 197, }, },
// { "\n" , { 198, }, },
// { "\n\n" , { 271, }, },
// { "\n\n\n" , { 1432, }, },
// { "\t\n" , { 1602, }, },
// { "Hello world" , { 9906, 1917, }, },
// { " Hello world" , { 22691, 1917, }, },
// { "Hello World" , { 9906, 4435, }, },
// { " Hello World" , { 22691, 4435, }, },
// { " Hello World!" , { 22691, 4435, 0, }, },
// { "Hello, world!" , { 9906, 11, 1917, 0, }, },
// { " Hello, world!" , { 22691, 11, 1917, 0, }, },
// { " this is 🦙.cpp" , { 420, 374, 11410, 99, 247, 13, 11055, }, },
// { "w048 7tuijk dsdfhu" , { 86, 23904, 220, 22, 83, 2005, 42908, 11729, 3013, 17156, }, },
// { "нещо на Български" , { 79862, 102118, 13373, 64571, 34694, 3114, 112203, 80112, }, },
// { "កាន់តែពិសេសអាចខលចេញ" , { 21549, 222, 98629, 241, 45358, 233, 21549, 237, 45358, 224, 21549, 244, 21549, 115, 21549, 253, 45358, 223, 21549, 253, 21549, 95, 98629, 227, 21549, 223, 21549, 249, 21549, 227, 45358, 223, 21549, 231, }, },
// { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 9468, 248, 222, 320, 8416, 8, 27623, 114, 102470, 9468, 234, 104, 31643, 320, 36773, 100166, 98634, 8, 26602, 227, 320, 3323, 43465, 430, 706, 1202, 1866, 4037, 8, }, },
// { "Hello" , { 9906, }, },
// { " Hello" , { 22691, }, },
// { " Hello" , { 220, 22691, }, },
// { " Hello" , { 256, 22691, }, },
// { " Hello" , { 262, 22691, }, },
// { " Hello\n Hello" , { 262, 22691, 198, 262, 22691, }, },
// { " (" , { 320, }, },
// { "\n =" , { 198, 284, }, },
// { "' era" , { 6, 11639, }, },
// { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天", { 9906, 11, 379, 65948, 0, 2650, 527, 499, 27623, 223, 949, 37046, 101067, 19000, 23182, 102301, 9263, 18136, 16, 36827, 21909, }, },
// { "3" , { 18, }, },
// { "33" , { 1644, }, },
// { "333" , { 8765, }, },
// { "3333" , { 8765, 18, }, },
// { "33333" , { 8765, 1644, }, },
// { "333333" , { 8765, 8765, }, },
// { "3333333" , { 8765, 8765, 18, }, },
// { "33333333" , { 8765, 8765, 1644, }, },
// { "333333333" , { 8765, 8765, 8765, }, },
// };
//
// return _k_tests;
//}
static std::map<std::string, std::vector<llama_token>> read_tests(const std::string & fname_inp, const std::string & fname_out) {
std::map<std::string, std::vector<llama_token>> tests;
std::ifstream ifs_inp(fname_inp);
if (!ifs_inp) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str());
return tests;
}
std::string sraw((std::istreambuf_iterator<char>(ifs_inp)), std::istreambuf_iterator<char>());
std::ifstream ifs_out(fname_out);
if (!ifs_out) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
return tests;
}
std::vector<std::string> sout;
for (std::string line; std::getline(ifs_out, line);) {
sout.push_back(line);
}
const std::string sep = "\n__ggml_vocab_test__\n";
std::vector<std::string> sinp;
size_t pos = 0;
while (pos < sraw.size()) {
const size_t next = sraw.find(sep, pos);
if (next == std::string::npos) {
sinp.push_back(sraw.substr(pos));
break;
}
sinp.push_back(sraw.substr(pos, next - pos));
pos = next + sep.size();
}
if (sinp.size() != sout.size()) {
fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__);
return tests;
}
for (size_t i = 0; i < sinp.size(); ++i) {
const std::string & s = sinp[i];
const std::string & o = string_strip(sout[i]);
std::vector<llama_token> toks;
size_t pos = 0;
while (pos < o.size()) {
size_t next = o.find(' ', pos);
if (next == std::string::npos) {
next = o.size();
}
const std::string stok = o.substr(pos, next - pos);
toks.push_back(std::stoi(stok));
pos = next + 1;
}
tests[s] = toks;
}
return tests;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
const std::string fname_inp = fname + ".inp";
const std::string fname_out = fname + ".out";
std::string fname_text;
if (argc > 2) {
fname_text = argv[2];
}
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init();
// load the vocab
{
auto mparams = llama_model_default_params();
mparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
auto cparams = llama_context_default_params();
ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
bool success = true;
const auto k_tests = read_tests(fname_inp, fname_out);
if (k_tests.empty()) {
fprintf(stderr, "%s : error: no tests found\n", __func__);
return 1;
}
const bool add_special = false;
for (const auto & test_kv : k_tests) {
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special);
printf("\n");
printf("src: '%s'\n", test_kv.first.c_str());
printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
printf("tok: ");
for (const auto & tok : res) {
printf("%d ", tok);
}
printf("\n");
bool correct = res.size() == test_kv.second.size();
for (int i = 0; i < (int) res.size() && correct; ++i) {
if (test_kv.second[i] != res[i]) {
correct = false;
}
}
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
llama_detokenize_bpe(ctx, res).c_str(),
llama_detokenize_bpe(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
}
fprintf(stderr, "\n");
success = false;
}
}
if (!fname_text.empty()) {
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
std::string text;
{
std::ifstream ifs(fname_text);
if (!ifs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
return 1;
}
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
}
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
const std::vector<llama_token> res = llama_tokenize(ctx, text, add_special);
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
{
const std::string fname_out = fname_text + ".tokcpp";
std::ofstream ofs(fname_out);
if (!ofs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
return 1;
}
for (const auto & tok : res) {
ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
}
}
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
}
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
printf("\n");
printf("Tests %s\n", success ? "passed" : "failed");
return success ? 0 : 3;
}