WIP
parent 8d2dead681
commit 98ba54e5ec
7 changed files with 5930 additions and 47 deletions
debug.openelm-2.txt (new file, 5412 lines)
File diff suppressed because it is too large
examples/split-test/CMakeLists.txt (new file, 5 lines)
@@ -0,0 +1,5 @@
set(TARGET split-test)
add_executable(${TARGET} split-test.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/split-test/split-test.cpp (new file, 40 lines)
@@ -0,0 +1,40 @@
#include <iostream>
#include "ggml.h"

int main() {
    printf("split_test\n");
    // Initialization: give the context enough memory for the test tensor
    // (ggml_init_params is {mem_size, mem_buffer, no_alloc})
    struct ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    ggml_context *ctx = ggml_init(params);

    // Tensor Creation (Analogous to the PyTorch code)
    // Note: ggml lists dimensions innermost-first, so the PyTorch shape
    // (1, 18, 7, 64) corresponds to ne = {64, 7, 18, 1} (head_dim, tokens, heads, batch).
    int64_t size = 18 * 7 * 64;
    int64_t dims[4] = {64, 7, 18, 1};
    ggml_tensor *tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, dims);

    // Initialize tensor data (Note: Simplified for this example)
    float* tensor_data = (float*) tensor->data;
    for (int i = 0; i < size; i++) {
        tensor_data[i] = (float) i;
        printf("%f ", tensor_data[i]);
    }
    printf("\n");

    // Reshaping and Transpose
    // ... (You'll need ggml equivalents of reshape and transpose)
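    // A possible sketch of that step (assumption, not the author's final code):
    // ggml uses ggml_permute/ggml_cont instead of PyTorch's transpose/contiguous.
    // ggml_tensor *permuted = ggml_cont(ctx, ggml_permute(ctx, tensor, 0, 2, 1, 3)); // swap token and head axes -> {64, 18, 7, 1}
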
    // Splitting (We'll focus on this part)
    int64_t num_q_heads = 12;
    int64_t num_k_heads = 3;
    int64_t num_v_heads = 3;

    ggml_tensor *a = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
    ggml_tensor *b = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
    ggml_tensor *c = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
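
    // Hedged sketch of how the placeholder view arguments above could be filled in
    // (my assumption, not part of the original commit): with ne = {64, 7, 18, 1}
    // the 18 heads split 12/3/3 along ne[2]; strides come from the source tensor
    // and offsets are given in bytes.
    // ggml_tensor *q = ggml_view_3d(ctx, tensor, 64, 7, num_q_heads, tensor->nb[1], tensor->nb[2], 0);
    // ggml_tensor *k = ggml_view_3d(ctx, tensor, 64, 7, num_k_heads, tensor->nb[1], tensor->nb[2],  num_q_heads                * tensor->nb[2]);
    // ggml_tensor *v = ggml_view_3d(ctx, tensor, 64, 7, num_v_heads, tensor->nb[1], tensor->nb[2], (num_q_heads + num_k_heads) * tensor->nb[2]);
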
    // Accessing elements (assuming ggml provides similar access)
    float *a_data = (float*) a->data;
    std::cout << a_data[0] << std::endl;

    return 0;
}
llama.cpp (203 changed lines)
@@ -6320,6 +6320,42 @@ static void llm_build_kv_store(
(kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il);

// important: storing RoPE-ed version of K in the KV cache!
// ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
// ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
}

static void llm_build_kv_store2(
struct ggml_context * ctx,
const llama_hparams & hparams,
const llama_kv_cache & kv,
struct ggml_cgraph * graph,
struct ggml_tensor * k_cur,
struct ggml_tensor * v_cur,
int64_t n_ctx,
int32_t n_tokens,
int32_t kv_head,
const llm_build_cb & cb,
int64_t il) {
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa()/hparams.n_head_kv;
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa()/hparams.n_head_kv;

GGML_ASSERT(kv.size == n_ctx);

// compute the transposed [n_tokens, n_embd] V matrix
assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens);
struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur);
cb(v_cur_t, "v_cur_t", il);

struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], ggml_nbytes(k_cur)/4,
(ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head);
cb(k_cache_view, "k_cache_view", il);

struct ggml_tensor * v_cache_view = ggml_view_1d(ctx, kv.v_l[il], ggml_nbytes(v_cur)/4,
// ( n_ctx)*ggml_element_size(kv.v_l[il]),
(kv_head)*ggml_element_size(kv.v_l[il]));
cb(v_cache_view, "v_cache_view", il);

// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
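
// Note: the two ggml_cpy calls above are what actually write into the per-layer
// cache: k_cache_view / v_cache_view are 1-D windows into kv.k_l[il] / kv.v_l[il]
// positioned at kv_head, and expanding the copies into the graph makes the writes
// happen when the graph is evaluated. The ggml_nbytes(...)/4 sizing of the views
// appears to be a WIP shortcut rather than a general formula.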
@@ -6409,12 +6445,16 @@ static struct ggml_tensor * llm_build_ffn(
} break;
case LLM_FFN_SILU2:
{
struct ggml_tensor * one = ggml_view_2d(ctx, cur, cur->ne[0]/2, cur->ne[1], cur->nb[1], 0);
int offset = sizeof(float) * (cur->ne[0]/2) * (cur->ne[1]);
struct ggml_tensor * two = ggml_view_2d(ctx, cur, cur->ne[0]/2, cur->ne[1], cur->nb[1], offset);
cur = ggml_mul(ctx, ggml_silu(ctx, one), two);
// Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
int64_t split_point = cur->ne[0] / 2;
struct ggml_tensor * x0 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], 0));
struct ggml_tensor * x1 = ggml_cont(ctx, ggml_view_2d(ctx, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));

x0 = ggml_silu(ctx, x0);
cb(cur, "ffn_silu", il);

cur = ggml_mul(ctx, x0, x1);
cb(cur, "ffn_mul", il);
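// In other words, this case implements the SwiGLU gating from the paper linked
// above: the up-projection produces a tensor of twice the FFN width, the first
// half x0 goes through SiLU, and the result gates the second half elementwise,
// cur = silu(x0) * x1. The offset selecting x1 is in bytes, hence the
// split_point * ggml_element_size(cur) factor.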
} break;
case LLM_FFN_GELU:
{

@@ -6589,7 +6629,7 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
cb(kq, "kq", il);

if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3 || model.arch == LLM_ARCH_OPENELM) {
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
// for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
// ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);

@@ -6706,6 +6746,44 @@ static struct ggml_tensor * llm_build_kv(
return cur;
}

static struct ggml_tensor * llm_build_kv2(
struct ggml_context * ctx,
const llama_model & model,
const llama_hparams & hparams,
const llama_kv_cache & kv,
struct ggml_cgraph * graph,
struct ggml_tensor * wo,
struct ggml_tensor * wo_b,
struct ggml_tensor * k_cur,
struct ggml_tensor * v_cur,
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
struct ggml_tensor * kq_pos,
int64_t n_ctx,
int32_t n_tokens,
int32_t kv_head,
int32_t n_kv,
float kq_scale,
const llm_build_cb & cb,
int il) {

// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
ggml_build_forward_expand(graph, q_cur);
ggml_build_forward_expand(graph, k_cur);
ggml_build_forward_expand(graph, v_cur);

llm_build_kv_store2(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);

struct ggml_tensor * cur;

cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b,
q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il);

return cur;
}

struct llm_build_context {
const llama_model & model;
llama_context & lctx;

@@ -10735,7 +10813,6 @@ struct llm_build_context {
llama_hparams modified_hparams(hparams);
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

// struct ggml_tensor * KQ_mask = build_inp_KQ_mask2(n_kv);
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

@@ -10745,6 +10822,7 @@ struct llm_build_context {
// This doesn't work at the moment, comment out to test
const int64_t n_head_k = num_kv_heads[il];
const int64_t n_head_v = num_kv_heads[il];
const int64_t n_head_q = num_query_heads[il];
const int64_t n_head_kv = n_head_k+n_head_v;
const int64_t n_head = n_head_kv+ num_query_heads[il];
// const int64_t n_kv = (num_kv_heads[il]+num_kv_heads[il])*n_embd_head; // This makes asserts fail
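
// Unlike most llama.cpp architectures, OpenELM varies the attention width per
// layer, which is why the head counts above come from per-layer tables
// (num_query_heads[il], num_kv_heads[il]) rather than the fixed hparams.n_head /
// n_head_kv; the modified_hparams copy below is how this WIP feeds those
// per-layer values into helpers that still expect global hparams.
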
@@ -10752,11 +10830,21 @@ struct llm_build_context {
modified_hparams.n_head = 4*n_head_k; // somehow this works. Some places expect this to be groups*n_head_kv instead of n_head; maybe that is the definition somewhere.
modified_hparams.n_head_kv = n_head_kv;
const int64_t n_embd_gqa = n_embd_head * n_head;
const int64_t n_embd_k_gqa = modified_hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = modified_hparams.n_embd_v_gqa();


struct ggml_tensor * attn_q_norm = model.layers[il].attn_q_norm;
cb(attn_q_norm, "attn_q_norm", il);
struct ggml_tensor * attn_k_norm = model.layers[il].attn_k_norm;
cb(attn_k_norm, "attn_k_norm", il);

// const int64_t n_embd_k_gqa = modified_hparams.n_embd_k_gqa();
// const int64_t n_embd_v_gqa = modified_hparams.n_embd_v_gqa();

// self-attention
{

cb(model.layers[il].attn_norm, "attn_norm.weight", il);
struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, modified_hparams,
model.layers[il].attn_norm,
NULL,

@@ -10766,30 +10854,61 @@ struct llm_build_context {
struct ggml_tensor * Qcur = nullptr;
struct ggml_tensor * Kcur = nullptr;
struct ggml_tensor * Vcur = nullptr;

cb(model.layers[il].wqkv, "qkv_proj_weight", il);
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); // model.layers[il].wqkv -> might not be all 3 qkv
cb(cur, "wqkv", il);
// model.layers[il].wqkv has dimensionality of
// [model_dim][(n_head_k+n_head_v+n_head_q)*head_dim]
// In most other impls, this is [model_dim][3*above]
// This matches up with the dimensions of the huggingface version
Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, num_query_heads[il], cur->nb[1], cur->nb[2], 0));
Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, n_head_k, cur->nb[1], cur->nb[2], 1 * sizeof(float) * (n_embd_head)));
Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, n_head_k, cur->nb[1], cur->nb[2], 2 * sizeof(float) * (n_embd_head)));
cb(cur, "qkv", il);
cur = ggml_reshape_3d(ctx0, cur, n_embd_head, n_tokens, n_head);
cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// TODO: these need to be calculated correctly

/*

struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens);
cb(tmpqkv, "tmpqkv", il);

struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2));
cb(tmpqkv_perm, "tmpqkv", il);

struct ggml_tensor * tmpq = ggml_view_3d(
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
ggml_element_size(tmpqkv_perm) * n_embd_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
0
);
cb(tmpq, "tmpq", il);

struct ggml_tensor * tmpk = ggml_view_3d(
ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens,
ggml_element_size(tmpqkv_perm) * n_embd_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head,
ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens
);

*/
size_t elemsize = ggml_element_size(cur);

Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, num_query_heads[il], cur->nb[1], cur->nb[2]*num_query_heads[il], 0));
cb(Qcur, "queries", il);
Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, n_head_k, cur->nb[1], cur->nb[2]*n_head_k, cur->nb[2]*num_query_heads[il]));
cb(Kcur, "keys", il);
Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd_head, n_tokens, n_head_q, cur->nb[1], cur->nb[2]*n_head_v, cur->nb[2]*(num_query_heads[il]+n_head_k)));
cb(Vcur, "values", il);
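// Layout note (my reading of the intent, not verified against the upstream
// OpenELM implementation): the fused wqkv output packs, per token, the query
// head block first, then the key heads, then the value heads, each block holding
// head_count * n_embd_head values. The three views above are meant to carve out
// those blocks, with the K and V offsets skipping the query block and the
// query+key blocks respectively; offsets passed to ggml_view_3d are in bytes.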
// Q/K Layernorm

cb(Qcur, "queries", il);
Qcur = llm_build_norm(ctx0, Qcur, modified_hparams,
model.layers[il].attn_q_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(Qcur, "Qcur", il);

Kcur = llm_build_norm(ctx0, Kcur, modified_hparams,
model.layers[il].attn_k_norm,
NULL,
LLM_NORM_RMS, cb, il);
cb(Kcur, "Kcur", il);
cb(Kcur, "keys", il);

cb(Vcur, "Vcur", il);
// reshape, Qcur -> [64][12(first layer)][n_tokens]
// reshape, Kcur -> [64][3(first layer)][n_tokens]
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, num_query_heads[il], n_tokens);

@@ -10799,32 +10918,22 @@ struct llm_build_context {
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);
cb(Qcur, "queries", il);

Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
cb(Qcur, "Qcur", il);
// Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head)));
// cb(Qcur, "Qcur", il);

Kcur = ggml_rope_custom(
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
int64_t nev[GGML_MAX_DIMS] = {2*Vcur->ne[0], Vcur->ne[1], Vcur->ne[2], Vcur->ne[3]};
int64_t nev[GGML_MAX_DIMS] = {Vcur->ne[0], Vcur->ne[1], 4*Vcur->ne[2], Vcur->ne[3]};
struct ggml_tensor * Vcur2 = ggml_new_tensor(ctx0, Vcur->type, GGML_MAX_DIMS, nev);
Vcur2->grad = ggml_dup_tensor(ctx0, Vcur);
Vcur2 = ggml_reshape_2d(ctx0, Vcur2, modified_hparams.n_embd_k_gqa(), n_tokens);
int64_t nek[GGML_MAX_DIMS] = {2*Kcur->ne[0], Kcur->ne[1], Kcur->ne[2], Kcur->ne[3]};
struct ggml_tensor * Kcur2 = ggml_new_tensor(ctx0, Kcur->type, GGML_MAX_DIMS, nek);
Kcur2->grad = ggml_dup_tensor(ctx0, Kcur);
Kcur2 = ggml_reshape_2d(ctx0, Kcur2, modified_hparams.n_embd_k_gqa(), n_tokens);
cb(Kcur, "Kcur", il);
// Attempt at transcribing from the Python implementation:
// cur = ggml_flash_attn(ctx0, Qcur, Kcur, Vcur, true);
// cur = ggml_transpose(ctx0, cur);
// cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
// cur = ggml_cont(ctx0, cur);
// cur = ggml_reshape_2d(ctx0, cur, n_embd_head_k*(2*n_head_kv), n_tokens);
// cur = ggml_mul_mat(ctx0, cur, model.layers[il].wo);
// cur = ggml_transpose(ctx0, cur);
Vcur2 = ggml_repeat(ctx0, Vcur, Vcur2);

int64_t nek[GGML_MAX_DIMS] = {Kcur->ne[0], Kcur->ne[1], 4*Kcur->ne[2], Kcur->ne[3]};
struct ggml_tensor * Kcur2 = ggml_new_tensor(ctx0, Vcur->type, GGML_MAX_DIMS, nek);
Kcur2 = ggml_repeat(ctx0, Kcur, Kcur2);
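// The ggml_repeat calls above appear to broadcast each key/value head 4 times
// (hence the hard-coded 4*...->ne[2] shapes) so that downstream code sees as many
// K/V heads as query heads, i.e. an eager grouped-query-attention expansion; this
// is a reading of the WIP code, not a documented design decision.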

cur = llm_build_kv(ctx0, model, modified_hparams, kv_self, gf,
model.layers[il].wo, NULL,

@@ -10837,11 +10946,11 @@ struct llm_build_context {
residual = ggml_get_rows(ctx0, residual, inp_out_ids);
}
cur = ggml_add(ctx0, cur, residual);
residual = cur;
cur = llm_build_norm(ctx0, cur, modified_hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);
cb(cur, "ffn_norm", il);
residual = cur;
// FF
{

@@ -10860,19 +10969,19 @@ struct llm_build_context {
LLM_FFN_SILU2, LLM_FFN_SEQ, cb, il);
cb(cur, "ffn_out", il);
}

residual = cur;
cur = ggml_add(ctx0, residual, cur);
cb(cur, "l_out", il);

inpL = cur;
}

cur = llm_build_norm(ctx0, inpL, modified_hparams,
model.output_norm,
NULL,
LLM_NORM_RMS, cb, -1);
cur = llm_build_norm(ctx0, cur, hparams,
model.output_norm, NULL,
LLM_NORM_RMS, cb, -1);
cb(cur, "result_norm", -1);

// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
cb(cur, "result_output", -1);

@@ -15440,7 +15549,7 @@ struct llama_context_params llama_context_default_params() {
/*.type_v =*/ GGML_TYPE_F16,
/*.logits_all =*/ false,
/*.embeddings =*/ false,
/*.offload_kqv =*/ true,
/*.offload_kqv =*/ false,
/*.abort_callback =*/ nullptr,
/*.abort_callback_data =*/ nullptr,
};

@@ -15606,7 +15715,7 @@ struct llama_context * llama_new_context_with_model(
cparams.yarn_beta_slow = params.yarn_beta_slow;
cparams.defrag_thold = params.defrag_thold;
cparams.embeddings = params.embeddings;
cparams.offload_kqv = params.offload_kqv;
cparams.offload_kqv = false;
cparams.pooling_type = params.pooling_type;

cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
scratch_21.sh (new file, 6 lines)
@@ -0,0 +1,6 @@
eval-callback \
    --hf-repo models/openelm-small/ \
    --model ggml-model-f16.gguf \
    --prompt hello \
    --seed 42 \
    -ngl 0
split_test.cpp (new file, 40 lines)
@@ -0,0 +1,40 @@
#include <iostream>
#include "ggml.h"

int main() {
    printf("split_test\n");
    // Initialization: give the context enough memory for the test tensor
    // (ggml_init_params is {mem_size, mem_buffer, no_alloc})
    struct ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    ggml_context *ctx = ggml_init(params);

    // Tensor Creation (Analogous to the PyTorch code)
    // Note: ggml lists dimensions innermost-first, so the PyTorch shape
    // (1, 18, 7, 64) corresponds to ne = {64, 7, 18, 1} (head_dim, tokens, heads, batch).
    int64_t size = 18 * 7 * 64;
    int64_t dims[4] = {64, 7, 18, 1};
    ggml_tensor *tensor = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, dims);

    // Initialize tensor data (Note: Simplified for this example)
    float* tensor_data = (float*) tensor->data;
    for (int i = 0; i < size; i++) {
        tensor_data[i] = (float) i;
        printf("%f ", tensor_data[i]);
    }
    printf("\n");

    // Reshaping and Transpose
    // ... (You'll need ggml equivalents of reshape and transpose)

    // Splitting (We'll focus on this part)
    int64_t num_q_heads = 12;
    int64_t num_k_heads = 3;
    int64_t num_v_heads = 3;

    ggml_tensor *a = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
    ggml_tensor *b = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);
    ggml_tensor *c = ggml_view_3d(ctx, tensor, /*ne0*/1, /*ne1*/2, /*ne2*/3, /*nb1*/4, /*nb2*/5, /*offset*/6);

    // Accessing elements (assuming ggml provides similar access)
    float *a_data = (float*) a->data;
    std::cout << a_data[0] << std::endl;

    return 0;
}
tests/test-split.cpp (new file, 271 lines)
@@ -0,0 +1,271 @@
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "console.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
|
||||
//static const std::map<std::string, std::vector<llama_token>> & k_tests() {
|
||||
// static std::map<std::string, std::vector<llama_token>> _k_tests = {
|
||||
// { "" , { }, },
|
||||
// { " " , { 220, }, },
|
||||
// { " " , { 256, }, },
|
||||
// { " " , { 262, }, },
|
||||
// { "\t" , { 197, }, },
|
||||
// { "\n" , { 198, }, },
|
||||
// { "\n\n" , { 271, }, },
|
||||
// { "\n\n\n" , { 1432, }, },
|
||||
// { "\t\n" , { 1602, }, },
|
||||
// { "Hello world" , { 9906, 1917, }, },
|
||||
// { " Hello world" , { 22691, 1917, }, },
|
||||
// { "Hello World" , { 9906, 4435, }, },
|
||||
// { " Hello World" , { 22691, 4435, }, },
|
||||
// { " Hello World!" , { 22691, 4435, 0, }, },
|
||||
// { "Hello, world!" , { 9906, 11, 1917, 0, }, },
|
||||
// { " Hello, world!" , { 22691, 11, 1917, 0, }, },
|
||||
// { " this is 🦙.cpp" , { 420, 374, 11410, 99, 247, 13, 11055, }, },
|
||||
// { "w048 7tuijk dsdfhu" , { 86, 23904, 220, 22, 83, 2005, 42908, 11729, 3013, 17156, }, },
|
||||
// { "нещо на Български" , { 79862, 102118, 13373, 64571, 34694, 3114, 112203, 80112, }, },
|
||||
// { "កាន់តែពិសេសអាចខលចេញ" , { 21549, 222, 98629, 241, 45358, 233, 21549, 237, 45358, 224, 21549, 244, 21549, 115, 21549, 253, 45358, 223, 21549, 253, 21549, 95, 98629, 227, 21549, 223, 21549, 249, 21549, 227, 45358, 223, 21549, 231, }, },
|
||||
// { "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 9468, 248, 222, 320, 8416, 8, 27623, 114, 102470, 9468, 234, 104, 31643, 320, 36773, 100166, 98634, 8, 26602, 227, 320, 3323, 43465, 430, 706, 1202, 1866, 4037, 8, }, },
|
||||
// { "Hello" , { 9906, }, },
|
||||
// { " Hello" , { 22691, }, },
|
||||
// { " Hello" , { 220, 22691, }, },
|
||||
// { " Hello" , { 256, 22691, }, },
|
||||
// { " Hello" , { 262, 22691, }, },
|
||||
// { " Hello\n Hello" , { 262, 22691, 198, 262, 22691, }, },
|
||||
// { " (" , { 320, }, },
|
||||
// { "\n =" , { 198, 284, }, },
|
||||
// { "' era" , { 6, 11639, }, },
|
||||
// { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", { 9906, 11, 379, 65948, 0, 2650, 527, 499, 27623, 223, 949, 37046, 101067, 19000, 23182, 102301, 9263, 18136, 16, 36827, 21909, }, },
|
||||
// { "3" , { 18, }, },
|
||||
// { "33" , { 1644, }, },
|
||||
// { "333" , { 8765, }, },
|
||||
// { "3333" , { 8765, 18, }, },
|
||||
// { "33333" , { 8765, 1644, }, },
|
||||
// { "333333" , { 8765, 8765, }, },
|
||||
// { "3333333" , { 8765, 8765, 18, }, },
|
||||
// { "33333333" , { 8765, 8765, 1644, }, },
|
||||
// { "333333333" , { 8765, 8765, 8765, }, },
|
||||
// };
|
||||
//
|
||||
// return _k_tests;
|
||||
//}
|
||||
|
||||
static std::map<std::string, std::vector<llama_token>> read_tests(const std::string & fname_inp, const std::string & fname_out) {
|
||||
std::map<std::string, std::vector<llama_token>> tests;
|
||||
|
||||
std::ifstream ifs_inp(fname_inp);
|
||||
if (!ifs_inp) {
|
||||
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str());
|
||||
return tests;
|
||||
}
|
||||
|
||||
std::string sraw((std::istreambuf_iterator<char>(ifs_inp)), std::istreambuf_iterator<char>());
|
||||
|
||||
std::ifstream ifs_out(fname_out);
|
||||
if (!ifs_out) {
|
||||
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
|
||||
return tests;
|
||||
}
|
||||
|
||||
std::vector<std::string> sout;
|
||||
for (std::string line; std::getline(ifs_out, line);) {
|
||||
sout.push_back(line);
|
||||
}
|
||||
|
||||
const std::string sep = "\n__ggml_vocab_test__\n";
|
||||
|
||||
std::vector<std::string> sinp;
|
||||
|
||||
size_t pos = 0;
|
||||
while (pos < sraw.size()) {
|
||||
const size_t next = sraw.find(sep, pos);
|
||||
if (next == std::string::npos) {
|
||||
sinp.push_back(sraw.substr(pos));
|
||||
break;
|
||||
}
|
||||
sinp.push_back(sraw.substr(pos, next - pos));
|
||||
pos = next + sep.size();
|
||||
}
|
||||
|
||||
if (sinp.size() != sout.size()) {
|
||||
fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__);
|
||||
return tests;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < sinp.size(); ++i) {
|
||||
const std::string & s = sinp[i];
|
||||
const std::string & o = string_strip(sout[i]);
|
||||
|
||||
std::vector<llama_token> toks;
|
||||
|
||||
size_t pos = 0;
|
||||
while (pos < o.size()) {
|
||||
size_t next = o.find(' ', pos);
|
||||
if (next == std::string::npos) {
|
||||
next = o.size();
|
||||
}
|
||||
const std::string stok = o.substr(pos, next - pos);
|
||||
toks.push_back(std::stoi(stok));
|
||||
pos = next + 1;
|
||||
}
|
||||
|
||||
tests[s] = toks;
|
||||
}
|
||||
|
||||
return tests;
|
||||
}
|
||||
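
// Expected on-disk format, inferred from the parsing above (the commit does not
// ship sample files, so this is an assumption): <name>.inp holds the raw test
// strings separated by "\n__ggml_vocab_test__\n", and <name>.out holds one line
// per test with the expected token ids separated by single spaces. Using two of
// the cases from the commented-out table above, the pair would look like
//
//   .inp:   Hello world
//           __ggml_vocab_test__
//           3
//   .out:   9906 1917
//           18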

int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
        return 1;
    }

    const std::string fname = argv[1];

    const std::string fname_inp = fname + ".inp";
    const std::string fname_out = fname + ".out";

    std::string fname_text;
    if (argc > 2) {
        fname_text = argv[2];
    }

    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

    llama_model * model;
    llama_context * ctx;

    llama_backend_init();

    // load the vocab
    {
        auto mparams = llama_model_default_params();

        mparams.vocab_only = true;

        model = llama_load_model_from_file(fname.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            return 1;
        }

        auto cparams = llama_context_default_params();

        ctx = llama_new_context_with_model(model, cparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_free_model(model);
            return 1;
        }
    }

#ifdef _WIN32
    // We need this for unicode console support
    console::init(false, false);
    atexit([]() { console::cleanup(); });
#endif

    bool success = true;

    const auto k_tests = read_tests(fname_inp, fname_out);

    if (k_tests.empty()) {
        fprintf(stderr, "%s : error: no tests found\n", __func__);
        return 1;
    }

    const bool add_special = false;

    for (const auto & test_kv : k_tests) {
        const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special);

        printf("\n");
        printf("src: '%s'\n", test_kv.first.c_str());
        printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str());
        printf("tok: ");
        for (const auto & tok : res) {
            printf("%d ", tok);
        }
        printf("\n");

        bool correct = res.size() == test_kv.second.size();
        for (int i = 0; i < (int) res.size() && correct; ++i) {
            if (test_kv.second[i] != res[i]) {
                correct = false;
            }
        }

        if (!correct) {
            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
            fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
                llama_detokenize_bpe(ctx, res).c_str(),
                llama_detokenize_bpe(ctx, test_kv.second).c_str());
            fprintf(stderr, "%s : expected tokens: ", __func__);
            for (const auto & t : test_kv.second) {
                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
            }
            fprintf(stderr, "\n");
            fprintf(stderr, "%s : got tokens: ", __func__);
            for (const auto & t : res) {
                fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
            }
            fprintf(stderr, "\n");

            success = false;
        }
    }

    if (!fname_text.empty()) {
        fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());

        std::string text;
        {
            std::ifstream ifs(fname_text);
            if (!ifs) {
                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
                return 1;
            }
            text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
        }

        fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());

        const std::vector<llama_token> res = llama_tokenize(ctx, text, add_special);

        fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());

        {
            const std::string fname_out = fname_text + ".tokcpp";

            std::ofstream ofs(fname_out);
            if (!ofs) {
                fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
                return 1;
            }

            for (const auto & tok : res) {
                ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector<int>{tok})) << "'" << std::endl;
            }
        }

        fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
    }

    llama_free_model(model);
    llama_free(ctx);

    llama_backend_free();

    printf("\n");
    printf("Tests %s\n", success ? "passed" : "failed");

    return success ? 0 : 3;
}