From 8b2420d2494a24d9aa58a07d14a350c4f6868561 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 28 Oct 2023 19:54:28 +0300 Subject: [PATCH 01/20] llama : factor out ggml-alloc from graph build functions ggml-ci --- ggml.h | 2 +- llama.cpp | 555 +++++++++++++++++++++--------------------- 2 files changed, 215 insertions(+), 342 deletions(-) diff --git a/ggml.h b/ggml.h index 08bff5511..aa4b23e70 100644 --- a/ggml.h +++ b/ggml.h @@ -709,7 +709,7 @@ extern "C" { // Context tensor enumeration and lookup GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx); GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor); - GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); diff --git a/llama.cpp b/llama.cpp index 3d431ee7b..43c629358 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3091,8 +3091,9 @@ static bool llama_model_load( } static struct ggml_cgraph * llm_build_llama( - llama_context & lctx, - const llama_batch & batch) { + llama_context & lctx, + const llama_batch & batch, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -3118,10 +3119,10 @@ static struct ggml_cgraph * llm_build_llama( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ?
n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; //printf("n_kv = %d\n", n_kv); @@ -3142,11 +3143,6 @@ static struct ggml_cgraph * llm_build_llama( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } ggml_set_name(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); @@ -3156,12 +3152,8 @@ static struct ggml_cgraph * llm_build_llama( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } } + ggml_set_name(inpL, "inp_embd"); const int i_gpu_start = n_layer - n_gpu_layers; (void) i_gpu_start; @@ -3186,59 +3178,23 @@ static struct ggml_cgraph * llm_build_llama( // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } + ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); offload_func_kq(KQ_pos); ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); offload_func_kq(K_shift); ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -3480,7 +3436,8 @@ static struct ggml_cgraph * llm_build_llama( static struct ggml_cgraph * llm_build_baichaun( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -3506,10 +3463,10 @@ static struct ggml_cgraph * llm_build_baichaun( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv 
= ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; @@ -3528,11 +3485,6 @@ static struct ggml_cgraph * llm_build_baichaun( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } ggml_set_name(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); @@ -3542,12 +3494,8 @@ static struct ggml_cgraph * llm_build_baichaun( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } } + ggml_set_name(inpL, "inp_embd"); const int i_gpu_start = n_layer - n_gpu_layers; (void) i_gpu_start; @@ -3572,59 +3520,23 @@ static struct ggml_cgraph * llm_build_baichaun( // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); offload_func_kq(KQ_pos); ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); offload_func_kq(K_shift); ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -3883,7 +3795,8 @@ static struct ggml_cgraph * llm_build_baichaun( static struct ggml_cgraph * llm_build_refact( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + bool worst_case) { const auto & model 
= lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -3905,8 +3818,8 @@ static struct ggml_cgraph * llm_build_refact( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; // printf("n_kv = %d\n", n_kv); @@ -3927,11 +3840,6 @@ static struct ggml_cgraph * llm_build_refact( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } ggml_set_name(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); @@ -3941,12 +3849,8 @@ static struct ggml_cgraph * llm_build_refact( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } } + ggml_set_name(inpL, "inp_embd"); const int i_gpu_start = n_layer - n_gpu_layers; (void) i_gpu_start; @@ -3971,34 +3875,12 @@ static struct ggml_cgraph * llm_build_refact( // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } + ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } for (int il = 0; il < n_layer; ++il) { ggml_format_name(inpL, "layer_inp_%d", il); @@ -4228,7 +4110,8 @@ static struct ggml_cgraph * llm_build_refact( static struct ggml_cgraph * llm_build_falcon( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -4254,10 +4137,10 @@ static struct ggml_cgraph * llm_build_falcon( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n", // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift); @@ -4279,11 +4162,6 @@ static struct ggml_cgraph * llm_build_falcon( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } ggml_set_name(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); @@ -4293,12 +4171,8 @@ static struct ggml_cgraph * llm_build_falcon( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } } + ggml_set_name(inpL, "inp_embd"); const int i_gpu_start = n_layer - n_gpu_layers; (void) i_gpu_start; @@ -4323,59 +4197,23 @@ static struct ggml_cgraph * llm_build_falcon( // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); offload_func_kq(KQ_pos); ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); offload_func_kq(K_shift); ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -4595,7 +4433,8 @@ static struct ggml_cgraph * llm_build_falcon( static struct ggml_cgraph * llm_build_starcoder( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -4619,8 
+4458,8 @@ static struct ggml_cgraph * llm_build_starcoder( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -4635,32 +4474,23 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * token; - struct ggml_tensor * position; + struct ggml_tensor * embd; + struct ggml_tensor * pos; struct ggml_tensor * inpL; if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } ggml_set_name(inp_tokens, "inp_tokens"); - token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { #ifdef GGML_USE_MPI GGML_ASSERT(false && "not implemented"); #endif - token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, token); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token)); - } + embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } + ggml_set_name(embd, "inp_embd"); const int i_gpu_start = n_layer - n_gpu_layers; (void) i_gpu_start; @@ -4684,51 +4514,22 @@ static struct ggml_cgraph * llm_build_starcoder( #endif // GGML_USE_CUBLAS { - // Compute position embeddings. 
- struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_allocr_alloc(lctx.alloc, inp_positions); - if (!ggml_allocr_is_measure(lctx.alloc)) { - for (int i = 0; i < n_tokens; ++i) { - ((int32_t *) inp_positions->data)[i] = batch.pos[i]; - } - } - ggml_set_name(inp_positions, "inp_positions"); + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_name(inp_pos, "inp_pos"); - position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions); + pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos); } // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); ggml_set_name(KQ_mask, "KQ_mask"); offload_func_kq(KQ_mask); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - inpL = ggml_add(ctx0, token, position); + inpL = ggml_add(ctx0, embd, pos); ggml_set_name(inpL, "inpL"); for (int il = 0; il < n_layer; ++il) { @@ -4904,7 +4705,8 @@ static struct ggml_cgraph * llm_build_starcoder( static struct ggml_cgraph * llm_build_persimmon( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -4930,10 +4732,10 @@ static struct ggml_cgraph * llm_build_persimmon( const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -4951,12 +4753,8 @@ static struct ggml_cgraph * llm_build_persimmon( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } ggml_set_name(inp_tokens, "inp_tokens"); + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); @@ -4976,7 +4774,7 @@ static struct ggml_cgraph * llm_build_persimmon( if (!ggml_allocr_is_measure(lctx.alloc)) { ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + ggml_set_name(KQ_scale, "KQ_scale"); struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); @@ -5301,7 +5099,8 @@ static struct ggml_cgraph * llm_build_persimmon( static struct ggml_cgraph * llm_build_bloom( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -5323,8 +5122,8 @@ static struct ggml_cgraph * llm_build_bloom( const float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -5341,66 +5140,35 @@ static struct ggml_cgraph * llm_build_bloom( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * token; + struct ggml_tensor * embd; struct ggml_tensor * inpL; if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } ggml_set_name(inp_tokens, "inp_tokens"); - token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { #ifdef GGML_USE_MPI GGML_ASSERT(false && "not implemented"); #endif - token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, token); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token)); - } + embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } + ggml_set_name(embd, "embd"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } // norm { - inpL = ggml_norm(ctx0, token, norm_eps); - inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b); + inpL = ggml_norm(ctx0, embd, norm_eps); + inpL = ggml_add (ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b); } ggml_set_name(inpL, "inpL"); @@ -5416,9 +5184,9 @@ static struct ggml_cgraph * llm_build_bloom( // Self Attention cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); - struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd); - struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); + struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); + struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); + struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); struct ggml_tensor * Qcur = tmpq; struct ggml_tensor * Kcur = tmpk; @@ -5543,7 +5311,8 @@ static struct ggml_cgraph * llm_build_bloom( static struct ggml_cgraph * llm_build_mpt( 
llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -5567,8 +5336,8 @@ static struct ggml_cgraph * llm_build_mpt( const int n_gpu_layers = model.n_gpu_layers; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -5590,13 +5359,6 @@ static struct ggml_cgraph * llm_build_mpt( //int warmup = 0; if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - //warmup = ((uint32_t*) inp_tokens->data)[0] == 0; - } - ggml_set_name(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); @@ -5606,12 +5368,8 @@ static struct ggml_cgraph * llm_build_mpt( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } } + ggml_set_name(inpL, "inp_embd"); const int i_gpu_start = n_layer - n_gpu_layers; (void) i_gpu_start; @@ -5636,34 +5394,12 @@ static struct ggml_cgraph * llm_build_mpt( // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5865,43 +5601,180 @@ static struct ggml_cgraph * llama_build_graph( struct ggml_cgraph * result = NULL; + // check if we should build the worst-case graph (for memory measurement) + const bool worst_case = ggml_allocr_is_measure(lctx.alloc); + switch (model.arch) { case LLM_ARCH_LLAMA: { - result = llm_build_llama(lctx, batch); + result = llm_build_llama(lctx, batch, worst_case); } break; case LLM_ARCH_BAICHUAN: { - result = llm_build_baichaun(lctx, batch); + result = llm_build_baichaun(lctx, batch, worst_case); } break; case LLM_ARCH_FALCON: { - result = llm_build_falcon(lctx, batch); + result = llm_build_falcon(lctx, batch, worst_case); } break; case LLM_ARCH_STARCODER: { - result = 
llm_build_starcoder(lctx, batch); + result = llm_build_starcoder(lctx, batch, worst_case); } break; case LLM_ARCH_PERSIMMON: { - result = llm_build_persimmon(lctx, batch); + result = llm_build_persimmon(lctx, batch, worst_case); } break; case LLM_ARCH_REFACT: { - result = llm_build_refact(lctx, batch); + result = llm_build_refact(lctx, batch, worst_case); } break; case LLM_ARCH_BLOOM: { - result = llm_build_bloom(lctx, batch); + result = llm_build_bloom(lctx, batch, worst_case); } break; case LLM_ARCH_MPT: { - result = llm_build_mpt(lctx, batch); + result = llm_build_mpt(lctx, batch, worst_case); } break; default: GGML_ASSERT(false); } + // set input data to the graph + + // inp_tokens + if (batch.token) { + struct ggml_tensor * cur = ggml_graph_get_tensor(result, "inp_tokens"); + GGML_ASSERT(cur != nullptr); + + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; + + memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); + } + } else { // inp_embd + struct ggml_tensor * cur = ggml_graph_get_tensor(result, "inp_embd"); + GGML_ASSERT(cur != nullptr); + + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + + memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur)); + } + } + + // inp_pos + do { + struct ggml_tensor * cur = ggml_graph_get_tensor(result, "inp_pos"); + if (cur == nullptr) { + break; + } + + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + } while (0); + + // KQ_scale + do { + struct ggml_tensor * cur = ggml_graph_get_tensor(result, "KQ_scale"); + if (cur == nullptr) { + break; + } + + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd_head = lctx.model.hparams.n_embd_head(); + ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + } + } while (0); + + // KQ_mask + do { + struct ggml_tensor * cur = ggml_graph_get_tensor(result, "KQ_mask"); + if (cur == nullptr) { + break; + } + + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_kv = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + + float * data = (float *) cur->data; + memset(data, 0, ggml_nbytes(cur)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_kv; ++i) { + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + } while (0); + + // KQ_pos + do { + struct ggml_tensor * cur = ggml_graph_get_tensor(result, "KQ_pos"); + if (cur == nullptr) { + break; + } + + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + } while (0); + + // K_shift + do { + struct ggml_tensor * cur = ggml_graph_get_tensor(result, "K_shift"); + if (cur == nullptr) { + break; + } + + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_ctx = cur->ne[0]; + + int32_t * data = (int32_t *) 
cur->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } + } + } while (0); + return result; } From 5946d98fc88743be753728051035ff26d587b065 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 28 Oct 2023 21:22:01 +0300 Subject: [PATCH 02/20] metal : disable kernel load log --- ggml-metal.m | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 2380c4310..bc881395a 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -238,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { // load kernels { NSError * error = nil; -#define GGML_METAL_ADD_KERNEL(name) \ - ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ - ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ + + /* GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ + */ +#define GGML_METAL_ADD_KERNEL(name) \ + ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ + ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ if (error) { \ - GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } From 38aca9e1abac5564a5da71ee35a136ce76d2e29a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 28 Oct 2023 21:22:31 +0300 Subject: [PATCH 03/20] llama : factor out tensor offloading outside the build call (wip) ggml-ci --- llama.cpp | 200 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 126 insertions(+), 74 deletions(-) diff --git a/llama.cpp b/llama.cpp index 43c629358..22129ee43 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3008,10 +3008,10 @@ static void llm_load_tensors( #ifdef GGML_USE_CUBLAS const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = hparams.n_layer + 3; + const int max_offloadable_layers = hparams.n_layer + 3; #elif defined(GGML_USE_CLBLAST) const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; #endif // GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); @@ -3116,8 +3116,6 @@ static struct ggml_cgraph * llm_build_llama( const float freq_scale = cparams.rope_freq_scale; const float norm_rms_eps = hparams.f_norm_rms_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = worst_case ? n_ctx : kv_self.n; const int32_t kv_head = worst_case ? 
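// [editor's note] A minimal sketch of the input-handling pattern introduced by PATCH 01 above,
// shown outside the diff for clarity: the llm_build_* functions now only create and name their
// input tensors, and llama_build_graph() later finds each one by name, allocates it and fills
// in its data (skipping the copy while the allocator is in measure mode). This is an
// illustrative sketch, not part of the patch; the helper name set_graph_inputs() is
// hypothetical, all other calls are the ggml/llama APIs already used in the diff.
//
//     // inside a build function: only create and name the input tensor
//     struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
//     ggml_set_name(inp_tokens, "inp_tokens");
//
//     // after the graph is built: look the tensor up by name, allocate it and,
//     // unless the allocator is only measuring memory usage, copy the batch data into it
//     static void set_graph_inputs(llama_context & lctx, ggml_cgraph * gf, const llama_batch & batch) {
//         struct ggml_tensor * cur = ggml_graph_get_tensor(gf, "inp_tokens");
//         GGML_ASSERT(cur != nullptr);
//
//         ggml_allocr_alloc(lctx.alloc, cur);
//
//         if (!ggml_allocr_is_measure(lctx.alloc)) {
//             const int64_t n_tokens = cur->ne[0];
//             memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur));
//         }
//     }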
n_ctx - n_tokens : kv_self.head; @@ -3155,45 +3153,21 @@ static struct ggml_cgraph * llm_build_llama( } ggml_set_name(inpL, "inp_embd"); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); ggml_set_name(KQ_pos, "KQ_pos"); // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); ggml_set_name(K_shift, "K_shift"); for (int il = 0; il < n_layer; ++il) { @@ -3205,33 +3179,21 @@ static struct ggml_cgraph * llm_build_llama( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(tmp); + ggml_set_name(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; // norm { cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); ggml_set_name(cur, "rms_norm_0"); // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); ggml_set_name(cur, "attention_norm_0"); } @@ -3239,19 +3201,15 @@ static struct ggml_cgraph * llm_build_llama( { // compute Q and K and RoPE them struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); ggml_set_name(tmpk, "tmpk"); struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); // store key and value to memory @@ -3259,21 +3217,17 @@ static struct ggml_cgraph * llm_build_llama( // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); ggml_set_name(tmpv, 
"tmpv"); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); ggml_set_name(v, "v"); // important: storing RoPE-ed version of K in the KV cache! @@ -3282,7 +3236,6 @@ static struct ggml_cgraph * llm_build_llama( } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); ggml_set_name(Q, "Q"); struct ggml_tensor * K = @@ -3291,28 +3244,23 @@ static struct ggml_cgraph * llm_build_llama( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); ggml_set_name(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_kv, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads @@ -3322,12 +3270,10 @@ static struct ggml_cgraph * llm_build_llama( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); ggml_set_name(V, "V"); #if 1 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); ggml_set_name(KQV, "KQV"); #else // make V contiguous in memory to speed up the matmul, however we waste time on the copy @@ -3339,24 +3285,20 @@ static struct ggml_cgraph * llm_build_llama( // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); ggml_set_name(cur, "result_wo"); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); ggml_set_name(inpFF, "inpFF"); // feed-forward network @@ -3364,45 +3306,37 @@ static struct ggml_cgraph * llm_build_llama( // norm { cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); ggml_set_name(cur, "rms_norm_1"); // cur = cur*ffn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); ggml_set_name(cur, "ffn_norm"); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(tmp); ggml_set_name(tmp, "result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); - offload_func(cur); ggml_set_name(cur, "result_w1"); // SILU 
activation cur = ggml_silu(ctx0, cur); - offload_func(cur); ggml_set_name(cur, "silu"); cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); ggml_set_name(cur, "silu_x_result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); ggml_set_name(cur, "result_w2"); } cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); ggml_set_name(cur, "inpFF_+_result_w2"); // input for next layer @@ -3414,12 +3348,10 @@ static struct ggml_cgraph * llm_build_llama( // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); ggml_set_name(cur, "rms_norm_2"); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend ggml_set_name(cur, "result_norm"); } @@ -3884,7 +3816,6 @@ static struct ggml_cgraph * llm_build_refact( for (int il = 0; il < n_layer; ++il) { ggml_format_name(inpL, "layer_inp_%d", il); - offload_func_t offload_func = llama_nop; #ifdef GGML_USE_CUBLAS @@ -5641,7 +5572,7 @@ static struct ggml_cgraph * llama_build_graph( GGML_ASSERT(false); } - // set input data to the graph + // allocate memory and set the values for the input tensors of the graph // inp_tokens if (batch.token) { @@ -5655,7 +5586,10 @@ static struct ggml_cgraph * llama_build_graph( memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); } - } else { // inp_embd + } + + // inp_embd + if (batch.embd) { struct ggml_tensor * cur = ggml_graph_get_tensor(result, "inp_embd"); GGML_ASSERT(cur != nullptr); @@ -5775,6 +5709,124 @@ static struct ggml_cgraph * llama_build_graph( } } while (0); + // offload layers + + { + const int n_layer = model.hparams.n_layer; + + const int n_gpu_layers = model.n_gpu_layers; + const int i_gpu_start = n_layer - n_gpu_layers; + + GGML_UNUSED(i_gpu_start); + + // offload functions set the tensor output backend to GPU + // tensors are GPU-accelerated if any input or the output has been offloaded + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; + offload_func_t offload_func = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + } + + offload_func = ggml_cuda_assign_buffers_no_alloc; +#endif // GGML_USE_CUBLAS + + static const std::unordered_map k_offload_func = { + { "KQ_mask", offload_func_kq }, + { "KQ_pos", offload_func_kq }, + { "K_shift", offload_func_kq }, + { "K_shifted", offload_func_kq }, + + { "rms_norm_0", offload_func }, + { "attention_norm_0", offload_func }, + + { "tmpk", offload_func_kq }, + { "tmpq", offload_func_kq }, + { "tmpv", offload_func_v }, + { "Kcur", offload_func_kq }, + { "Qcur", offload_func_kq }, + { "Vcur", offload_func_v }, + + { "k", offload_func_kq }, + { "v", offload_func_v }, + + { "Q", offload_func_kq }, + { "K", offload_func_kq }, + { "KQ", offload_func_kq }, + { "KQ_scaled", offload_func_kq }, + { "KQ_scaled_alibi", offload_func_kq }, + { "KQ_masked", offload_func_kq }, + { "KQ_soft_max", offload_func_v }, + { "V", offload_func_v }, + { "KQV", offload_func_v }, + { "KQV_merged", offload_func_v }, + { "KQV_merged_contiguous", offload_func_v }, + + { "result_wo", offload_func }, + + { "inpFF", offload_func }, + + { "rms_norm_1", offload_func }, + { "ffn_norm", offload_func }, 
+ + { "result_w3", offload_func }, + { "result_w2", offload_func }, + { "result_w1", offload_func }, + { "silu", offload_func }, + { "silu_x_result_w3", offload_func }, + { "inpFF_+_result_w2", offload_func }, + + { "rms_norm_2", offload_func_nr }, + //{ "result_norm", offload_func_nr }, // TODO CPU + GPU mirrored backend + //{ "result_output", offload_func }, + }; + + static const std::unordered_map k_offload_func_name = { + { llama_nop, "CPU" }, +#ifdef GGML_USE_CUBLAS + { ggml_cuda_assign_buffers_no_alloc, "GPU (CUDA)" }, +#endif + }; + + std::unordered_map ofn; + + for (int i = 0; i < result->n_nodes; ++i) { + struct ggml_tensor * cur = result->nodes[i]; + + const std::string name = cur->name; + + if (k_offload_func.find(name) == k_offload_func.end()) { + if (worst_case && cur->view_src == nullptr) { + LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, + name.c_str(), "https://github.com/ggerganov/llama.cpp/pull/3837"); + } + continue; + } + + offload_func_t f = k_offload_func.at(name); + if (f == offload_func) { + if (ofn[name]++ < i_gpu_start) { + f = llama_nop; + } + } + + f(cur); + + if (worst_case && cur->view_src == nullptr) { + LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, name.c_str(), k_offload_func_name.at(f).c_str()); + } + } + } + return result; } From 83d2c43791dee310421e5f43beca6dbdbe97e544 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 28 Oct 2023 21:45:03 +0300 Subject: [PATCH 04/20] llama : offload rest of the models ggml-ci --- llama.cpp | 903 +++++++++++++++++++++--------------------------------- 1 file changed, 343 insertions(+), 560 deletions(-) diff --git a/llama.cpp b/llama.cpp index 22129ee43..a2baefd14 100644 --- a/llama.cpp +++ b/llama.cpp @@ -971,7 +971,7 @@ struct llama_mlock { typedef void (*offload_func_t)(struct ggml_tensor * tensor); -static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default +static void ggml_offload_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } @@ -3009,7 +3009,7 @@ static void llm_load_tensors( #ifdef GGML_USE_CUBLAS const int max_backend_supported_layers = hparams.n_layer + 3; const int max_offloadable_layers = hparams.n_layer + 3; -#elif defined(GGML_USE_CLBLAST) +#elif GGML_USE_CLBLAST const int max_backend_supported_layers = hparams.n_layer + 1; const int max_offloadable_layers = hparams.n_layer + 1; #endif // GGML_USE_CUBLAS @@ -3194,7 +3194,7 @@ static struct ggml_cgraph * llm_build_llama( // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - ggml_set_name(cur, "attention_norm_0"); + ggml_set_name(cur, "attn_norm_0"); } // self-attention @@ -3392,8 +3392,6 @@ static struct ggml_cgraph * llm_build_baichaun( const float freq_scale = cparams.rope_freq_scale; const float norm_rms_eps = hparams.f_norm_rms_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = worst_case ? n_ctx : kv_self.n; const int32_t kv_head = worst_case ? 
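// [editor's note] A condensed sketch of the name-based offloading that PATCH 03 above moves out
// of the build functions and into llama_build_graph(): once the graph is built, every node is
// matched by name against the k_offload_func table and the matching offload function is applied
// to set its backend. This restates the loop from the diff in compact form and is not verbatim;
// ofn counts how many times a repeating tensor name has been seen so that the first i_gpu_start
// layers stay on the CPU.
//
//     for (int i = 0; i < result->n_nodes; ++i) {
//         struct ggml_tensor * cur = result->nodes[i];
//
//         const std::string name = cur->name;
//
//         const auto it = k_offload_func.find(name);
//         if (it == k_offload_func.end()) {
//             continue; // tensors with unknown names are not offloaded
//         }
//
//         offload_func_t f = it->second;
//         if (f == offload_func && ofn[name]++ < i_gpu_start) {
//             f = llama_nop; // per-layer tensors below i_gpu_start remain on the CPU
//         }
//
//         f(cur); // e.g. ggml_cuda_assign_buffers_no_alloc when built with CUDA
//     }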
n_ctx - n_tokens : kv_self.head; @@ -3429,45 +3427,21 @@ static struct ggml_cgraph * llm_build_baichaun( } ggml_set_name(inpL, "inp_embd"); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); ggml_set_name(KQ_pos, "KQ_pos"); // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); ggml_set_name(K_shift, "K_shift"); for (int il = 0; il < n_layer; ++il) { @@ -3479,45 +3453,31 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(tmp); + ggml_set_name(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; // norm { cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); ggml_set_name(cur, "rms_norm_0"); // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); + ggml_set_name(cur, "attn_norm_0"); } // self-attention { // compute Q and K and RoPE them struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); ggml_set_name(tmpk, "tmpk"); struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); struct ggml_tensor * Kcur; @@ -3535,10 +3495,8 @@ static struct ggml_cgraph * llm_build_baichaun( GGML_ASSERT(false); } - offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); // store key and value to memory @@ -3546,21 +3504,17 @@ static struct ggml_cgraph * llm_build_baichaun( // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); ggml_set_name(tmpv, "tmpv"); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, 
kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); ggml_set_name(v, "v"); // important: storing RoPE-ed version of K in the KV cache! @@ -3569,7 +3523,6 @@ static struct ggml_cgraph * llm_build_baichaun( } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); ggml_set_name(Q, "Q"); struct ggml_tensor * K = @@ -3578,18 +3531,15 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); ggml_set_name(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked; @@ -3611,7 +3561,6 @@ static struct ggml_cgraph * llm_build_baichaun( // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads @@ -3621,33 +3570,27 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); ggml_set_name(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); ggml_set_name(KQV, "KQV"); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); ggml_set_name(cur, "result_wo"); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); ggml_set_name(inpFF, "inpFF"); // feed-forward network @@ -3655,45 +3598,37 @@ static struct ggml_cgraph * llm_build_baichaun( // norm { cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); ggml_set_name(cur, "rms_norm_1"); // cur = cur*ffn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); ggml_set_name(cur, "ffn_norm"); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(tmp); ggml_set_name(tmp, "result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); - offload_func(cur); ggml_set_name(cur, "result_w1"); // SILU activation cur = ggml_silu(ctx0, cur); - offload_func(cur); ggml_set_name(cur, "silu"); cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); ggml_set_name(cur, "silu_x_result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); ggml_set_name(cur, "result_w2"); } cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); ggml_set_name(cur, "inpFF_+_result_w2"); // input for next layer @@ -3705,12 +3640,10 @@ 
static struct ggml_cgraph * llm_build_baichaun( // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); ggml_set_name(cur, "rms_norm_2"); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend ggml_set_name(cur, "result_norm"); } @@ -3747,8 +3680,6 @@ static struct ggml_cgraph * llm_build_refact( const float norm_rms_eps = hparams.f_norm_rms_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = worst_case ? n_ctx : kv_self.n; const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; @@ -3784,77 +3715,40 @@ static struct ggml_cgraph * llm_build_refact( } ggml_set_name(inpL, "inp_embd"); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; // norm { cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); ggml_set_name(cur, "rms_norm_0"); // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); + ggml_set_name(cur, "attn_norm_0"); } // self-attention { // compute Q and K struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); ggml_set_name(tmpk, "tmpk"); struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); - offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); // store key and value to memory @@ -3862,21 +3756,17 @@ static struct ggml_cgraph * llm_build_refact( // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); ggml_set_name(tmpv, "tmpv"); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, 
(ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); ggml_set_name(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -3884,7 +3774,6 @@ static struct ggml_cgraph * llm_build_refact( } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); ggml_set_name(Q, "Q"); struct ggml_tensor * K = @@ -3893,18 +3782,15 @@ static struct ggml_cgraph * llm_build_refact( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); ggml_set_name(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_kv, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) @@ -3912,12 +3798,10 @@ static struct ggml_cgraph * llm_build_refact( ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads @@ -3927,41 +3811,27 @@ static struct ggml_cgraph * llm_build_refact( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); ggml_set_name(V, "V"); -#if 1 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? 
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); ggml_set_name(cur, "result_wo"); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); ggml_set_name(inpFF, "inpFF"); // feed-forward network @@ -3969,45 +3839,37 @@ static struct ggml_cgraph * llm_build_refact( // norm { cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); ggml_set_name(cur, "rms_norm_1"); // cur = cur*ffn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); ggml_set_name(cur, "ffn_norm"); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(tmp); ggml_set_name(tmp, "result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); - offload_func(cur); ggml_set_name(cur, "result_w1"); // SILU activation cur = ggml_silu(ctx0, cur); - offload_func(cur); ggml_set_name(cur, "silu"); cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); ggml_set_name(cur, "silu_x_result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); ggml_set_name(cur, "result_w2"); } cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); ggml_set_name(cur, "inpFF_+_result_w2"); // input for next layer @@ -4019,12 +3881,10 @@ static struct ggml_cgraph * llm_build_refact( // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); ggml_set_name(cur, "rms_norm_2"); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend ggml_set_name(cur, "result_norm"); } @@ -4065,8 +3925,6 @@ static struct ggml_cgraph * llm_build_falcon( const float freq_scale = cparams.rope_freq_scale; const float norm_eps = hparams.f_norm_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = worst_case ? n_ctx : kv_self.n; const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; @@ -4105,45 +3963,21 @@ static struct ggml_cgraph * llm_build_falcon( } ggml_set_name(inpL, "inp_embd"); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); // KQ_pos - contains the positions struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); ggml_set_name(KQ_pos, "KQ_pos"); // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); ggml_set_name(K_shift, "K_shift"); for (int il = 0; il < n_layer; ++il) { @@ -4155,7 +3989,7 @@ static struct ggml_cgraph * llm_build_falcon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(tmp); + ggml_set_name(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } @@ -4163,35 +3997,27 @@ static struct ggml_cgraph * llm_build_falcon( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - // self-attention // TODO: refactor into common function (shared with LLaMA) { attn_norm = ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); + ggml_set_name(attn_norm, "attn_norm_0"); - attn_norm = ggml_add(ctx0, - ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm), - model.layers[il].attn_norm_b); - offload_func(attn_norm->src[0]); - offload_func(attn_norm); + attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); + ggml_set_name(attn_norm, "attn_norm_0_w"); + + attn_norm = ggml_add(ctx0, attn_norm, model.layers[il].attn_norm_b); + ggml_set_name(attn_norm, "attn_norm_0_wb"); if (model.layers[il].attn_norm_2) { // Falcon-40B cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); + ggml_set_name(cur, "attn_norm_2"); - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].attn_norm_2), - model.layers[il].attn_norm_2_b); - offload_func(cur->src[0]); - offload_func(cur); + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm_2); + ggml_set_name(cur, "attn_norm_2_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_2_b); + ggml_set_name(cur, "attn_norm_2_wb"); } else { // Falcon 7B cur = attn_norm; } @@ -4199,7 +4025,7 @@ static struct ggml_cgraph * llm_build_falcon( // compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - 
offload_func_kq(cur); + ggml_set_name(cur, "wqkv"); // Note that the strides for Kcur, Vcur are set up so that the // resulting views are misaligned with the tensor's storage @@ -4219,49 +4045,49 @@ static struct ggml_cgraph * llm_build_falcon( wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0)); - offload_func_kq(tmpq); + ggml_set_name(tmpq, "tmpq"); struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head)); - offload_func_kq(tmpk); + ggml_set_name(tmpk, "tmpk"); struct ggml_tensor * tmpv = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_v(tmpv); + ggml_set_name(tmpv, "tmpv"); // using mode = 2 for neox mode struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Qcur); + ggml_set_name(Qcur, "Qcur"); + struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Kcur); + ggml_set_name(Kcur, "Kcur"); { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); - ggml_set_name(Vcur, "Vcur"); + struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); + ggml_set_name(Vcur, "Vcur_0"); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); + ggml_set_name(Vcur, "Vcur_1"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); + ggml_set_name(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); ggml_set_name(Q, "Q"); struct ggml_tensor * K = @@ -4270,23 +4096,18 @@ static struct ggml_cgraph * llm_build_falcon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); ggml_set_name(K, "K"); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); struct ggml_tensor * V = @@ -4295,23 +4116,18 @@ static struct ggml_cgraph * llm_build_falcon( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); ggml_set_name(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); ggml_set_name(KQV, "KQV"); struct ggml_tensor * KQV_merged = 
ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); ggml_set_name(cur, "result_wo"); } @@ -4322,18 +4138,20 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * inpFF = attn_norm; cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - offload_func(cur); + ggml_set_name(cur, "result_w3"); cur = ggml_gelu(ctx0, cur); - offload_func(cur); + ggml_set_name(cur, "gelu"); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); + ggml_set_name(cur, "result_w2"); } cur = ggml_add(ctx0, cur, attn_out); - offload_func(cur); + ggml_set_name(cur, "inpFF_+_result_w2"); + cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); + ggml_set_name(cur, "inpL_+_inpFF_+_result_w2"); // input for next layer inpL = cur; @@ -4344,11 +4162,12 @@ static struct ggml_cgraph * llm_build_falcon( // norm { cur = ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); + ggml_set_name(cur, "out_norm_0"); - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.output_norm), - model.output_norm_b); + cur = ggml_mul(ctx0, cur, model.output_norm); + ggml_set_name(cur, "out_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.output_norm_b); ggml_set_name(cur, "result_norm"); } @@ -4386,8 +4205,6 @@ static struct ggml_cgraph * llm_build_starcoder( const float norm_eps = hparams.f_norm_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = worst_case ? n_ctx : kv_self.n; const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; @@ -4423,27 +4240,6 @@ static struct ggml_cgraph * llm_build_starcoder( } ggml_set_name(embd, "inp_embd"); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - { struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_set_name(inp_pos, "inp_pos"); @@ -4458,36 +4254,30 @@ static struct ggml_cgraph * llm_build_starcoder( // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); ggml_set_name(KQ_mask, "KQ_mask"); - offload_func_kq(KQ_mask); inpL = ggml_add(ctx0, embd, pos); ggml_set_name(inpL, "inpL"); for (int il = 0; il < n_layer; ++il) { - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - { // Norm cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); + ggml_set_name(cur, "attn_norm_0"); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); - offload_func(cur); + cur = ggml_mul(ctx0, cur, 
model.layers[il].attn_norm); + ggml_set_name(cur, "attn_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); + ggml_set_name(cur, "attn_norm_0_wb"); } { // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + ggml_set_name(cur, "wqkv"); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - offload_func_kq(cur); + ggml_set_name(cur, "bqkv"); struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); @@ -4497,26 +4287,19 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_set_name(tmpk, "tmpk"); ggml_set_name(tmpv, "tmpv"); - offload_func_kq(tmpq); - offload_func_kq(tmpk); - offload_func_v (tmpv); - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); struct ggml_tensor * Kcur = tmpk; { struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv); - offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); ggml_set_name(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -4524,7 +4307,6 @@ static struct ggml_cgraph * llm_build_starcoder( } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); ggml_set_name(Q, "Q"); struct ggml_tensor * K = @@ -4533,28 +4315,23 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); ggml_set_name(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads @@ -4567,25 +4344,22 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_set_name(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); ggml_set_name(KQV, "KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); } // Projection cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - offload_func(cur); + ggml_set_name(cur, "result_wo"); // Add the input cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); + ggml_set_name(cur, "inpL_+_result_wo"); struct 
ggml_tensor * inpFF = cur; @@ -4594,22 +4368,28 @@ static struct ggml_cgraph * llm_build_starcoder( // Norm { cur = ggml_norm(ctx0, inpFF, norm_eps); - offload_func_nr(cur); + ggml_set_name(cur, "ffn_norm_0"); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); - offload_func_nr(cur); + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + ggml_set_name(cur, "ffn_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); + ggml_set_name(cur, "ffn_norm_0_wb"); } cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); - offload_func(cur); + ggml_set_name(cur, "result_w3"); // GELU activation cur = ggml_gelu(ctx0, cur); - offload_func(cur); + ggml_set_name(cur, "gelu"); // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); - offload_func(cur); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); + ggml_set_name(cur, "result_w2"); + + cur = ggml_add(ctx0, cur, model.layers[il].b2); + ggml_set_name(cur, "result_w2_b"); } inpL = ggml_add(ctx0, cur, inpFF); @@ -4619,9 +4399,12 @@ static struct ggml_cgraph * llm_build_starcoder( // Output Norm { cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func_nr(cur); + ggml_set_name(cur, "out_norm_0"); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); + cur = ggml_mul(ctx0, cur, model.output_norm); + ggml_set_name(cur, "out_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.output_norm_b); ggml_set_name(cur, "result_norm"); } @@ -4659,9 +4442,6 @@ static struct ggml_cgraph * llm_build_persimmon( const float freq_scale = cparams.rope_freq_scale; const float norm_eps = hparams.f_norm_eps; - const int n_gpu_layers = model.n_gpu_layers; - - const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = worst_case ? n_ctx : kv_self.n; const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; @@ -4689,65 +4469,23 @@ static struct ggml_cgraph * llm_build_persimmon( inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + ggml_set_name(inpL, "inp_embd"); + // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } ggml_set_name(KQ_scale, "KQ_scale"); - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + ggml_set_name(KQ_mask, "KQ_mask"); struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } + if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions.
@@ -4759,65 +4497,76 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il) ), K_shift, n_rot, 2, 0, freq_base, freq_scale); - offload_func_kq(tmp); + ggml_set_name(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } - for (int il=0; il < n_layer; ++il) { + + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * residual = inpL; - offload_func_t offload_func = llama_nop; + { cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); + ggml_set_name(cur, "attn_norm_0"); + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); + ggml_set_name(cur, "attn_norm_0_w"); + cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - offload_func(cur); - ggml_format_name(cur, "input_layernorm_%d", il); + ggml_set_name(cur, "attn_norm_0_wb"); } + // self attention { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + ggml_set_name(cur, "wqkv"); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - offload_func_kq(cur); + ggml_set_name(cur, "bqkv"); // split qkv GGML_ASSERT(n_head_kv == n_head); - ggml_set_name(cur, format("qkv_%d", il).c_str()); + struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); - offload_func_kq(tmpqkv); + ggml_set_name(tmpqkv, "tmpqkv"); + struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); - offload_func_kq(tmpqkv_perm); - ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il); + ggml_set_name(tmpqkv_perm, "tmpqkv"); + struct ggml_tensor * tmpq = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, ggml_element_size(tmpqkv_perm) * n_embd_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, 0 ); - offload_func_kq(tmpq); + ggml_set_name(tmpq, "tmpq"); + struct ggml_tensor * tmpk = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, ggml_element_size(tmpqkv_perm) * n_embd_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens ); - offload_func_kq(tmpk); + ggml_set_name(tmpk, "tmpk"); + // Q/K Layernorm tmpq = ggml_norm(ctx0, tmpq, norm_eps); - offload_func_kq(tmpq); - tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); - offload_func_kq(tmpq); - tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - offload_func_kq(tmpq); + ggml_set_name(tmpq, "tmpq"); + + tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); + ggml_set_name(tmpq, "tmpq"); + + tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); + ggml_set_name(tmpq, "tmpq"); tmpk = ggml_norm(ctx0, tmpk, norm_eps); - offload_func_v(tmpk); + ggml_set_name(tmpk, "tmpk"); + tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - offload_func_v(tmpk); + ggml_set_name(tmpk, "tmpk"); + tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); - offload_func_v(tmpk); + ggml_set_name(tmpk, "tmpk"); // RoPE the first n_rot of q/k, pass the other half, and concat. 
struct ggml_tensor * qrot = ggml_view_3d( @@ -4826,16 +4575,15 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpq) * n_embd_head * n_head, 0 ); - offload_func_kq(qrot); - ggml_format_name(qrot, "qrot_%d", il); + ggml_set_name(qrot, "qrot"); + struct ggml_tensor * krot = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, 0 ); - offload_func_kq(krot); - ggml_format_name(krot, "krot_%d", il); + ggml_set_name(krot, "krot"); // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( @@ -4844,47 +4592,52 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpq) * n_embd_head * n_head, ggml_element_size(tmpq) * n_rot ); - offload_func_kq(qpass); - ggml_format_name(qpass, "qpass_%d", il); + ggml_set_name(qpass, "qpass"); + struct ggml_tensor * kpass = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, ggml_element_size(tmpk) * n_rot ); - offload_func_kq(kpass); - ggml_format_name(kpass, "kpass_%d", il); + ggml_set_name(kpass, "kpass"); struct ggml_tensor * qrotated = ggml_rope_custom( ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale ); - offload_func_kq(qrotated); + ggml_set_name(qrotated, "qrotated"); + struct ggml_tensor * krotated = ggml_rope_custom( ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale ); - offload_func_kq(krotated); + ggml_set_name(krotated, "krotated"); + // ggml currently only supports concatenation on dim=2 // so we need to permute qrot, qpass, concat, then permute back. qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); - offload_func_kq(qrotated); + ggml_set_name(qrotated, "qrotated"); + krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3)); - offload_func_kq(krotated); + ggml_set_name(krotated, "krotated"); qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); - offload_func_kq(qpass); + ggml_set_name(qpass, "qpass"); + kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); - offload_func_kq(kpass); + ggml_set_name(kpass, "kpass"); struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass); - offload_func_kq(Qcur); + ggml_set_name(Qcur, "Qcur"); + struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass); - offload_func_kq(Kcur); + ggml_set_name(Kcur, "Kcur"); struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); - offload_func_kq(Q); + ggml_set_name(Q, "Q"); Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); - offload_func_kq(Kcur); + ggml_set_name(Kcur, "Kcur"); + { struct ggml_tensor * tmpv = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, @@ -4892,23 +4645,21 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 ); - offload_func_v(tmpv); + ggml_set_name(tmpv, "tmpv"); + // store K, V in cache struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d( ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) ); - offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), 
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); ggml_set_name(v, "v"); // important: storing RoPE-ed version of K in the KV cache! @@ -4920,24 +4671,18 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - - offload_func_kq(K); - ggml_format_name(K, "K_%d", il); + ggml_set_name(K, "K"); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_kq(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); struct ggml_tensor * V = @@ -4946,85 +4691,86 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); ggml_set_name(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); ggml_set_name(KQV, "KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - cur = ggml_add(ctx0, cur, model.layers[il].bo); - offload_func(cur); ggml_set_name(cur, "result_wo"); + + cur = ggml_add(ctx0, cur, model.layers[il].bo); + ggml_set_name(cur, "result_wo_b"); } struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); - offload_func(inpFF); ggml_set_name(inpFF, "inpFF"); + { // MLP { // Norm cur = ggml_norm(ctx0, inpFF, norm_eps); - offload_func(cur); - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].ffn_norm), - model.layers[il].ffn_norm_b - ); - ggml_set_name(cur, "ffn_norm"); - offload_func(cur); + ggml_set_name(cur, "ffn_norm_0"); + + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + ggml_set_name(cur, "ffn_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); + ggml_set_name(cur, "ffn_norm_0_wb"); } + cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(cur); + ggml_set_name(cur, "result_w3"); cur = ggml_add(ctx0, cur, model.layers[il].b3); - offload_func(cur); - ggml_set_name(cur, "result_ffn_up"); + ggml_set_name(cur, "result_w3_b"); - cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur)); - ggml_set_name(cur, "result_ffn_act"); - offload_func(cur); - offload_func(cur->src[0]); + cur = ggml_relu(ctx0, cur); + ggml_set_name(cur, "relu"); + + cur = ggml_sqr(ctx0, cur); + ggml_set_name(cur, "sqr(relu)"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); - cur = ggml_add(ctx0, - cur, - model.layers[il].b2); - offload_func(cur); - ggml_set_name(cur, "outFF"); + ggml_set_name(cur, "result_w2"); + + cur = ggml_add(ctx0, cur, model.layers[il].b2); + ggml_set_name(cur, "result_w2_b"); } + cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_outFF"); + ggml_set_name(cur, "inpFF_+_result_w2"); + inpL = cur; } + cur = inpL; + { cur = ggml_norm(ctx0, cur, 
norm_eps); - offload_func_nr(cur); + ggml_set_name(cur, "out_norm_0"); + cur = ggml_mul(ctx0, cur, model.output_norm); - offload_func_nr(cur); + ggml_set_name(cur, "out_norm_0_w"); cur = ggml_add(ctx0, cur, model.output_norm_b); - // offload_func_nr(cur); - ggml_set_name(cur, "result_norm"); } + cur = ggml_mul_mat(ctx0, model.output, cur); ggml_set_name(cur, "result_output"); + ggml_build_forward_expand(gf, cur); + ggml_free(ctx0); + return gf; } @@ -5086,7 +4832,7 @@ static struct ggml_cgraph * llm_build_bloom( embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - ggml_set_name(embd, "embd"); + ggml_set_name(embd, "inp_embd"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); @@ -5099,27 +4845,45 @@ static struct ggml_cgraph * llm_build_bloom( // norm { inpL = ggml_norm(ctx0, embd, norm_eps); - inpL = ggml_add (ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b); - } + ggml_set_name(inpL, "inp_norm"); - ggml_set_name(inpL, "inpL"); + inpL = ggml_mul(ctx0, inpL, model.tok_norm); + ggml_set_name(inpL, "inp_norm_w"); + + inpL = ggml_add (ctx0, inpL, model.tok_norm_b); + ggml_set_name(inpL, "inp_norm_wb"); + } for (int il = 0; il < n_layer; ++il) { { // Norm cur = ggml_norm(ctx0, inpL, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); + ggml_set_name(cur, "attn_norm_0"); + + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); + ggml_set_name(cur, "attn_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); + ggml_set_name(cur, "attn_norm_0_wb"); } { // Self Attention - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + ggml_set_name(cur, "wqkv"); - struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)); - struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)); - struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + ggml_set_name(cur, "bqkv"); - struct ggml_tensor * Qcur = tmpq; + struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + ggml_set_name(tmpq, "tmpq"); + ggml_set_name(tmpk, "tmpk"); + ggml_set_name(tmpv, "tmpv"); + + struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); struct ggml_tensor * Kcur = tmpk; // store key and value to memory @@ -5133,6 +4897,7 @@ static struct ggml_cgraph * llm_build_bloom( struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + ggml_set_name(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); @@ -5196,10 +4961,15 @@ static struct ggml_cgraph * llm_build_bloom( } // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); + cur = 
ggml_mul_mat(ctx0, model.layers[il].wo, cur); + ggml_set_name(cur, "result_wo"); + + cur = ggml_add(ctx0, cur, model.layers[il].bo); + ggml_set_name(cur, "result_wo_b"); // Add the input cur = ggml_add(ctx0, cur, inpL); + ggml_set_name(cur, "inpL_+_result_wo"); struct ggml_tensor * inpFF = cur; @@ -5208,27 +4978,46 @@ static struct ggml_cgraph * llm_build_bloom( // Norm { cur = ggml_norm(ctx0, inpFF, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); + ggml_set_name(cur, "ffn_norm_0"); + + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + ggml_set_name(cur, "ffn_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); + ggml_set_name(cur, "ffn_norm_0_wb"); } - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); + cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); + ggml_set_name(cur, "result_w3"); + + cur = ggml_add(ctx0, cur, model.layers[il].b3); + ggml_set_name(cur, "result_w3_b"); - // GELU activation cur = ggml_gelu(ctx0, cur); + ggml_set_name(cur, "gelu"); - // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); + ggml_set_name(cur, "result_w2"); + + cur = ggml_add(ctx0, cur, model.layers[il].b2); + ggml_set_name(cur, "result_w2_b"); } inpL = ggml_add(ctx0, cur, inpFF); + ggml_set_name(inpL, "inpFF_+_result_w2"); } // Output Norm { cur = ggml_norm(ctx0, inpL, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); + ggml_set_name(cur, "out_norm_0"); + + cur = ggml_mul(ctx0, cur, model.output_norm); + ggml_set_name(cur, "out_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.output_norm_b); + ggml_set_name(cur, "result_norm"); } - ggml_set_name(cur, "result_norm"); cur = ggml_mul_mat(ctx0, model.output, cur); ggml_set_name(cur, "result_output"); @@ -5264,8 +5053,6 @@ static struct ggml_cgraph * llm_build_mpt( const float clamp_kqv = hparams.f_clamp_kqv; const float max_alibi_bias = hparams.f_max_alibi_bias; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; const int32_t n_kv = worst_case ? n_ctx : kv_self.n; const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; @@ -5287,7 +5074,6 @@ static struct ggml_cgraph * llm_build_mpt( struct ggml_tensor * cur; struct ggml_tensor * inpL; - //int warmup = 0; if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); ggml_set_name(inp_tokens, "inp_tokens"); @@ -5302,55 +5088,25 @@ static struct ggml_cgraph * llm_build_mpt( } ggml_set_name(inpL, "inp_embd"); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); ggml_set_name(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); ggml_set_name(KQ_mask, "KQ_mask"); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - // self-attention // TODO: refactor into common function (shared with LLaMA) { attn_norm = ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); + ggml_set_name(attn_norm, "attn_norm_0"); attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - offload_func(attn_norm); + ggml_set_name(attn_norm, "attn_norm_0_w"); if (1) { cur = attn_norm; @@ -5359,11 +5115,11 @@ static struct ggml_cgraph * llm_build_mpt( // compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + ggml_set_name(cur, "wqkv"); if (clamp_kqv > 0.0f) { cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv); - offload_func_kq(cur); + ggml_set_name(cur, "wqkv_clamped"); } const size_t wsize = ggml_type_size(cur->type); @@ -5373,46 +5129,42 @@ static struct ggml_cgraph * llm_build_mpt( wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0); - offload_func_kq(Qcur); + ggml_set_name(Qcur, "Qcur"); struct ggml_tensor * Kcur = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head); - offload_func_kq(Kcur); + ggml_set_name(Kcur, "Kcur"); struct ggml_tensor * tmpv = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_kq(Kcur); - - ggml_set_name(Qcur, "Qcur"); - ggml_set_name(Kcur, "Kcur"); + ggml_set_name(tmpv, "tmpv"); { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); + struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); + ggml_set_name(Vcur, "Vcur"); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, 
n_embd_gqa, n_tokens)); ggml_set_name(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); ggml_set_name(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); + ggml_set_name(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); ggml_set_name(Q, "Q"); struct ggml_tensor * K = @@ -5421,29 +5173,22 @@ static struct ggml_cgraph * llm_build_mpt( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); ggml_set_name(K, "K"); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); ggml_set_name(KQ_scaled, "KQ_scaled"); // TODO: replace with ggml_add() - struct ggml_tensor * KQ_scaled_alibi = - ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); - offload_func_kq(KQ_scaled_alibi); + struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - offload_func_kq(KQ_masked); ggml_set_name(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); struct ggml_tensor * V = @@ -5452,29 +5197,24 @@ static struct ggml_cgraph * llm_build_mpt( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); ggml_set_name(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); ggml_set_name(KQV, "KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); ggml_set_name(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); ggml_set_name(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); ggml_set_name(cur, "result_wo"); } // Add the input cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); + ggml_set_name(cur, "inpL_+_result_wo"); struct ggml_tensor * attn_out = cur; @@ -5483,23 +5223,25 @@ static struct ggml_cgraph * llm_build_mpt( // Norm { cur = ggml_norm(ctx0, attn_out, norm_eps); - offload_func(cur); + ggml_set_name(cur, "ffn_norm_0"); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); + ggml_set_name(cur, "ffn_norm_0_w"); } cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(cur); + ggml_set_name(cur, "result_w3"); cur = ggml_gelu(ctx0, cur); - offload_func(cur); + ggml_set_name(cur, "gelu"); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); + ggml_set_name(cur, "result_w2"); } cur = ggml_add(ctx0, cur, attn_out); - offload_func(cur); + ggml_set_name(cur, "inpL_+_inpFF_+_result_w2"); + // input for next layer inpL = cur; } @@ -5509,7 +5251,7 @@ static struct ggml_cgraph * llm_build_mpt( // norm { cur = 
ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); + ggml_set_name(cur, "out_norm_0"); cur = ggml_mul(ctx0, cur, model.output_norm); ggml_set_name(cur, "result_norm"); @@ -5721,10 +5463,10 @@ static struct ggml_cgraph * llama_build_graph( // offload functions set the tensor output backend to GPU // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - offload_func_t offload_func = llama_nop; + offload_func_t offload_func_nr = ggml_offload_nop; // nr = non-repeating + offload_func_t offload_func_kq = ggml_offload_nop; + offload_func_t offload_func_v = ggml_offload_nop; + offload_func_t offload_func = ggml_offload_nop; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer) { @@ -5741,57 +5483,98 @@ static struct ggml_cgraph * llama_build_graph( #endif // GGML_USE_CUBLAS static const std::unordered_map k_offload_func = { - { "KQ_mask", offload_func_kq }, - { "KQ_pos", offload_func_kq }, - { "K_shift", offload_func_kq }, - { "K_shifted", offload_func_kq }, + { "KQ_mask", offload_func_kq }, + { "KQ_pos", offload_func_kq }, + { "K_shift", offload_func_kq }, + { "K_shifted", offload_func_kq }, - { "rms_norm_0", offload_func }, - { "attention_norm_0", offload_func }, + { "inp_norm", offload_func_nr }, + { "inp_norm_w", offload_func_nr }, + { "inp_norm_wb", offload_func_nr }, - { "tmpk", offload_func_kq }, - { "tmpq", offload_func_kq }, - { "tmpv", offload_func_v }, - { "Kcur", offload_func_kq }, - { "Qcur", offload_func_kq }, - { "Vcur", offload_func_v }, + { "rms_norm_0", offload_func }, - { "k", offload_func_kq }, - { "v", offload_func_v }, + { "attn_norm_0", offload_func }, + { "attn_norm_0_w", offload_func }, + { "attn_norm_0_wb", offload_func }, - { "Q", offload_func_kq }, - { "K", offload_func_kq }, - { "KQ", offload_func_kq }, - { "KQ_scaled", offload_func_kq }, - { "KQ_scaled_alibi", offload_func_kq }, - { "KQ_masked", offload_func_kq }, - { "KQ_soft_max", offload_func_v }, - { "V", offload_func_v }, - { "KQV", offload_func_v }, - { "KQV_merged", offload_func_v }, - { "KQV_merged_contiguous", offload_func_v }, + { "attn_norm_2", offload_func }, + { "attn_norm_2_w", offload_func }, + { "attn_norm_2_wb", offload_func }, - { "result_wo", offload_func }, + { "wqkv", offload_func_kq }, + { "bqkv", offload_func_kq }, + { "wqkv_clamped", offload_func_kq }, - { "inpFF", offload_func }, + { "tmpk", offload_func_kq }, + { "tmpq", offload_func_kq }, + { "tmpv", offload_func_v }, + { "tmpkqv", offload_func_kq }, // ?? 
+ { "Kcur", offload_func_kq }, + { "Qcur", offload_func_kq }, + { "Vcur", offload_func_v }, + { "Vcur_0", offload_func_v }, + { "Vcur_1", offload_func_v }, - { "rms_norm_1", offload_func }, - { "ffn_norm", offload_func }, + { "krot", offload_func_kq }, + { "qrot", offload_func_kq }, + { "kpass", offload_func_kq }, + { "qpass", offload_func_kq }, + { "krotated", offload_func_kq }, + { "qrotated", offload_func_kq }, - { "result_w3", offload_func }, - { "result_w2", offload_func }, - { "result_w1", offload_func }, - { "silu", offload_func }, - { "silu_x_result_w3", offload_func }, - { "inpFF_+_result_w2", offload_func }, + { "k", offload_func_kq }, + { "v", offload_func_v }, - { "rms_norm_2", offload_func_nr }, - //{ "result_norm", offload_func_nr }, // TODO CPU + GPU mirrored backend - //{ "result_output", offload_func }, + { "Q", offload_func_kq }, + { "K", offload_func_kq }, + { "KQ", offload_func_kq }, + { "KQ_scaled", offload_func_kq }, + { "KQ_scaled_alibi", offload_func_kq }, + { "KQ_masked", offload_func_kq }, + { "KQ_soft_max", offload_func_v }, + { "V", offload_func_v }, + { "KQV", offload_func_v }, + { "KQV_merged", offload_func_v }, + { "KQV_merged_contiguous", offload_func_v }, + + { "result_wo", offload_func }, + { "result_wo_b", offload_func }, + { "inpL_+_result_wo", offload_func }, + + { "inpFF", offload_func }, + + { "rms_norm_1", offload_func }, + { "ffn_norm", offload_func }, + { "ffn_norm_0", offload_func }, + { "ffn_norm_0_w", offload_func }, + { "ffn_norm_0_wb", offload_func }, + + { "result_w3", offload_func }, + { "result_w3_b", offload_func }, + { "result_w2", offload_func }, + { "result_w2_b", offload_func }, + { "result_w1", offload_func }, + + { "silu", offload_func }, + { "gelu", offload_func }, + { "relu", offload_func }, + { "sqr(relu)", offload_func }, + + { "silu_x_result_w3", offload_func }, + { "inpFF_+_result_w2", offload_func }, + { "inpL_+_inpFF_+_result_w2", offload_func }, + + { "rms_norm_2", offload_func_nr }, + { "out_norm_0", offload_func_nr }, + { "out_norm_0_w", offload_func_nr }, + + //{ "result_norm", offload_func_nr }, // TODO CPU + GPU mirrored backend + //{ "result_output", offload_func }, }; static const std::unordered_map k_offload_func_name = { - { llama_nop, "CPU" }, + { ggml_offload_nop, "CPU" }, #ifdef GGML_USE_CUBLAS { ggml_cuda_assign_buffers_no_alloc, "GPU (CUDA)" }, #endif @@ -5815,7 +5598,7 @@ static struct ggml_cgraph * llama_build_graph( offload_func_t f = k_offload_func.at(name); if (f == offload_func) { if (ofn[name]++ < i_gpu_start) { - f = llama_nop; + f = ggml_offload_nop; } } @@ -8609,8 +8392,8 @@ static int llama_apply_lora_from_file_internal( ggml_tensor * dest_t = model_tensors[base_name]; - offload_func_t offload_func = llama_nop; - offload_func_t offload_func_force_inplace = llama_nop; + offload_func_t offload_func = ggml_offload_nop; + offload_func_t offload_func_force_inplace = ggml_offload_nop; #ifdef GGML_USE_CUBLAS if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { From 3af8771389195a7db497ae5c3fe8243f183c002d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 28 Oct 2023 22:36:44 +0300 Subject: [PATCH 05/20] llama : update offload log messages to print node index --- llama.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index a2baefd14..cc7eb0a5a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5589,8 +5589,8 @@ static struct ggml_cgraph * llama_build_graph( if (k_offload_func.find(name) == k_offload_func.end()) { if 
(worst_case && cur->view_src == nullptr) { - LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, - name.c_str(), "https://github.com/ggerganov/llama.cpp/pull/3837"); + LLAMA_LOG_WARN("%s: node %4d %32s: not offloaded (ref: %s)\n", __func__, + i, name.c_str(), "https://github.com/ggerganov/llama.cpp/pull/3837"); } continue; } @@ -5605,7 +5605,7 @@ static struct ggml_cgraph * llama_build_graph( f(cur); if (worst_case && cur->view_src == nullptr) { - LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, name.c_str(), k_offload_func_name.at(f).c_str()); + LLAMA_LOG_INFO("%s: node %4d %32s: %s\n", __func__, i, name.c_str(), k_offload_func_name.at(f).c_str()); } } } From 51c4f9ee9f4f2e3e68f4f379bb1bc91959815555 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 28 Oct 2023 22:50:08 +0300 Subject: [PATCH 06/20] llama : comments --- llama.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index cc7eb0a5a..b3d84c57d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5452,15 +5452,12 @@ static struct ggml_cgraph * llama_build_graph( } while (0); // offload layers - { const int n_layer = model.hparams.n_layer; const int n_gpu_layers = model.n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers; - GGML_UNUSED(i_gpu_start); - // offload functions set the tensor output backend to GPU // tensors are GPU-accelerated if any input or the output has been offloaded offload_func_t offload_func_nr = ggml_offload_nop; // nr = non-repeating @@ -5588,13 +5585,16 @@ static struct ggml_cgraph * llama_build_graph( const std::string name = cur->name; if (k_offload_func.find(name) == k_offload_func.end()) { + // if a tensor that is not view hasn't been offloaded, we warn the user if (worst_case && cur->view_src == nullptr) { LLAMA_LOG_WARN("%s: node %4d %32s: not offloaded (ref: %s)\n", __func__, i, name.c_str(), "https://github.com/ggerganov/llama.cpp/pull/3837"); } + continue; } + // count the number of layers and respect the provided n_gpu_layers offload_func_t f = k_offload_func.at(name); if (f == offload_func) { if (ofn[name]++ < i_gpu_start) { @@ -5602,6 +5602,7 @@ static struct ggml_cgraph * llama_build_graph( } } + // apply offload function to the tensor f(cur); if (worst_case && cur->view_src == nullptr) { From 4e98897ede5e8adcbdffc6fb629a11e8a0acc745 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 07:36:07 +0200 Subject: [PATCH 07/20] llama : support offloading result_norm + comments --- llama.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index b3d84c57d..5ce5840a3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5452,12 +5452,16 @@ static struct ggml_cgraph * llama_build_graph( } while (0); // offload layers + // TODO: this code will be obsoleted with backend v2 { const int n_layer = model.hparams.n_layer; const int n_gpu_layers = model.n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers; + // should we offload the final norm? 
yes if we are not computing embeddings + const bool off_res_norm = lctx.embedding.empty(); + // offload functions set the tensor output backend to GPU // tensors are GPU-accelerated if any input or the output has been offloaded offload_func_t offload_func_nr = ggml_offload_nop; // nr = non-repeating @@ -5566,7 +5570,7 @@ static struct ggml_cgraph * llama_build_graph( { "out_norm_0", offload_func_nr }, { "out_norm_0_w", offload_func_nr }, - //{ "result_norm", offload_func_nr }, // TODO CPU + GPU mirrored backend + { "result_norm", off_res_norm ? offload_func_nr : ggml_offload_nop }, //{ "result_output", offload_func }, }; @@ -5584,7 +5588,8 @@ static struct ggml_cgraph * llama_build_graph( const std::string name = cur->name; - if (k_offload_func.find(name) == k_offload_func.end()) { + const auto it = k_offload_func.find(name); + if (it == k_offload_func.end()) { // if a tensor that is not view hasn't been offloaded, we warn the user if (worst_case && cur->view_src == nullptr) { LLAMA_LOG_WARN("%s: node %4d %32s: not offloaded (ref: %s)\n", __func__, @@ -5595,7 +5600,7 @@ } // count the number of layers and respect the provided n_gpu_layers - offload_func_t f = k_offload_func.at(name); + offload_func_t f = it->second; if (f == offload_func) { if (ofn[name]++ < i_gpu_start) { f = ggml_offload_nop; @@ -5753,11 +5758,13 @@ static int llama_decode_internal( } // If all tensors can be run on the GPU then using more than 1 thread is detrimental. - const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA || + const bool full_offload_supported = + model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_BAICHUAN || - model.arch == LLM_ARCH_FALCON || - model.arch == LLM_ARCH_REFACT || + model.arch == LLM_ARCH_FALCON || + model.arch == LLM_ARCH_REFACT || model.arch == LLM_ARCH_MPT; + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { n_threads = 1; @@ -5803,6 +5810,8 @@ static int llama_decode_internal( //} // extract logits + // TODO: do not compute and extract logits if only embeddings are needed + // need to update the graphs to skip "result_output" { auto & logits_out = lctx.logits; From 0dc05b8433d950bfbb8291cd4c7a8aeae899da3a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 07:52:43 +0200 Subject: [PATCH 08/20] llama : factor graph input into a function --- llama.cpp | 260 ++++++++++++++++++++++++++---------------------------- 1 file changed, 125 insertions(+), 135 deletions(-) diff --git a/llama.cpp b/llama.cpp index 5ce5840a3..68e0a9457 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5267,6 +5267,130 @@ static struct ggml_cgraph * llm_build_mpt( return gf; } +static void llama_build_graph_input( + llama_context & lctx, + const llama_batch & batch, + struct ggml_cgraph * graph) { + struct ggml_tensor * cur = nullptr; + + // inp_tokens + if (batch.token) { + cur = ggml_graph_get_tensor(graph, "inp_tokens"); + GGML_ASSERT(cur != nullptr); // required + + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; + + memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); + } + } + + // inp_embd + if (batch.embd) { + cur = ggml_graph_get_tensor(graph, "inp_embd"); + GGML_ASSERT(cur != nullptr); // required + + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd = cur->ne[0]; + const int64_t n_tokens = 
cur->ne[1]; + + memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur)); + } + } + + // TODO: make the following required based on the ARCH + + // inp_pos + cur = ggml_graph_get_tensor(graph, "inp_pos"); + if (cur) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + } + + // KQ_scale + cur = ggml_graph_get_tensor(graph, "KQ_scale"); + if (cur) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd_head = lctx.model.hparams.n_embd_head(); + ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + } + } + + // KQ_mask + cur = ggml_graph_get_tensor(graph, "KQ_mask"); + if (cur) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_kv = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + + float * data = (float *) cur->data; + memset(data, 0, ggml_nbytes(cur)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_kv; ++i) { + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + } + + // KQ_pos + cur = ggml_graph_get_tensor(graph, "KQ_pos"); + if (cur) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + } + + // K_shift + cur = ggml_graph_get_tensor(graph, "K_shift"); + if (cur) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_ctx = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } + } + } while (0); +} + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch) { @@ -5315,141 +5439,7 @@ static struct ggml_cgraph * llama_build_graph( } // allocate memory and set the values for the input tensors of the graph - - // inp_tokens - if (batch.token) { - struct ggml_tensor * cur = ggml_graph_get_tensor(result, "inp_tokens"); - GGML_ASSERT(cur != nullptr); - - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_tokens = cur->ne[0]; - - memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); - } - } - - // inp_embd - if (batch.embd) { - struct ggml_tensor * cur = ggml_graph_get_tensor(result, "inp_embd"); - GGML_ASSERT(cur != nullptr); - - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_embd = cur->ne[0]; - const int64_t n_tokens = cur->ne[1]; - - memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur)); - } - } - - // inp_pos - do { - struct ggml_tensor * cur = ggml_graph_get_tensor(result, "inp_pos"); - if (cur == nullptr) { - break; - } - - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_tokens = cur->ne[0]; - - int32_t * data = (int32_t *) cur->data; - - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } - } while (0); - - // KQ_scale - do { - struct ggml_tensor * cur = 
ggml_graph_get_tensor(result, "KQ_scale"); - if (cur == nullptr) { - break; - } - - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_embd_head = lctx.model.hparams.n_embd_head(); - ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); - } - } while (0); - - // KQ_mask - do { - struct ggml_tensor * cur = ggml_graph_get_tensor(result, "KQ_mask"); - if (cur == nullptr) { - break; - } - - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_kv = cur->ne[0]; - const int64_t n_tokens = cur->ne[1]; - - float * data = (float *) cur->data; - memset(data, 0, ggml_nbytes(cur)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - } while (0); - - // KQ_pos - do { - struct ggml_tensor * cur = ggml_graph_get_tensor(result, "KQ_pos"); - if (cur == nullptr) { - break; - } - - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_tokens = cur->ne[0]; - - int32_t * data = (int32_t *) cur->data; - - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } - } while (0); - - // K_shift - do { - struct ggml_tensor * cur = ggml_graph_get_tensor(result, "K_shift"); - if (cur == nullptr) { - break; - } - - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_ctx = cur->ne[0]; - - int32_t * data = (int32_t *) cur->data; - - for (int i = 0; i < n_ctx; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } - } - } while (0); + llama_build_graph_input(lctx, batch, result); // offload layers // TODO: this code will be obsoleted with backend v2 From e14aa461515e58245e40251ad3cb72c19af87945 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 08:03:46 +0200 Subject: [PATCH 09/20] llama : do tensor offload only with CUDA --- llama.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 68e0a9457..f684ebe0b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5443,7 +5443,13 @@ static struct ggml_cgraph * llama_build_graph( // offload layers // TODO: this code will be obsoleted with backend v2 - { +#ifdef GGML_USE_CUBLAS + const bool do_offload = true; +#else + const bool do_offload = false; +#endif + + if (do_offload) { const int n_layer = model.hparams.n_layer; const int n_gpu_layers = model.n_gpu_layers; @@ -5576,12 +5582,17 @@ static struct ggml_cgraph * llama_build_graph( for (int i = 0; i < result->n_nodes; ++i) { struct ggml_tensor * cur = result->nodes[i]; + // view tensors are not offloaded + if (cur->view_src != nullptr) { + continue; + } + const std::string name = cur->name; const auto it = k_offload_func.find(name); if (it == k_offload_func.end()) { // if a tensor that is not view hasn't been offloaded, we warn the user - if (worst_case && cur->view_src == nullptr) { + if (worst_case) { LLAMA_LOG_WARN("%s: node %4d %32s: not offloaded (ref: %s)\n", __func__, i, name.c_str(), "https://github.com/ggerganov/llama.cpp/pull/3837"); } @@ -5600,7 +5611,7 @@ static struct ggml_cgraph * llama_build_graph( // apply offload function to the tensor f(cur); - if (worst_case && cur->view_src == nullptr) { + if (worst_case) { LLAMA_LOG_INFO("%s: node %4d %32s: 
%s\n", __func__, i, name.c_str(), k_offload_func_name.at(f).c_str()); } } From 79617902eaa68b3522fc3ce5649cb128d0778dfe Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 09:20:35 +0200 Subject: [PATCH 10/20] llama : fix res_norm offloading --- llama.cpp | 201 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 103 insertions(+), 98 deletions(-) diff --git a/llama.cpp b/llama.cpp index f684ebe0b..49f1cf910 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5456,14 +5456,16 @@ static struct ggml_cgraph * llama_build_graph( const int i_gpu_start = n_layer - n_gpu_layers; // should we offload the final norm? yes if we are not computing embeddings - const bool off_res_norm = !lctx.embedding.empty(); + const bool off_res_norm = lctx.embedding.empty(); // offload functions set the tensor output backend to GPU // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = ggml_offload_nop; // nr = non-repeating - offload_func_t offload_func_kq = ggml_offload_nop; - offload_func_t offload_func_v = ggml_offload_nop; - offload_func_t offload_func = ggml_offload_nop; + offload_func_t offload_func_nr = ggml_offload_nop; // nr = non-repeating + offload_func_t offload_func_kq = ggml_offload_nop; + offload_func_t offload_func_v = ggml_offload_nop; + offload_func_t offload_func_emb = ggml_offload_nop; + offload_func_t offload_func_out = ggml_offload_nop; + offload_func_t offload_func = ggml_offload_nop; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer) { @@ -5476,100 +5478,12 @@ static struct ggml_cgraph * llama_build_graph( offload_func_kq = ggml_cuda_assign_buffers_no_alloc; } + offload_func_emb = off_res_norm ? ggml_cuda_assign_buffers_no_alloc : ggml_offload_nop; + offload_func_out = ggml_offload_nop; + offload_func = ggml_cuda_assign_buffers_no_alloc; #endif // GGML_USE_CUBLAS - static const std::unordered_map k_offload_func = { - { "KQ_mask", offload_func_kq }, - { "KQ_pos", offload_func_kq }, - { "K_shift", offload_func_kq }, - { "K_shifted", offload_func_kq }, - - { "inp_norm", offload_func_nr }, - { "inp_norm_w", offload_func_nr }, - { "inp_norm_wb", offload_func_nr }, - - { "rms_norm_0", offload_func }, - - { "attn_norm_0", offload_func }, - { "attn_norm_0_w", offload_func }, - { "attn_norm_0_wb", offload_func }, - - { "attn_norm_2", offload_func }, - { "attn_norm_2_w", offload_func }, - { "attn_norm_2_wb", offload_func }, - - { "wqkv", offload_func_kq }, - { "bqkv", offload_func_kq }, - { "wqkv_clamped", offload_func_kq }, - - { "tmpk", offload_func_kq }, - { "tmpq", offload_func_kq }, - { "tmpv", offload_func_v }, - { "tmpkqv", offload_func_kq }, // ?? 
- { "Kcur", offload_func_kq }, - { "Qcur", offload_func_kq }, - { "Vcur", offload_func_v }, - { "Vcur_0", offload_func_v }, - { "Vcur_1", offload_func_v }, - - { "krot", offload_func_kq }, - { "qrot", offload_func_kq }, - { "kpass", offload_func_kq }, - { "qpass", offload_func_kq }, - { "krotated", offload_func_kq }, - { "qrotated", offload_func_kq }, - - { "k", offload_func_kq }, - { "v", offload_func_v }, - - { "Q", offload_func_kq }, - { "K", offload_func_kq }, - { "KQ", offload_func_kq }, - { "KQ_scaled", offload_func_kq }, - { "KQ_scaled_alibi", offload_func_kq }, - { "KQ_masked", offload_func_kq }, - { "KQ_soft_max", offload_func_v }, - { "V", offload_func_v }, - { "KQV", offload_func_v }, - { "KQV_merged", offload_func_v }, - { "KQV_merged_contiguous", offload_func_v }, - - { "result_wo", offload_func }, - { "result_wo_b", offload_func }, - { "inpL_+_result_wo", offload_func }, - - { "inpFF", offload_func }, - - { "rms_norm_1", offload_func }, - { "ffn_norm", offload_func }, - { "ffn_norm_0", offload_func }, - { "ffn_norm_0_w", offload_func }, - { "ffn_norm_0_wb", offload_func }, - - { "result_w3", offload_func }, - { "result_w3_b", offload_func }, - { "result_w2", offload_func }, - { "result_w2_b", offload_func }, - { "result_w1", offload_func }, - - { "silu", offload_func }, - { "gelu", offload_func }, - { "relu", offload_func }, - { "sqr(relu)", offload_func }, - - { "silu_x_result_w3", offload_func }, - { "inpFF_+_result_w2", offload_func }, - { "inpL_+_inpFF_+_result_w2", offload_func }, - - { "rms_norm_2", offload_func_nr }, - { "out_norm_0", offload_func_nr }, - { "out_norm_0_w", offload_func_nr }, - - { "result_norm", off_res_norm ? offload_func_nr : ggml_offload_nop }, - //{ "result_output", offload_func }, - }; - static const std::unordered_map k_offload_func_name = { { ggml_offload_nop, "CPU" }, #ifdef GGML_USE_CUBLAS @@ -5577,6 +5491,97 @@ static struct ggml_cgraph * llama_build_graph( #endif }; + const std::unordered_map k_offload_func = { + { "KQ_mask", offload_func_kq }, + { "KQ_pos", offload_func_kq }, + { "K_shift", offload_func_kq }, + { "K_shifted", offload_func_kq }, + + { "inp_norm", offload_func_nr }, + { "inp_norm_w", offload_func_nr }, + { "inp_norm_wb", offload_func_nr }, + + { "rms_norm_0", offload_func }, + + { "attn_norm_0", offload_func }, + { "attn_norm_0_w", offload_func }, + { "attn_norm_0_wb", offload_func }, + + { "attn_norm_2", offload_func }, + { "attn_norm_2_w", offload_func }, + { "attn_norm_2_wb", offload_func }, + + { "wqkv", offload_func_kq }, + { "bqkv", offload_func_kq }, + { "wqkv_clamped", offload_func_kq }, + + { "tmpk", offload_func_kq }, + { "tmpq", offload_func_kq }, + { "tmpv", offload_func_v }, + { "tmpkqv", offload_func_kq }, // ?? 
+ { "Kcur", offload_func_kq }, + { "Qcur", offload_func_kq }, + { "Vcur", offload_func_v }, + { "Vcur_0", offload_func_v }, + { "Vcur_1", offload_func_v }, + + { "krot", offload_func_kq }, + { "qrot", offload_func_kq }, + { "kpass", offload_func_kq }, + { "qpass", offload_func_kq }, + { "krotated", offload_func_kq }, + { "qrotated", offload_func_kq }, + + { "k", offload_func_kq }, + { "v", offload_func_v }, + + { "Q", offload_func_kq }, + { "K", offload_func_kq }, + { "KQ", offload_func_kq }, + { "KQ_scaled", offload_func_kq }, + { "KQ_scaled_alibi", offload_func_kq }, + { "KQ_masked", offload_func_kq }, + { "KQ_soft_max", offload_func_v }, + { "V", offload_func_v }, + { "KQV", offload_func_v }, + { "KQV_merged", offload_func_v }, + { "KQV_merged_contiguous", offload_func_v }, + + { "result_wo", offload_func }, + { "result_wo_b", offload_func }, + { "inpL_+_result_wo", offload_func }, + + { "inpFF", offload_func }, + + { "rms_norm_1", offload_func }, + { "ffn_norm", offload_func }, + { "ffn_norm_0", offload_func }, + { "ffn_norm_0_w", offload_func }, + { "ffn_norm_0_wb", offload_func }, + + { "result_w3", offload_func }, + { "result_w3_b", offload_func }, + { "result_w2", offload_func }, + { "result_w2_b", offload_func }, + { "result_w1", offload_func }, + + { "silu", offload_func }, + { "gelu", offload_func }, + { "relu", offload_func }, + { "sqr(relu)", offload_func }, + + { "silu_x_result_w3", offload_func }, + { "inpFF_+_result_w2", offload_func }, + { "inpL_+_inpFF_+_result_w2", offload_func }, + + { "rms_norm_2", offload_func_nr }, + { "out_norm_0", offload_func_nr }, + { "out_norm_0_w", offload_func_nr }, + + { "result_norm", offload_func_emb }, + { "result_output", offload_func_out }, + }; + std::unordered_map ofn; for (int i = 0; i < result->n_nodes; ++i) { @@ -5591,7 +5596,7 @@ static struct ggml_cgraph * llama_build_graph( const auto it = k_offload_func.find(name); if (it == k_offload_func.end()) { - // if a tensor that is not view hasn't been offloaded, we warn the user + // if a tensor hasn't been offloaded, we warn the user if (worst_case) { LLAMA_LOG_WARN("%s: node %4d %32s: not offloaded (ref: %s)\n", __func__, i, name.c_str(), "https://github.com/ggerganov/llama.cpp/pull/3837"); @@ -5602,7 +5607,7 @@ static struct ggml_cgraph * llama_build_graph( // count the number of layers and respect the provided n_gpu_layers offload_func_t f = it->second; - if (f == offload_func) { + if (n_gpu_layers < n_layer && f == offload_func) { if (ofn[name]++ < i_gpu_start) { f = ggml_offload_nop; } From b4ad03b3a7a956ab18a9c345bd94519a8072457f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 10:33:11 +0200 Subject: [PATCH 11/20] llama : try to optimize offloading code --- llama.cpp | 335 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 222 insertions(+), 113 deletions(-) diff --git a/llama.cpp b/llama.cpp index 49f1cf910..90fc698a9 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5267,6 +5267,90 @@ static struct ggml_cgraph * llm_build_mpt( return gf; } +enum offload_func_e { + OFFLOAD_FUNC_NOP, + OFFLOAD_FUNC, + OFFLOAD_FUNC_KQ, + OFFLOAD_FUNC_V, + OFFLOAD_FUNC_NR, + OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_OUT, +}; + +struct llama_offload_trie { + struct node { + ~node() { + for (int i = 0; i < 256; ++i) { + if (children[i]) { + delete children[i]; + } + } + } + + node * children[256] = { nullptr }; + offload_func_e func = OFFLOAD_FUNC_NOP; + }; + + llama_offload_trie() { + root = new node; + } + + llama_offload_trie(const std::unordered_map & map) { + root = new 
node; + + for (const auto & kv : map) { + add(kv.first, kv.second); + } + } + + ~llama_offload_trie() { + delete root; + } + + void add(const char * name, offload_func_e func) { + node * cur = root; + + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; + + if (!c) { + break; + } + + if (!cur->children[c]) { + cur->children[c] = new node; + } + + cur = cur->children[c]; + } + + cur->func = func; + } + + offload_func_e find(const char * name) const { + const node * cur = root; + + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; + + if (!c) { + break; + } + + if (!cur->children[c]) { + return OFFLOAD_FUNC_NOP; + } + + cur = cur->children[c]; + } + + return cur->func; + } + + node * root = nullptr; +}; + + static void llama_build_graph_input( llama_context & lctx, const llama_batch & batch, @@ -5441,6 +5525,8 @@ static struct ggml_cgraph * llama_build_graph( // allocate memory and set the values for the input tensors of the graph llama_build_graph_input(lctx, batch, result); + //auto t_start = std::chrono::high_resolution_clock::now(); + // offload layers // TODO: this code will be obsoleted with backend v2 #ifdef GGML_USE_CUBLAS @@ -5456,132 +5542,113 @@ static struct ggml_cgraph * llama_build_graph( const int i_gpu_start = n_layer - n_gpu_layers; // should we offload the final norm? yes if we are not computing embeddings - const bool off_res_norm = lctx.embedding.empty(); - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = ggml_offload_nop; // nr = non-repeating - offload_func_t offload_func_kq = ggml_offload_nop; - offload_func_t offload_func_v = ggml_offload_nop; - offload_func_t offload_func_emb = ggml_offload_nop; - offload_func_t offload_func_out = ggml_offload_nop; - offload_func_t offload_func = ggml_offload_nop; + const bool offload_emb = lctx.embedding.empty(); + static const std::unordered_map k_offload_func_name = { + { OFFLOAD_FUNC_NOP, "CPU" }, #ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } - - offload_func_emb = off_res_norm ? 
ggml_cuda_assign_buffers_no_alloc : ggml_offload_nop; - offload_func_out = ggml_offload_nop; - - offload_func = ggml_cuda_assign_buffers_no_alloc; + { OFFLOAD_FUNC, "GPU (CUDA)" }, + { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, + { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, + { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, + { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, + { OFFLOAD_FUNC_OUT, "GPU (CUDA) OUT" }, #endif // GGML_USE_CUBLAS - - static const std::unordered_map k_offload_func_name = { - { ggml_offload_nop, "CPU" }, -#ifdef GGML_USE_CUBLAS - { ggml_cuda_assign_buffers_no_alloc, "GPU (CUDA)" }, -#endif }; - const std::unordered_map k_offload_func = { - { "KQ_mask", offload_func_kq }, - { "KQ_pos", offload_func_kq }, - { "K_shift", offload_func_kq }, - { "K_shifted", offload_func_kq }, + static const std::unordered_map k_offload_func = { + { "KQ_mask", OFFLOAD_FUNC_KQ }, + { "KQ_pos", OFFLOAD_FUNC_KQ }, + { "K_shift", OFFLOAD_FUNC_KQ }, + { "K_shifted", OFFLOAD_FUNC_KQ }, - { "inp_norm", offload_func_nr }, - { "inp_norm_w", offload_func_nr }, - { "inp_norm_wb", offload_func_nr }, + { "inp_norm", OFFLOAD_FUNC_NR }, + { "inp_norm_w", OFFLOAD_FUNC_NR }, + { "inp_norm_wb", OFFLOAD_FUNC_NR }, - { "rms_norm_0", offload_func }, + { "rms_norm_0", OFFLOAD_FUNC }, - { "attn_norm_0", offload_func }, - { "attn_norm_0_w", offload_func }, - { "attn_norm_0_wb", offload_func }, + { "attn_norm_0", OFFLOAD_FUNC }, + { "attn_norm_0_w", OFFLOAD_FUNC }, + { "attn_norm_0_wb", OFFLOAD_FUNC }, - { "attn_norm_2", offload_func }, - { "attn_norm_2_w", offload_func }, - { "attn_norm_2_wb", offload_func }, + { "attn_norm_2", OFFLOAD_FUNC }, + { "attn_norm_2_w", OFFLOAD_FUNC }, + { "attn_norm_2_wb", OFFLOAD_FUNC }, - { "wqkv", offload_func_kq }, - { "bqkv", offload_func_kq }, - { "wqkv_clamped", offload_func_kq }, + { "wqkv", OFFLOAD_FUNC_KQ }, + { "bqkv", OFFLOAD_FUNC_KQ }, + { "wqkv_clamped", OFFLOAD_FUNC_KQ }, - { "tmpk", offload_func_kq }, - { "tmpq", offload_func_kq }, - { "tmpv", offload_func_v }, - { "tmpkqv", offload_func_kq }, // ?? - { "Kcur", offload_func_kq }, - { "Qcur", offload_func_kq }, - { "Vcur", offload_func_v }, - { "Vcur_0", offload_func_v }, - { "Vcur_1", offload_func_v }, + { "tmpk", OFFLOAD_FUNC_KQ }, + { "tmpq", OFFLOAD_FUNC_KQ }, + { "tmpv", OFFLOAD_FUNC_V }, + { "tmpkqv", OFFLOAD_FUNC_KQ }, // ?? 
+ { "Kcur", OFFLOAD_FUNC_KQ }, + { "Qcur", OFFLOAD_FUNC_KQ }, + { "Vcur", OFFLOAD_FUNC_V }, + { "Vcur_0", OFFLOAD_FUNC_V }, + { "Vcur_1", OFFLOAD_FUNC_V }, - { "krot", offload_func_kq }, - { "qrot", offload_func_kq }, - { "kpass", offload_func_kq }, - { "qpass", offload_func_kq }, - { "krotated", offload_func_kq }, - { "qrotated", offload_func_kq }, + { "krot", OFFLOAD_FUNC_KQ }, + { "qrot", OFFLOAD_FUNC_KQ }, + { "kpass", OFFLOAD_FUNC_KQ }, + { "qpass", OFFLOAD_FUNC_KQ }, + { "krotated", OFFLOAD_FUNC_KQ }, + { "qrotated", OFFLOAD_FUNC_KQ }, - { "k", offload_func_kq }, - { "v", offload_func_v }, + { "k", OFFLOAD_FUNC_KQ }, + { "v", OFFLOAD_FUNC_V }, - { "Q", offload_func_kq }, - { "K", offload_func_kq }, - { "KQ", offload_func_kq }, - { "KQ_scaled", offload_func_kq }, - { "KQ_scaled_alibi", offload_func_kq }, - { "KQ_masked", offload_func_kq }, - { "KQ_soft_max", offload_func_v }, - { "V", offload_func_v }, - { "KQV", offload_func_v }, - { "KQV_merged", offload_func_v }, - { "KQV_merged_contiguous", offload_func_v }, + { "Q", OFFLOAD_FUNC_KQ }, + { "K", OFFLOAD_FUNC_KQ }, + { "KQ", OFFLOAD_FUNC_KQ }, + { "KQ_scaled", OFFLOAD_FUNC_KQ }, + { "KQ_scaled_alibi", OFFLOAD_FUNC_KQ }, + { "KQ_masked", OFFLOAD_FUNC_KQ }, + { "KQ_soft_max", OFFLOAD_FUNC_V }, + { "V", OFFLOAD_FUNC_V }, + { "KQV", OFFLOAD_FUNC_V }, + { "KQV_merged", OFFLOAD_FUNC_V }, + { "KQV_merged_contiguous", OFFLOAD_FUNC_V }, - { "result_wo", offload_func }, - { "result_wo_b", offload_func }, - { "inpL_+_result_wo", offload_func }, + { "result_wo", OFFLOAD_FUNC }, + { "result_wo_b", OFFLOAD_FUNC }, + { "inpL_+_result_wo", OFFLOAD_FUNC }, - { "inpFF", offload_func }, + { "inpFF", OFFLOAD_FUNC }, - { "rms_norm_1", offload_func }, - { "ffn_norm", offload_func }, - { "ffn_norm_0", offload_func }, - { "ffn_norm_0_w", offload_func }, - { "ffn_norm_0_wb", offload_func }, + { "rms_norm_1", OFFLOAD_FUNC }, + { "ffn_norm", OFFLOAD_FUNC }, + { "ffn_norm_0", OFFLOAD_FUNC }, + { "ffn_norm_0_w", OFFLOAD_FUNC }, + { "ffn_norm_0_wb", OFFLOAD_FUNC }, - { "result_w3", offload_func }, - { "result_w3_b", offload_func }, - { "result_w2", offload_func }, - { "result_w2_b", offload_func }, - { "result_w1", offload_func }, + { "result_w3", OFFLOAD_FUNC }, + { "result_w3_b", OFFLOAD_FUNC }, + { "result_w2", OFFLOAD_FUNC }, + { "result_w2_b", OFFLOAD_FUNC }, + { "result_w1", OFFLOAD_FUNC }, - { "silu", offload_func }, - { "gelu", offload_func }, - { "relu", offload_func }, - { "sqr(relu)", offload_func }, + { "silu", OFFLOAD_FUNC }, + { "gelu", OFFLOAD_FUNC }, + { "relu", OFFLOAD_FUNC }, + { "sqr(relu)", OFFLOAD_FUNC }, - { "silu_x_result_w3", offload_func }, - { "inpFF_+_result_w2", offload_func }, - { "inpL_+_inpFF_+_result_w2", offload_func }, + { "silu_x_result_w3", OFFLOAD_FUNC }, + { "inpFF_+_result_w2", OFFLOAD_FUNC }, + { "inpL_+_inpFF_+_result_w2", OFFLOAD_FUNC }, - { "rms_norm_2", offload_func_nr }, - { "out_norm_0", offload_func_nr }, - { "out_norm_0_w", offload_func_nr }, + { "rms_norm_2", OFFLOAD_FUNC_NR }, + { "out_norm_0", OFFLOAD_FUNC_NR }, + { "out_norm_0_w", OFFLOAD_FUNC_NR }, - { "result_norm", offload_func_emb }, - { "result_output", offload_func_out }, + { "result_norm", OFFLOAD_FUNC_EMB }, + { "result_output", OFFLOAD_FUNC_OUT }, }; + static llama_offload_trie k_offload_func_trie(k_offload_func); + std::unordered_map ofn; for (int i = 0; i < result->n_nodes; ++i) { @@ -5592,36 +5659,78 @@ static struct ggml_cgraph * llama_build_graph( continue; } - const std::string name = cur->name; + offload_func_e func_e = 
k_offload_func_trie.find(cur->name); - const auto it = k_offload_func.find(name); - if (it == k_offload_func.end()) { + if (func_e == OFFLOAD_FUNC_NOP) { // if a tensor hasn't been offloaded, we warn the user if (worst_case) { LLAMA_LOG_WARN("%s: node %4d %32s: not offloaded (ref: %s)\n", __func__, - i, name.c_str(), "https://github.com/ggerganov/llama.cpp/pull/3837"); + i, cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); } continue; } // count the number of layers and respect the provided n_gpu_layers - offload_func_t f = it->second; - if (n_gpu_layers < n_layer && f == offload_func) { - if (ofn[name]++ < i_gpu_start) { - f = ggml_offload_nop; - } + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: break; + case OFFLOAD_FUNC: + if (n_gpu_layers < n_layer) { + if (ofn[cur->name]++ < i_gpu_start) { + func_e = OFFLOAD_FUNC_NOP; + } + } + break; + case OFFLOAD_FUNC_NR: + if (n_gpu_layers <= n_layer + 0) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_V: + if (n_gpu_layers <= n_layer + 1) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_KQ: + if (n_gpu_layers <= n_layer + 2) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_EMB: + if (!offload_emb || n_gpu_layers < n_layer) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + default: GGML_ASSERT(false); + } + + offload_func_t func = ggml_offload_nop; + + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; + case OFFLOAD_FUNC: + case OFFLOAD_FUNC_KQ: + case OFFLOAD_FUNC_V: + case OFFLOAD_FUNC_NR: + case OFFLOAD_FUNC_EMB: func = ggml_cuda_assign_buffers_no_alloc; break; + default: GGML_ASSERT(false); } // apply offload function to the tensor - f(cur); + func(cur); if (worst_case) { - LLAMA_LOG_INFO("%s: node %4d %32s: %s\n", __func__, i, name.c_str(), k_offload_func_name.at(f).c_str()); + LLAMA_LOG_INFO("%s: node %4d %32s: %s\n", __func__, i, cur->name, k_offload_func_name.at(func_e).c_str()); } } } + //auto t_end = std::chrono::high_resolution_clock::now(); + + //printf("offload time: %f ms\n", std::chrono::duration(t_end - t_start).count()); + return result; } From 25cfbf6776d44ee428b9414240d7c725b3709258 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 11:12:03 +0200 Subject: [PATCH 12/20] llama : fix non-CUDA build --- llama.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 90fc698a9..d9c4fb3e3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5553,6 +5553,13 @@ static struct ggml_cgraph * llama_build_graph( { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, { OFFLOAD_FUNC_OUT, "GPU (CUDA) OUT" }, +#else + { OFFLOAD_FUNC, "CPU" }, + { OFFLOAD_FUNC_KQ, "CPU" }, + { OFFLOAD_FUNC_V, "CPU" }, + { OFFLOAD_FUNC_NR, "CPU" }, + { OFFLOAD_FUNC_EMB, "CPU" }, + { OFFLOAD_FUNC_OUT, "CPU" }, #endif // GGML_USE_CUBLAS }; @@ -5707,6 +5714,12 @@ static struct ggml_cgraph * llama_build_graph( offload_func_t func = ggml_offload_nop; +#ifdef GGML_USE_CUBLAS + static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; +#else + static offload_func_t ggml_offload_gpu = ggml_offload_nop; +#endif + switch (func_e) { case OFFLOAD_FUNC_NOP: case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; @@ -5714,7 +5727,7 @@ static struct ggml_cgraph * llama_build_graph( case OFFLOAD_FUNC_KQ: case OFFLOAD_FUNC_V: case OFFLOAD_FUNC_NR: - case OFFLOAD_FUNC_EMB: func = ggml_cuda_assign_buffers_no_alloc; break; + case OFFLOAD_FUNC_EMB: func = 
ggml_offload_gpu; break; default: GGML_ASSERT(false); } From 739b85c98564a2f48678550f3ab5b6da7302c6bc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 11:25:32 +0200 Subject: [PATCH 13/20] llama : try to fix build --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index d9c4fb3e3..ac359da69 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5544,7 +5544,7 @@ static struct ggml_cgraph * llama_build_graph( // should we offload the final norm? yes if we are not computing embeddings const bool offload_emb = lctx.embedding.empty(); - static const std::unordered_map k_offload_func_name = { + static const std::unordered_map> k_offload_func_name = { { OFFLOAD_FUNC_NOP, "CPU" }, #ifdef GGML_USE_CUBLAS { OFFLOAD_FUNC, "GPU (CUDA)" }, From da936188d87d70be6091840ed990c92dc66d8d46 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 11:48:24 +0200 Subject: [PATCH 14/20] llama : move refact in correct place + optimize graph input --- llama.cpp | 620 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 317 insertions(+), 303 deletions(-) diff --git a/llama.cpp b/llama.cpp index ac359da69..72678a438 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3166,10 +3166,10 @@ static struct ggml_cgraph * llm_build_llama( ggml_set_name(KQ_pos, "KQ_pos"); // shift the entire K-cache if needed - if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - ggml_set_name(K_shift, "K_shift"); + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + ggml_set_name(K_shift, "K_shift"); + if (do_rope_shift) { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = ggml_rope_custom_inplace(ctx0, @@ -3440,10 +3440,10 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_set_name(KQ_pos, "KQ_pos"); // shift the entire K-cache if needed - if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - ggml_set_name(K_shift, "K_shift"); + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + ggml_set_name(K_shift, "K_shift"); + if (do_rope_shift) { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = ggml_rope_custom_inplace(ctx0, @@ -3658,247 +3658,6 @@ static struct ggml_cgraph * llm_build_baichaun( return gf; } -static struct ggml_cgraph * llm_build_refact( - llama_context & lctx, - const llama_batch & batch, - bool worst_case) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = worst_case ? n_ctx : kv_self.n; - const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - - // printf("n_kv = %d\n", n_kv); - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } - ggml_set_name(inpL, "inp_embd"); - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "KQ_scale"); - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - ggml_set_name(cur, "rms_norm_0"); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - ggml_set_name(cur, "attn_norm_0"); - } - - // self-attention - { - // compute Q and K - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - ggml_set_name(tmpk, "tmpk"); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - ggml_set_name(tmpq, "tmpq"); - - struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); - ggml_set_name(Kcur, "Kcur"); - - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - ggml_set_name(Qcur, "Qcur"); - - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - ggml_set_name(tmpv, "tmpv"); - - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - ggml_set_name(v, "v"); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct 
ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - ggml_set_name(cur, "result_wo"); - } - - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - ggml_set_name(inpFF, "inpFF"); - - // feed-forward network - { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - ggml_set_name(cur, "rms_norm_1"); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - ggml_set_name(cur, "ffn_norm"); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - ggml_set_name(tmp, "result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - ggml_set_name(cur, "result_w1"); - - // SILU activation - cur = ggml_silu(ctx0, cur); - ggml_set_name(cur, "silu"); - - cur = ggml_mul(ctx0, cur, tmp); - ggml_set_name(cur, "silu_x_result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - ggml_set_name(cur, "result_w2"); - } - - cur = ggml_add(ctx0, cur, inpFF); - ggml_set_name(cur, "inpFF_+_result_w2"); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - ggml_set_name(cur, "rms_norm_2"); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "result_norm"); - } - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} - static struct ggml_cgraph * llm_build_falcon( llama_context & lctx, const llama_batch & batch, @@ -3976,10 +3735,10 @@ static struct ggml_cgraph * llm_build_falcon( ggml_set_name(KQ_pos, "KQ_pos"); // shift the entire K-cache if needed - if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - ggml_set_name(K_shift, "K_shift"); + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + ggml_set_name(K_shift, "K_shift"); + if (do_rope_shift) { for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = ggml_rope_custom_inplace(ctx0, @@ -4774,6 +4533,247 @@ static struct ggml_cgraph * llm_build_persimmon( return gf; } +static struct ggml_cgraph * llm_build_refact( + llama_context & lctx, + const llama_batch & batch, + bool worst_case) { + const auto & model = lctx.model; + const auto & hparams = model.hparams; + const auto & cparams = 
lctx.cparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + const float norm_rms_eps = hparams.f_norm_rms_eps; + + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; + + // printf("n_kv = %d\n", n_kv); + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_name(inp_tokens, "inp_tokens"); + + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + } + ggml_set_name(inpL, "inp_embd"); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_set_name(KQ_scale, "KQ_scale"); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + ggml_set_name(KQ_mask, "KQ_mask"); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + { + cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); + ggml_set_name(cur, "rms_norm_0"); + + // cur = cur*attn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); + ggml_set_name(cur, "attn_norm_0"); + } + + // self-attention + { + // compute Q and K + struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + ggml_set_name(tmpk, "tmpk"); + + struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + ggml_set_name(tmpq, "tmpq"); + + struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); + ggml_set_name(Kcur, "Kcur"); + + struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); + ggml_set_name(Qcur, "Qcur"); + + // store key and value to memory + { + // compute the transposed [n_tokens, n_embd] V matrix + + struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + ggml_set_name(tmpv, "tmpv"); + + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + ggml_set_name(v, "v"); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + + 
struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_kv, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + ggml_set_name(K, "K"); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + ggml_set_name(KQ, "KQ"); + + // KQ_scaled = KQ / sqrt(n_embd_head) + // KQ_scaled shape [n_kv, n_tokens, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); + ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + + struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); + ggml_set_name(KQ_masked, "KQ_masked"); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + // split cached V into n_head heads + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_kv, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + ggml_set_name(V, "V"); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + ggml_set_name(KQV, "KQV"); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + ggml_set_name(KQV_merged, "KQV_merged"); + + // cur = KQV_merged.contiguous().view(n_embd, n_tokens) + cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); + ggml_set_name(cur, "KQV_merged_contiguous"); + + // projection (no bias) + cur = ggml_mul_mat(ctx0, + model.layers[il].wo, + cur); + ggml_set_name(cur, "result_wo"); + } + + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + ggml_set_name(inpFF, "inpFF"); + + // feed-forward network + { + // norm + { + cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); + ggml_set_name(cur, "rms_norm_1"); + + // cur = cur*ffn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + ggml_set_name(cur, "ffn_norm"); + } + + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model.layers[il].w3, + cur); + ggml_set_name(tmp, "result_w3"); + + cur = ggml_mul_mat(ctx0, + model.layers[il].w1, + cur); + ggml_set_name(cur, "result_w1"); + + // SILU activation + cur = ggml_silu(ctx0, cur); + ggml_set_name(cur, "silu"); + + cur = ggml_mul(ctx0, cur, tmp); + ggml_set_name(cur, "silu_x_result_w3"); + + cur = ggml_mul_mat(ctx0, + model.layers[il].w2, + cur); + ggml_set_name(cur, "result_w2"); + } + + cur = ggml_add(ctx0, cur, inpFF); + ggml_set_name(cur, "inpFF_+_result_w2"); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + // norm + { + cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); + ggml_set_name(cur, "rms_norm_2"); + + // cur = cur*norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.output_norm); + ggml_set_name(cur, "result_norm"); + } + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + + ggml_build_forward_expand(gf, cur); + + ggml_free(ctx0); + + return gf; +} + static struct ggml_cgraph * llm_build_bloom( llama_context & lctx, const llama_batch & batch, @@ -5360,7 +5360,7 @@ static void llama_build_graph_input( // inp_tokens if (batch.token) { cur = ggml_graph_get_tensor(graph, "inp_tokens"); - GGML_ASSERT(cur != nullptr); // required + GGML_ASSERT(cur != nullptr && "missing tensor 
'inp_tokens'"); ggml_allocr_alloc(lctx.alloc, cur); @@ -5374,7 +5374,7 @@ static void llama_build_graph_input( // inp_embd if (batch.embd) { cur = ggml_graph_get_tensor(graph, "inp_embd"); - GGML_ASSERT(cur != nullptr); // required + GGML_ASSERT(cur != nullptr && "missing tensor 'inp_embd'"); ggml_allocr_alloc(lctx.alloc, cur); @@ -5386,38 +5386,84 @@ static void llama_build_graph_input( } } - // TODO: make the following required based on the ARCH + switch (lctx.model.arch) { + case LLM_ARCH_LLAMA: + case LLM_ARCH_BAICHUAN: + case LLM_ARCH_FALCON: + case LLM_ARCH_PERSIMMON: + { + // KQ_pos + cur = ggml_graph_get_tensor(graph, "KQ_pos"); + GGML_ASSERT(cur != nullptr && "missing tensor 'KQ_pos'"); - // inp_pos - cur = ggml_graph_get_tensor(graph, "inp_pos"); - if (cur) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_allocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_tokens = cur->ne[0]; + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; - int32_t * data = (int32_t *) cur->data; + int32_t * data = (int32_t *) cur->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + + // K_shift + cur = ggml_graph_get_tensor(graph, "K_shift"); + //GGML_ASSERT(cur != nullptr && "missing tensor 'K_shift'"); + if (cur) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_ctx = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } + } + } + } break; + case LLM_ARCH_STARCODER: + { + // inp_pos + cur = ggml_graph_get_tensor(graph, "inp_pos"); + GGML_ASSERT(cur != nullptr && "missing tensor 'inp_pos'"); + + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + } break; + default: + break; } - // KQ_scale - cur = ggml_graph_get_tensor(graph, "KQ_scale"); - if (cur) { + // common + { + // KQ_scale + cur = ggml_graph_get_tensor(graph, "KQ_scale"); + GGML_ASSERT(cur != nullptr && "missing tensor 'KQ_scale'"); + ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { const int64_t n_embd_head = lctx.model.hparams.n_embd_head(); ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); } - } - // KQ_mask - cur = ggml_graph_get_tensor(graph, "KQ_mask"); - if (cur) { + // KQ_mask + cur = ggml_graph_get_tensor(graph, "KQ_mask"); + GGML_ASSERT(cur != nullptr && "missing tensor 'KQ_mask'"); + ggml_allocr_alloc(lctx.alloc, cur); if (!ggml_allocr_is_measure(lctx.alloc)) { @@ -5441,38 +5487,6 @@ static void llama_build_graph_input( } } } - - // KQ_pos - cur = ggml_graph_get_tensor(graph, "KQ_pos"); - if (cur) { - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_tokens = cur->ne[0]; - - int32_t * data = (int32_t *) cur->data; - - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } - } - - // K_shift - cur = ggml_graph_get_tensor(graph, "K_shift"); - if (cur) { - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_ctx = cur->ne[0]; - - int32_t * data = (int32_t *) cur->data; - - for (int i = 0; i < n_ctx; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } - } - } while (0); } static struct ggml_cgraph * 
llama_build_graph( From 1e9c5443c2c7f6e5ad7f64d1a89e637bba98b7f1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 12:35:07 +0200 Subject: [PATCH 15/20] llama : refactor tensor offloading as callback --- llama.cpp | 1430 ++++++++++++++++++++++++++--------------------------- 1 file changed, 704 insertions(+), 726 deletions(-) diff --git a/llama.cpp b/llama.cpp index 72678a438..fcb75716f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3090,10 +3090,13 @@ static bool llama_model_load( return true; } +using llm_build_cb = std::function; + static struct ggml_cgraph * llm_build_llama( - llama_context & lctx, - const llama_batch & batch, - bool worst_case) { + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -3141,7 +3144,7 @@ static struct ggml_cgraph * llm_build_llama( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -3151,35 +3154,35 @@ static struct ggml_cgraph * llm_build_llama( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - ggml_set_name(inpL, "inp_embd"); + cb(inpL, "inp_embd"); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(KQ_pos, "KQ_pos"); + cb(KQ_mask, "KQ_mask"); // shift the entire K-cache if needed - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - ggml_set_name(K_shift, "K_shift"); - if (do_rope_shift) { + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift"); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, + ggml_rope_custom_inplace(ctx0, ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_head_kv, n_ctx, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - ggml_set_name(tmp, "K_shifted"); + cb(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } @@ -3190,45 +3193,45 @@ static struct ggml_cgraph * llm_build_llama( // norm { cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - ggml_set_name(cur, "rms_norm_0"); + cb(cur, "rms_norm_0"); // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - ggml_set_name(cur, "attn_norm_0"); + cb(cur, "attn_norm_0"); } // self-attention { // compute Q and K and RoPE them struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - ggml_set_name(tmpk, "tmpk"); + cb(tmpk, "tmpk"); struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - ggml_set_name(tmpq, "tmpq"); + cb(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, 
ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - ggml_set_name(Kcur, "Kcur"); + struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - ggml_set_name(Qcur, "Qcur"); + struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Qcur, "Qcur"); // store key and value to memory { // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - ggml_set_name(tmpv, "tmpv"); + cb(tmpv, "tmpv"); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - ggml_set_name(v, "v"); + cb(v, "v"); // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -3236,7 +3239,7 @@ static struct ggml_cgraph * llm_build_llama( } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -3244,24 +3247,24 @@ static struct ggml_cgraph * llm_build_llama( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); + cb(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_kv, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads struct ggml_tensor * V = @@ -3270,11 +3273,11 @@ static struct ggml_cgraph * llm_build_llama( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); + cb(V, "V"); #if 1 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); #else // make V contiguous in memory to speed up the matmul, however we waste time on the copy // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation @@ -3285,59 +3288,59 @@ static struct ggml_cgraph * llm_build_llama( // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 
2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - ggml_set_name(inpFF, "inpFF"); + cb(inpFF, "inpFF"); // feed-forward network { // norm { cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - ggml_set_name(cur, "rms_norm_1"); + cb(cur, "rms_norm_1"); // cur = cur*ffn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - ggml_set_name(cur, "ffn_norm"); + cb(cur, "ffn_norm"); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - ggml_set_name(tmp, "result_w3"); + cb(tmp, "result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); - ggml_set_name(cur, "result_w1"); + cb(cur, "result_w1"); // SILU activation cur = ggml_silu(ctx0, cur); - ggml_set_name(cur, "silu"); + cb(cur, "silu"); cur = ggml_mul(ctx0, cur, tmp); - ggml_set_name(cur, "silu_x_result_w3"); + cb(cur, "silu_x_result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - ggml_set_name(cur, "result_w2"); + cb(cur, "result_w2"); } cur = ggml_add(ctx0, cur, inpFF); - ggml_set_name(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2"); // input for next layer inpL = cur; @@ -3348,16 +3351,16 @@ static struct ggml_cgraph * llm_build_llama( // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - ggml_set_name(cur, "rms_norm_2"); + cb(cur, "rms_norm_2"); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); @@ -3369,6 +3372,7 @@ static struct ggml_cgraph * llm_build_llama( static struct ggml_cgraph * llm_build_baichaun( llama_context & lctx, const llama_batch & batch, + const llm_build_cb & cb, bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -3415,7 +3419,7 @@ static struct ggml_cgraph * llm_build_baichaun( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -3425,25 +3429,25 @@ static struct ggml_cgraph * llm_build_baichaun( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - ggml_set_name(inpL, "inp_embd"); + cb(inpL, "inp_embd"); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(KQ_pos, "KQ_pos"); + cb(KQ_mask, "KQ_mask"); // shift the entire K-cache if needed - struct ggml_tensor * 
K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - ggml_set_name(K_shift, "K_shift"); - if (do_rope_shift) { + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift"); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = ggml_rope_custom_inplace(ctx0, @@ -3453,7 +3457,7 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - ggml_set_name(tmp, "K_shifted"); + cb(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } @@ -3464,28 +3468,28 @@ static struct ggml_cgraph * llm_build_baichaun( // norm { cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - ggml_set_name(cur, "rms_norm_0"); + cb(cur, "rms_norm_0"); // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - ggml_set_name(cur, "attn_norm_0"); + cb(cur, "attn_norm_0"); } // self-attention { // compute Q and K and RoPE them struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - ggml_set_name(tmpk, "tmpk"); + cb(tmpk, "tmpk"); struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - ggml_set_name(tmpq, "tmpq"); + cb(tmpq, "tmpq"); struct ggml_tensor * Kcur; struct ggml_tensor * Qcur; switch (model.type) { case MODEL_7B: - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); break; case MODEL_13B: Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens); @@ -3495,27 +3499,27 @@ static struct ggml_cgraph * llm_build_baichaun( GGML_ASSERT(false); } - ggml_set_name(Kcur, "Kcur"); + cb(Kcur, "Kcur"); - ggml_set_name(Qcur, "Qcur"); + cb(Qcur, "Qcur"); // store key and value to memory { // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - ggml_set_name(tmpv, "tmpv"); + cb(tmpv, "tmpv"); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - ggml_set_name(v, "v"); + cb(v, "v"); // important: storing RoPE-ed version of K in the KV cache! 
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -3523,7 +3527,7 @@ static struct ggml_cgraph * llm_build_baichaun( } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -3531,16 +3535,16 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); + cb(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked; struct ggml_tensor * KQ_scaled_alibi; @@ -3552,7 +3556,7 @@ static struct ggml_cgraph * llm_build_baichaun( case MODEL_13B: // TODO: replace with ggml_add() KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + cb(KQ_scaled_alibi, "KQ_scaled_alibi"); KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); break; default: @@ -3561,7 +3565,7 @@ static struct ggml_cgraph * llm_build_baichaun( // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads struct ggml_tensor * V = @@ -3570,66 +3574,66 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - ggml_set_name(inpFF, "inpFF"); + cb(inpFF, "inpFF"); // feed-forward network { // norm { cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - ggml_set_name(cur, "rms_norm_1"); + cb(cur, "rms_norm_1"); // cur = cur*ffn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - ggml_set_name(cur, "ffn_norm"); + cb(cur, "ffn_norm"); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - ggml_set_name(tmp, "result_w3"); + cb(tmp, "result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); - ggml_set_name(cur, "result_w1"); + cb(cur, "result_w1"); // SILU activation cur = ggml_silu(ctx0, cur); - ggml_set_name(cur, "silu"); + cb(cur, "silu"); cur = ggml_mul(ctx0, cur, tmp); - ggml_set_name(cur, "silu_x_result_w3"); + cb(cur, "silu_x_result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - ggml_set_name(cur, "result_w2"); + cb(cur, "result_w2"); } cur = ggml_add(ctx0, cur, inpFF); - ggml_set_name(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2"); // input 
for next layer inpL = cur; @@ -3640,16 +3644,16 @@ static struct ggml_cgraph * llm_build_baichaun( // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - ggml_set_name(cur, "rms_norm_2"); + cb(cur, "rms_norm_2"); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); @@ -3661,6 +3665,7 @@ static struct ggml_cgraph * llm_build_baichaun( static struct ggml_cgraph * llm_build_falcon( llama_context & lctx, const llama_batch & batch, + const llm_build_cb & cb, bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -3710,7 +3715,7 @@ static struct ggml_cgraph * llm_build_falcon( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -3720,25 +3725,25 @@ static struct ggml_cgraph * llm_build_falcon( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - ggml_set_name(inpL, "inp_embd"); + cb(inpL, "inp_embd"); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(KQ_pos, "KQ_pos"); + cb(KQ_mask, "KQ_mask"); // shift the entire K-cache if needed - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - ggml_set_name(K_shift, "K_shift"); - if (do_rope_shift) { + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift"); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = ggml_rope_custom_inplace(ctx0, @@ -3748,7 +3753,7 @@ static struct ggml_cgraph * llm_build_falcon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 2, 0, freq_base, freq_scale); - ggml_set_name(tmp, "K_shifted"); + cb(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } @@ -3760,23 +3765,23 @@ static struct ggml_cgraph * llm_build_falcon( // TODO: refactor into common function (shared with LLaMA) { attn_norm = ggml_norm(ctx0, inpL, norm_eps); - ggml_set_name(attn_norm, "attn_norm_0"); + cb(attn_norm, "attn_norm_0"); attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - ggml_set_name(attn_norm, "attn_norm_0_w"); + cb(attn_norm, "attn_norm_0_w"); attn_norm = ggml_add(ctx0, attn_norm, model.layers[il].attn_norm_b); - ggml_set_name(attn_norm, "attn_norm_0_wb"); + cb(attn_norm, "attn_norm_0_wb"); if (model.layers[il].attn_norm_2) { // Falcon-40B cur = ggml_norm(ctx0, inpL, norm_eps); - ggml_set_name(cur, "attn_norm_2"); + cb(cur, "attn_norm_2"); cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm_2); - ggml_set_name(cur, "attn_norm_2_w"); + cb(cur, "attn_norm_2_w"); cur = ggml_add(ctx0, cur, 
model.layers[il].attn_norm_2_b); - ggml_set_name(cur, "attn_norm_2_wb"); + cb(cur, "attn_norm_2_wb"); } else { // Falcon 7B cur = attn_norm; } @@ -3784,7 +3789,7 @@ static struct ggml_cgraph * llm_build_falcon( // compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - ggml_set_name(cur, "wqkv"); + cb(cur, "wqkv"); // Note that the strides for Kcur, Vcur are set up so that the // resulting views are misaligned with the tensor's storage @@ -3804,50 +3809,50 @@ static struct ggml_cgraph * llm_build_falcon( wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0)); - ggml_set_name(tmpq, "tmpq"); + cb(tmpq, "tmpq"); struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head)); - ggml_set_name(tmpk, "tmpk"); + cb(tmpk, "tmpk"); struct ggml_tensor * tmpv = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * (n_head + n_head_kv)); - ggml_set_name(tmpv, "tmpv"); + cb(tmpv, "tmpv"); // using mode = 2 for neox mode - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - ggml_set_name(Qcur, "Qcur"); + struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + cb(Qcur, "Qcur"); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - ggml_set_name(Kcur, "Kcur"); + struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + cb(Kcur, "Kcur"); { struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); - ggml_set_name(Vcur, "Vcur_0"); + cb(Vcur, "Vcur_0"); Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur_1"); + cb(Vcur, "Vcur_1"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - ggml_set_name(v, "v"); + cb(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -3855,19 +3860,19 @@ static struct ggml_cgraph * llm_build_falcon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); + cb(K, "K"); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -3875,19 +3880,19 @@ static struct ggml_cgraph * 
llm_build_falcon( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); } struct ggml_tensor * attn_out = cur; @@ -3897,20 +3902,20 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * inpFF = attn_norm; cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - ggml_set_name(cur, "result_w3"); + cb(cur, "result_w3"); cur = ggml_gelu(ctx0, cur); - ggml_set_name(cur, "gelu"); + cb(cur, "gelu"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - ggml_set_name(cur, "result_w2"); + cb(cur, "result_w2"); } cur = ggml_add(ctx0, cur, attn_out); - ggml_set_name(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2"); cur = ggml_add(ctx0, cur, inpL); - ggml_set_name(cur, "inpL_+_inpFF_+_result_w2"); + cb(cur, "inpL_+_inpFF_+_result_w2"); // input for next layer inpL = cur; @@ -3921,17 +3926,17 @@ static struct ggml_cgraph * llm_build_falcon( // norm { cur = ggml_norm(ctx0, cur, norm_eps); - ggml_set_name(cur, "out_norm_0"); + cb(cur, "out_norm_0"); cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "out_norm_0_w"); + cb(cur, "out_norm_0_w"); cur = ggml_add(ctx0, cur, model.output_norm_b); - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); @@ -3943,6 +3948,7 @@ static struct ggml_cgraph * llm_build_falcon( static struct ggml_cgraph * llm_build_starcoder( llama_context & lctx, const llama_batch & batch, + const llm_build_cb & cb, bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -3987,7 +3993,7 @@ static struct ggml_cgraph * llm_build_starcoder( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -3997,76 +4003,75 @@ static struct ggml_cgraph * llm_build_starcoder( embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - ggml_set_name(embd, "inp_embd"); + cb(embd, "inp_embd"); - { - struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(inp_pos, "inp_pos"); - - pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos); - } + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask"); + + pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos); 
inpL = ggml_add(ctx0, embd, pos); - ggml_set_name(inpL, "inpL"); + cb(inpL, "inpL"); for (int il = 0; il < n_layer; ++il) { { // Norm cur = ggml_norm(ctx0, inpL, norm_eps); - ggml_set_name(cur, "attn_norm_0"); + cb(cur, "attn_norm_0"); cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - ggml_set_name(cur, "attn_norm_0_w"); + cb(cur, "attn_norm_0_w"); cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - ggml_set_name(cur, "attn_norm_0_wb"); + cb(cur, "attn_norm_0_wb"); } { // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - ggml_set_name(cur, "wqkv"); + cb(cur, "wqkv"); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - ggml_set_name(cur, "bqkv"); + cb(cur, "bqkv"); struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - ggml_set_name(tmpq, "tmpq"); - ggml_set_name(tmpk, "tmpk"); - ggml_set_name(tmpv, "tmpv"); + cb(tmpq, "tmpq"); + cb(tmpk, "tmpk"); + cb(tmpv, "tmpv"); struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); struct ggml_tensor * Kcur = tmpk; { struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - ggml_set_name(v, "v"); + cb(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -4074,24 +4079,24 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); + cb(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads struct ggml_tensor * V = @@ -4100,25 +4105,25 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); + cb(KQV, 
"KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); } // Projection cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); // Add the input cur = ggml_add(ctx0, cur, inpL); - ggml_set_name(cur, "inpL_+_result_wo"); + cb(cur, "inpL_+_result_wo"); struct ggml_tensor * inpFF = cur; @@ -4127,28 +4132,28 @@ static struct ggml_cgraph * llm_build_starcoder( // Norm { cur = ggml_norm(ctx0, inpFF, norm_eps); - ggml_set_name(cur, "ffn_norm_0"); + cb(cur, "ffn_norm_0"); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - ggml_set_name(cur, "ffn_norm_0_w"); + cb(cur, "ffn_norm_0_w"); cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - ggml_set_name(cur, "ffn_norm_0_wb"); + cb(cur, "ffn_norm_0_wb"); } cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); - ggml_set_name(cur, "result_w3"); + cb(cur, "result_w3"); // GELU activation cur = ggml_gelu(ctx0, cur); - ggml_set_name(cur, "gelu"); + cb(cur, "gelu"); // Projection cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - ggml_set_name(cur, "result_w2"); + cb(cur, "result_w2"); cur = ggml_add(ctx0, cur, model.layers[il].b2); - ggml_set_name(cur, "result_w2_b"); + cb(cur, "result_w2_b"); } inpL = ggml_add(ctx0, cur, inpFF); @@ -4158,17 +4163,17 @@ static struct ggml_cgraph * llm_build_starcoder( // Output Norm { cur = ggml_norm(ctx0, inpL, norm_eps); - ggml_set_name(cur, "out_norm_0"); + cb(cur, "out_norm_0"); cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "out_norm_0_w"); + cb(cur, "out_norm_0_w"); cur = ggml_add(ctx0, cur, model.output_norm_b); - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); ggml_free(ctx0); @@ -4179,6 +4184,7 @@ static struct ggml_cgraph * llm_build_starcoder( static struct ggml_cgraph * llm_build_persimmon( llama_context & lctx, const llama_batch & batch, + const llm_build_cb & cb, bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -4223,27 +4229,27 @@ static struct ggml_cgraph * llm_build_persimmon( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - ggml_set_name(inpL, "imp_embd"); + cb(inpL, "imp_embd"); + + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale"); struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); - - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(KQ_pos, "KQ_pos"); + cb(KQ_mask, "KQ_mask"); if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - ggml_set_name(K_shift, "K_shift"); + 
cb(K_shift, "K_shift"); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -4256,7 +4262,7 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il) ), K_shift, n_rot, 2, 0, freq_base, freq_scale); - ggml_set_name(tmp, "K_shifted"); + cb(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } @@ -4266,31 +4272,31 @@ static struct ggml_cgraph * llm_build_persimmon( { cur = ggml_norm(ctx0, inpL, norm_eps); - ggml_set_name(cur, "attn_norm_0"); + cb(cur, "attn_norm_0"); cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - ggml_set_name(cur, "attn_norm_0_w"); + cb(cur, "attn_norm_0_w"); cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - ggml_set_name(cur, "attn_norm_0_wb"); + cb(cur, "attn_norm_0_wb"); } // self attention { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - ggml_set_name(cur, "wqkv"); + cb(cur, "wqkv"); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - ggml_set_name(cur, "bqkv"); + cb(cur, "bqkv"); // split qkv GGML_ASSERT(n_head_kv == n_head); struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); - ggml_set_name(tmpqkv, "tmpqkv"); + cb(tmpqkv, "tmpqkv"); struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); - ggml_set_name(tmpqkv_perm, "tmpqkv"); + cb(tmpqkv_perm, "tmpqkv"); struct ggml_tensor * tmpq = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, @@ -4298,7 +4304,7 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, 0 ); - ggml_set_name(tmpq, "tmpq"); + cb(tmpq, "tmpq"); struct ggml_tensor * tmpk = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, @@ -4306,26 +4312,26 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens ); - ggml_set_name(tmpk, "tmpk"); + cb(tmpk, "tmpk"); // Q/K Layernorm tmpq = ggml_norm(ctx0, tmpq, norm_eps); - ggml_set_name(tmpq, "tmpq"); + cb(tmpq, "tmpq"); tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); - ggml_set_name(tmpq, "tmpq"); + cb(tmpq, "tmpq"); tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - ggml_set_name(tmpq, "tmpq"); + cb(tmpq, "tmpq"); tmpk = ggml_norm(ctx0, tmpk, norm_eps); - ggml_set_name(tmpk, "tmpk"); + cb(tmpk, "tmpk"); tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - ggml_set_name(tmpk, "tmpk"); + cb(tmpk, "tmpk"); tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); - ggml_set_name(tmpk, "tmpk"); + cb(tmpk, "tmpk"); // RoPE the first n_rot of q/k, pass the other half, and concat. 
struct ggml_tensor * qrot = ggml_view_3d( @@ -4334,7 +4340,7 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpq) * n_embd_head * n_head, 0 ); - ggml_set_name(qrot, "qrot"); + cb(qrot, "qrot"); struct ggml_tensor * krot = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, @@ -4342,7 +4348,7 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpk) * n_embd_head * n_head, 0 ); - ggml_set_name(krot, "krot"); + cb(krot, "krot"); // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( @@ -4351,7 +4357,7 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpq) * n_embd_head * n_head, ggml_element_size(tmpq) * n_rot ); - ggml_set_name(qpass, "qpass"); + cb(qpass, "qpass"); struct ggml_tensor * kpass = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, @@ -4359,43 +4365,43 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpk) * n_embd_head * n_head, ggml_element_size(tmpk) * n_rot ); - ggml_set_name(kpass, "kpass"); + cb(kpass, "kpass"); - struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale + struct ggml_tensor * qrotated = ggml_rope_custom( + ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale ); - ggml_set_name(qrotated, "qrotated"); + cb(qrotated, "qrotated"); struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale + ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale ); - ggml_set_name(krotated, "krotated"); + cb(krotated, "krotated"); // ggml currently only supports concatenation on dim=2 // so we need to permute qrot, qpass, concat, then permute back. qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); - ggml_set_name(qrotated, "qrotated"); + cb(qrotated, "qrotated"); krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3)); - ggml_set_name(krotated, "krotated"); + cb(krotated, "krotated"); qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); - ggml_set_name(qpass, "qpass"); + cb(qpass, "qpass"); kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); - ggml_set_name(kpass, "kpass"); + cb(kpass, "kpass"); struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass); - ggml_set_name(Qcur, "Qcur"); + cb(Qcur, "Qcur"); struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass); - ggml_set_name(Kcur, "Kcur"); + cb(Kcur, "Kcur"); struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); - ggml_set_name(Kcur, "Kcur"); + cb(Kcur, "Kcur"); { struct ggml_tensor * tmpv = ggml_view_3d( @@ -4404,22 +4410,22 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 ); - ggml_set_name(tmpv, "tmpv"); + cb(tmpv, "tmpv"); // store K, V in cache struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d( ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) ); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - 
ggml_set_name(v, "v"); + cb(v, "v"); // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -4430,19 +4436,19 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); + cb(K, "K"); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -4450,62 +4456,62 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); cur = ggml_add(ctx0, cur, model.layers[il].bo); - ggml_set_name(cur, "result_wo_b"); + cb(cur, "result_wo_b"); } struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); - ggml_set_name(inpFF, "inpFF"); + cb(inpFF, "inpFF"); { // MLP { // Norm cur = ggml_norm(ctx0, inpFF, norm_eps); - ggml_set_name(cur, "ffn_norm_0"); + cb(cur, "ffn_norm_0"); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - ggml_set_name(cur, "ffn_norm_0_w"); + cb(cur, "ffn_norm_0_w"); cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - ggml_set_name(cur, "ffn_norm_0_wb"); + cb(cur, "ffn_norm_0_wb"); } cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - ggml_set_name(cur, "result_w3"); + cb(cur, "result_w3"); cur = ggml_add(ctx0, cur, model.layers[il].b3); - ggml_set_name(cur, "result_w3_b"); + cb(cur, "result_w3_b"); cur = ggml_relu(ctx0, cur); - ggml_set_name(cur, "relu"); + cb(cur, "relu"); cur = ggml_sqr(ctx0, cur); - ggml_set_name(cur, "sqr(relu)"); + cb(cur, "sqr(relu)"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - ggml_set_name(cur, "result_w2"); + cb(cur, "result_w2"); cur = ggml_add(ctx0, cur, model.layers[il].b2); - ggml_set_name(cur, "result_w2_b"); + cb(cur, "result_w2_b"); } cur = ggml_add(ctx0, cur, inpFF); - ggml_set_name(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2"); inpL = cur; } @@ -4514,17 +4520,17 @@ static struct ggml_cgraph * llm_build_persimmon( { cur = ggml_norm(ctx0, cur, norm_eps); - ggml_set_name(cur, "out_norm_0"); + cb(cur, "out_norm_0"); cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "out_norm_0_w"); + cb(cur, "out_norm_0_w"); cur = ggml_add(ctx0, cur, model.output_norm_b); - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, 
"result_output"); ggml_build_forward_expand(gf, cur); @@ -4536,6 +4542,7 @@ static struct ggml_cgraph * llm_build_persimmon( static struct ggml_cgraph * llm_build_refact( llama_context & lctx, const llama_batch & batch, + const llm_build_cb & cb, bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -4578,7 +4585,7 @@ static struct ggml_cgraph * llm_build_refact( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -4588,15 +4595,15 @@ static struct ggml_cgraph * llm_build_refact( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - ggml_set_name(inpL, "inp_embd"); + cb(inpL, "inp_embd"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask"); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4604,52 +4611,52 @@ static struct ggml_cgraph * llm_build_refact( // norm { cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - ggml_set_name(cur, "rms_norm_0"); + cb(cur, "rms_norm_0"); // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - ggml_set_name(cur, "attn_norm_0"); + cb(cur, "attn_norm_0"); } // self-attention { // compute Q and K struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - ggml_set_name(tmpk, "tmpk"); + cb(tmpk, "tmpk"); struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - ggml_set_name(tmpq, "tmpq"); + cb(tmpq, "tmpq"); struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); - ggml_set_name(Kcur, "Kcur"); + cb(Kcur, "Kcur"); struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - ggml_set_name(Qcur, "Qcur"); + cb(Qcur, "Qcur"); // store key and value to memory { // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - ggml_set_name(tmpv, "tmpv"); + cb(tmpv, "tmpv"); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - ggml_set_name(v, "v"); + cb(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -4657,27 +4664,27 @@ static struct ggml_cgraph * llm_build_refact( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); + cb(K, "K"); // K * Q struct ggml_tensor * KQ 
= ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_kv, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + cb(KQ_scaled_alibi, "KQ_scaled_alibi"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads struct ggml_tensor * V = @@ -4686,66 +4693,66 @@ static struct ggml_cgraph * llm_build_refact( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - ggml_set_name(inpFF, "inpFF"); + cb(inpFF, "inpFF"); // feed-forward network { // norm { cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - ggml_set_name(cur, "rms_norm_1"); + cb(cur, "rms_norm_1"); // cur = cur*ffn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - ggml_set_name(cur, "ffn_norm"); + cb(cur, "ffn_norm"); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - ggml_set_name(tmp, "result_w3"); + cb(tmp, "result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); - ggml_set_name(cur, "result_w1"); + cb(cur, "result_w1"); // SILU activation cur = ggml_silu(ctx0, cur); - ggml_set_name(cur, "silu"); + cb(cur, "silu"); cur = ggml_mul(ctx0, cur, tmp); - ggml_set_name(cur, "silu_x_result_w3"); + cb(cur, "silu_x_result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - ggml_set_name(cur, "result_w2"); + cb(cur, "result_w2"); } cur = ggml_add(ctx0, cur, inpFF); - ggml_set_name(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2"); // input for next layer inpL = cur; @@ -4756,16 +4763,16 @@ static struct ggml_cgraph * llm_build_refact( // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - ggml_set_name(cur, "rms_norm_2"); + cb(cur, "rms_norm_2"); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); @@ -4777,6 +4784,7 @@ static struct ggml_cgraph * llm_build_refact( static struct ggml_cgraph * llm_build_bloom( llama_context & lctx, const llama_batch & batch, + const llm_build_cb & cb, 
bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -4822,7 +4830,7 @@ static struct ggml_cgraph * llm_build_bloom( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -4832,56 +4840,56 @@ static struct ggml_cgraph * llm_build_bloom( embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - ggml_set_name(embd, "inp_embd"); + cb(embd, "inp_embd"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask"); // norm { inpL = ggml_norm(ctx0, embd, norm_eps); - ggml_set_name(inpL, "inp_norm"); + cb(inpL, "inp_norm"); inpL = ggml_mul(ctx0, inpL, model.tok_norm); - ggml_set_name(inpL, "inp_norm_w"); + cb(inpL, "inp_norm_w"); inpL = ggml_add (ctx0, inpL, model.tok_norm_b); - ggml_set_name(inpL, "inp_norm_wb"); + cb(inpL, "inp_norm_wb"); } for (int il = 0; il < n_layer; ++il) { { // Norm cur = ggml_norm(ctx0, inpL, norm_eps); - ggml_set_name(cur, "attn_norm_0"); + cb(cur, "attn_norm_0"); cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - ggml_set_name(cur, "attn_norm_0_w"); + cb(cur, "attn_norm_0_w"); cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - ggml_set_name(cur, "attn_norm_0_wb"); + cb(cur, "attn_norm_0_wb"); } { // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - ggml_set_name(cur, "wqkv"); + cb(cur, "wqkv"); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - ggml_set_name(cur, "bqkv"); + cb(cur, "bqkv"); struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - ggml_set_name(tmpq, "tmpq"); - ggml_set_name(tmpk, "tmpk"); - ggml_set_name(tmpv, "tmpv"); + cb(tmpq, "tmpq"); + cb(tmpk, "tmpk"); + cb(tmpv, "tmpv"); struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); struct ggml_tensor * Kcur = tmpk; @@ -4889,15 +4897,15 @@ static struct ggml_cgraph * llm_build_bloom( // store key and value to memory { struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - ggml_set_name(v, "v"); + cb(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); @@ -4909,7 +4917,7 @@ static struct ggml_cgraph * llm_build_bloom( Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, 
n_tokens)), 0, 2, 1, 3); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -4917,27 +4925,27 @@ static struct ggml_cgraph * llm_build_bloom( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); + cb(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + cb(KQ_scaled_alibi, "KQ_scaled_alibi"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads struct ggml_tensor * V = @@ -4946,30 +4954,30 @@ static struct ggml_cgraph * llm_build_bloom( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); } // Projection cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); cur = ggml_add(ctx0, cur, model.layers[il].bo); - ggml_set_name(cur, "result_wo_b"); + cb(cur, "result_wo_b"); // Add the input cur = ggml_add(ctx0, cur, inpL); - ggml_set_name(cur, "inpL_+_result_wo"); + cb(cur, "inpL_+_result_wo"); struct ggml_tensor * inpFF = cur; @@ -4978,49 +4986,49 @@ static struct ggml_cgraph * llm_build_bloom( // Norm { cur = ggml_norm(ctx0, inpFF, norm_eps); - ggml_set_name(cur, "ffn_norm_0"); + cb(cur, "ffn_norm_0"); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - ggml_set_name(cur, "ffn_norm_0_w"); + cb(cur, "ffn_norm_0_w"); cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - ggml_set_name(cur, "ffn_norm_0_wb"); + cb(cur, "ffn_norm_0_wb"); } cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - ggml_set_name(cur, "result_w3"); + cb(cur, "result_w3"); cur = ggml_add(ctx0, cur, model.layers[il].b3); - ggml_set_name(cur, "result_w3_b"); + cb(cur, "result_w3_b"); cur = ggml_gelu(ctx0, cur); - ggml_set_name(cur, "gelu"); + cb(cur, "gelu"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - ggml_set_name(cur, "result_w2"); + cb(cur, "result_w2"); cur = ggml_add(ctx0, cur, model.layers[il].b2); - ggml_set_name(cur, "result_w2_b"); + cb(cur, "result_w2_b"); } inpL = ggml_add(ctx0, cur, inpFF); - ggml_set_name(inpL, "inpFF_+_result_w2"); + cb(inpL, "inpFF_+_result_w2"); } // Output Norm { cur = ggml_norm(ctx0, 
inpL, norm_eps); - ggml_set_name(cur, "out_norm_0"); + cb(cur, "out_norm_0"); cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "out_norm_0_w"); + cb(cur, "out_norm_0_w"); cur = ggml_add(ctx0, cur, model.output_norm_b); - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); @@ -5032,6 +5040,7 @@ static struct ggml_cgraph * llm_build_bloom( static struct ggml_cgraph * llm_build_mpt( llama_context & lctx, const llama_batch & batch, + const llm_build_cb & cb, bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -5076,7 +5085,7 @@ static struct ggml_cgraph * llm_build_mpt( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -5086,15 +5095,15 @@ static struct ggml_cgraph * llm_build_mpt( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - ggml_set_name(inpL, "inp_embd"); + cb(inpL, "inp_embd"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask"); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5103,10 +5112,10 @@ static struct ggml_cgraph * llm_build_mpt( // TODO: refactor into common function (shared with LLaMA) { attn_norm = ggml_norm(ctx0, inpL, norm_eps); - ggml_set_name(attn_norm, "attn_norm_0"); + cb(attn_norm, "attn_norm_0"); attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - ggml_set_name(attn_norm, "attn_norm_0_w"); + cb(attn_norm, "attn_norm_0_w"); if (1) { cur = attn_norm; @@ -5115,11 +5124,11 @@ static struct ggml_cgraph * llm_build_mpt( // compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - ggml_set_name(cur, "wqkv"); + cb(cur, "wqkv"); if (clamp_kqv > 0.0f) { cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv); - ggml_set_name(cur, "wqkv_clamped"); + cb(cur, "wqkv_clamped"); } const size_t wsize = ggml_type_size(cur->type); @@ -5129,43 +5138,43 @@ static struct ggml_cgraph * llm_build_mpt( wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0); - ggml_set_name(Qcur, "Qcur"); + cb(Qcur, "Qcur"); struct ggml_tensor * Kcur = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head); - ggml_set_name(Kcur, "Kcur"); + cb(Kcur, "Kcur"); struct ggml_tensor * tmpv = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * (n_head + n_head_kv)); - ggml_set_name(tmpv, "tmpv"); + cb(tmpv, "tmpv"); { struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, 
"k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - ggml_set_name(v, "v"); + cb(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -5173,23 +5182,23 @@ static struct ggml_cgraph * llm_build_mpt( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); + cb(K, "K"); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); // TODO: replace with ggml_add() struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + cb(KQ_scaled_alibi, "KQ_scaled_alibi"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -5197,24 +5206,24 @@ static struct ggml_cgraph * llm_build_mpt( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); } // Add the input cur = ggml_add(ctx0, cur, inpL); - ggml_set_name(cur, "inpL_+_result_wo"); + cb(cur, "inpL_+_result_wo"); struct ggml_tensor * attn_out = cur; @@ -5223,24 +5232,24 @@ static struct ggml_cgraph * llm_build_mpt( // Norm { cur = ggml_norm(ctx0, attn_out, norm_eps); - ggml_set_name(cur, "ffn_norm_0"); + cb(cur, "ffn_norm_0"); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - ggml_set_name(cur, "ffn_norm_0_w"); + cb(cur, "ffn_norm_0_w"); } cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - ggml_set_name(cur, "result_w3"); + cb(cur, "result_w3"); cur = ggml_gelu(ctx0, cur); - ggml_set_name(cur, "gelu"); + cb(cur, "gelu"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - ggml_set_name(cur, "result_w2"); + cb(cur, "result_w2"); } cur = ggml_add(ctx0, cur, attn_out); - ggml_set_name(cur, "inpL_+_inpFF_+_result_w2"); + cb(cur, "inpL_+_inpFF_+_result_w2"); // input for next layer inpL = cur; @@ -5251,14 +5260,14 @@ static struct ggml_cgraph * llm_build_mpt( // norm { cur = ggml_norm(ctx0, cur, norm_eps); - ggml_set_name(cur, "out_norm_0"); + cb(cur, "out_norm_0"); cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } cur = 
ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); @@ -5267,7 +5276,7 @@ static struct ggml_cgraph * llm_build_mpt( return gf; } -enum offload_func_e { +enum llm_offload_func_e { OFFLOAD_FUNC_NOP, OFFLOAD_FUNC, OFFLOAD_FUNC_KQ, @@ -5277,7 +5286,7 @@ enum offload_func_e { OFFLOAD_FUNC_OUT, }; -struct llama_offload_trie { +struct llm_offload_trie { struct node { ~node() { for (int i = 0; i < 256; ++i) { @@ -5288,14 +5297,14 @@ struct llama_offload_trie { } node * children[256] = { nullptr }; - offload_func_e func = OFFLOAD_FUNC_NOP; + llm_offload_func_e func = OFFLOAD_FUNC_NOP; }; - llama_offload_trie() { + llm_offload_trie() { root = new node; } - llama_offload_trie(const std::unordered_map & map) { + llm_offload_trie(const std::unordered_map & map) { root = new node; for (const auto & kv : map) { @@ -5303,11 +5312,11 @@ struct llama_offload_trie { } } - ~llama_offload_trie() { + ~llm_offload_trie() { delete root; } - void add(const char * name, offload_func_e func) { + void add(const char * name, llm_offload_func_e func) { node * cur = root; for (int i = 0; ; ++i) { @@ -5327,7 +5336,7 @@ struct llama_offload_trie { cur->func = func; } - offload_func_e find(const char * name) const { + llm_offload_func_e find(const char * name) const { const node * cur = root; for (int i = 0; ; ++i) { @@ -5350,206 +5359,241 @@ struct llama_offload_trie { node * root = nullptr; }; +static const std::unordered_map k_offload_map = { + { "inp_pos", OFFLOAD_FUNC_KQ }, + { "KQ_mask", OFFLOAD_FUNC_KQ }, + { "K_shift", OFFLOAD_FUNC_KQ }, + { "K_shifted", OFFLOAD_FUNC_KQ }, -static void llama_build_graph_input( - llama_context & lctx, - const llama_batch & batch, - struct ggml_cgraph * graph) { - struct ggml_tensor * cur = nullptr; + { "inp_norm", OFFLOAD_FUNC_NR }, + { "inp_norm_w", OFFLOAD_FUNC_NR }, + { "inp_norm_wb", OFFLOAD_FUNC_NR }, - // inp_tokens - if (batch.token) { - cur = ggml_graph_get_tensor(graph, "inp_tokens"); - GGML_ASSERT(cur != nullptr && "missing tensor 'inp_tokens'"); + { "rms_norm_0", OFFLOAD_FUNC }, - ggml_allocr_alloc(lctx.alloc, cur); + { "attn_norm_0", OFFLOAD_FUNC }, + { "attn_norm_0_w", OFFLOAD_FUNC }, + { "attn_norm_0_wb", OFFLOAD_FUNC }, - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_tokens = cur->ne[0]; + { "attn_norm_2", OFFLOAD_FUNC }, + { "attn_norm_2_w", OFFLOAD_FUNC }, + { "attn_norm_2_wb", OFFLOAD_FUNC }, - memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); - } - } + { "wqkv", OFFLOAD_FUNC_KQ }, + { "bqkv", OFFLOAD_FUNC_KQ }, + { "wqkv_clamped", OFFLOAD_FUNC_KQ }, - // inp_embd - if (batch.embd) { - cur = ggml_graph_get_tensor(graph, "inp_embd"); - GGML_ASSERT(cur != nullptr && "missing tensor 'inp_embd'"); + { "tmpk", OFFLOAD_FUNC_KQ }, + { "tmpq", OFFLOAD_FUNC_KQ }, + { "tmpv", OFFLOAD_FUNC_V }, + { "tmpkqv", OFFLOAD_FUNC_KQ }, // ?? 
+ { "Kcur", OFFLOAD_FUNC_KQ }, + { "Qcur", OFFLOAD_FUNC_KQ }, + { "Vcur", OFFLOAD_FUNC_V }, + { "Vcur_0", OFFLOAD_FUNC_V }, + { "Vcur_1", OFFLOAD_FUNC_V }, - ggml_allocr_alloc(lctx.alloc, cur); + { "krot", OFFLOAD_FUNC_KQ }, + { "qrot", OFFLOAD_FUNC_KQ }, + { "kpass", OFFLOAD_FUNC_KQ }, + { "qpass", OFFLOAD_FUNC_KQ }, + { "krotated", OFFLOAD_FUNC_KQ }, + { "qrotated", OFFLOAD_FUNC_KQ }, - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_embd = cur->ne[0]; - const int64_t n_tokens = cur->ne[1]; + { "k", OFFLOAD_FUNC_KQ }, + { "v", OFFLOAD_FUNC_V }, - memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur)); - } - } + { "Q", OFFLOAD_FUNC_KQ }, + { "K", OFFLOAD_FUNC_KQ }, + { "KQ", OFFLOAD_FUNC_KQ }, + { "KQ_scaled", OFFLOAD_FUNC_KQ }, + { "KQ_scaled_alibi", OFFLOAD_FUNC_KQ }, + { "KQ_masked", OFFLOAD_FUNC_KQ }, + { "KQ_soft_max", OFFLOAD_FUNC_V }, + { "V", OFFLOAD_FUNC_V }, + { "KQV", OFFLOAD_FUNC_V }, + { "KQV_merged", OFFLOAD_FUNC_V }, + { "KQV_merged_contiguous", OFFLOAD_FUNC_V }, - switch (lctx.model.arch) { - case LLM_ARCH_LLAMA: - case LLM_ARCH_BAICHUAN: - case LLM_ARCH_FALCON: - case LLM_ARCH_PERSIMMON: - { - // KQ_pos - cur = ggml_graph_get_tensor(graph, "KQ_pos"); - GGML_ASSERT(cur != nullptr && "missing tensor 'KQ_pos'"); + { "result_wo", OFFLOAD_FUNC }, + { "result_wo_b", OFFLOAD_FUNC }, + { "inpL_+_result_wo", OFFLOAD_FUNC }, - ggml_allocr_alloc(lctx.alloc, cur); + { "inpFF", OFFLOAD_FUNC }, - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_tokens = cur->ne[0]; + { "rms_norm_1", OFFLOAD_FUNC }, + { "ffn_norm", OFFLOAD_FUNC }, + { "ffn_norm_0", OFFLOAD_FUNC }, + { "ffn_norm_0_w", OFFLOAD_FUNC }, + { "ffn_norm_0_wb", OFFLOAD_FUNC }, - int32_t * data = (int32_t *) cur->data; + { "result_w3", OFFLOAD_FUNC }, + { "result_w3_b", OFFLOAD_FUNC }, + { "result_w2", OFFLOAD_FUNC }, + { "result_w2_b", OFFLOAD_FUNC }, + { "result_w1", OFFLOAD_FUNC }, - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } + { "silu", OFFLOAD_FUNC }, + { "gelu", OFFLOAD_FUNC }, + { "relu", OFFLOAD_FUNC }, + { "sqr(relu)", OFFLOAD_FUNC }, - // K_shift - cur = ggml_graph_get_tensor(graph, "K_shift"); - //GGML_ASSERT(cur != nullptr && "missing tensor 'K_shift'"); - if (cur) { - ggml_allocr_alloc(lctx.alloc, cur); + { "silu_x_result_w3", OFFLOAD_FUNC }, + { "inpFF_+_result_w2", OFFLOAD_FUNC }, + { "inpL_+_inpFF_+_result_w2", OFFLOAD_FUNC }, - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_ctx = cur->ne[0]; + { "rms_norm_2", OFFLOAD_FUNC_NR }, + { "out_norm_0", OFFLOAD_FUNC_NR }, + { "out_norm_0_w", OFFLOAD_FUNC_NR }, - int32_t * data = (int32_t *) cur->data; + { "result_norm", OFFLOAD_FUNC_EMB }, + { "result_output", OFFLOAD_FUNC_OUT }, +}; - for (int i = 0; i < n_ctx; ++i) { - data[i] = lctx.kv_self.cells[i].delta; - } - } - } - } break; - case LLM_ARCH_STARCODER: - { - // inp_pos - cur = ggml_graph_get_tensor(graph, "inp_pos"); - GGML_ASSERT(cur != nullptr && "missing tensor 'inp_pos'"); - - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_tokens = cur->ne[0]; - - int32_t * data = (int32_t *) cur->data; - - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } - } break; - default: - break; - } - - // common - { - // KQ_scale - cur = ggml_graph_get_tensor(graph, "KQ_scale"); - GGML_ASSERT(cur != nullptr && "missing tensor 'KQ_scale'"); - - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_embd_head = 
lctx.model.hparams.n_embd_head(); - ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); - } - - // KQ_mask - cur = ggml_graph_get_tensor(graph, "KQ_mask"); - GGML_ASSERT(cur != nullptr && "missing tensor 'KQ_mask'"); - - ggml_allocr_alloc(lctx.alloc, cur); - - if (!ggml_allocr_is_measure(lctx.alloc)) { - const int64_t n_kv = cur->ne[0]; - const int64_t n_tokens = cur->ne[1]; - - float * data = (float *) cur->data; - memset(data, 0, ggml_nbytes(cur)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - } -} +static llm_offload_trie k_offload_func_trie(k_offload_map); static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch) { const auto & model = lctx.model; - struct ggml_cgraph * result = NULL; - // check if we should build the worst-case graph (for memory measurement) const bool worst_case = ggml_allocr_is_measure(lctx.alloc); - switch (model.arch) { - case LLM_ARCH_LLAMA: - { - result = llm_build_llama(lctx, batch, worst_case); - } break; - case LLM_ARCH_BAICHUAN: - { - result = llm_build_baichaun(lctx, batch, worst_case); - } break; - case LLM_ARCH_FALCON: - { - result = llm_build_falcon(lctx, batch, worst_case); - } break; - case LLM_ARCH_STARCODER: - { - result = llm_build_starcoder(lctx, batch, worst_case); - } break; - case LLM_ARCH_PERSIMMON: - { - result = llm_build_persimmon(lctx, batch, worst_case); - } break; - case LLM_ARCH_REFACT: - { - result = llm_build_refact(lctx, batch, worst_case); - } break; - case LLM_ARCH_BLOOM: - { - result = llm_build_bloom(lctx, batch, worst_case); - } break; - case LLM_ARCH_MPT: - { - result = llm_build_mpt(lctx, batch, worst_case); - } break; - default: - GGML_ASSERT(false); - } + // count the number of times a tensor with a given name has been offloaded + std::unordered_map offload_n; - // allocate memory and set the values for the input tensors of the graph - llama_build_graph_input(lctx, batch, result); + // keep track of the input that has already been allocated + bool alloc_inp_tokens = false; + bool alloc_inp_embd = false; + bool alloc_inp_pos = false; + bool alloc_inp_KQ_scale = false; + bool alloc_inp_KQ_mask = false; + bool alloc_inp_K_shift = false; - //auto t_start = std::chrono::high_resolution_clock::now(); + // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
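    //
    // a minimal sketch of how the build functions are expected to use it (illustrative only,
    // mirroring the cb(...) calls added throughout this patch): right after a tensor is created,
    //
    //     cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
    //     cb(cur, "rms_norm_0"); // names the tensor; the callback below then allocates the
    //                            // graph inputs and decides whether to offload it, by name
    //
    // so the per-tensor bookkeeping lives in one place instead of being repeated in every builder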
+ llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name) { + ggml_set_name(cur, name); + + // + // allocate input tensors and set input data + // + + if (batch.token && !alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; + + memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); + } + + alloc_inp_tokens = true; + } + + if (batch.embd && !alloc_inp_embd && strcmp(name, "inp_embd") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + + memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur)); + } + + alloc_inp_embd = true; + } + + if (batch.pos && !alloc_inp_pos && strcmp(name, "inp_pos") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + + alloc_inp_pos = true; + } + + if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd_head = model.hparams.n_embd_head(); + ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + } + + alloc_inp_KQ_scale = true; + } + + if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_kv = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + + float * data = (float *) cur->data; + memset(data, 0, ggml_nbytes(cur)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_kv; ++i) { + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + + alloc_inp_KQ_mask = true; + } + + if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_ctx = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } + } + + alloc_inp_K_shift = true; + } + + // + // offload layers + // + // TODO: this code will be obsoleted with backend v2 - // offload layers - // TODO: this code will be obsoleted with backend v2 #ifdef GGML_USE_CUBLAS - const bool do_offload = true; + const bool do_offload = true; #else - const bool do_offload = false; + const bool do_offload = false; #endif - if (do_offload) { + if (!do_offload) { + return; + } + + // view tensors are not offloaded + if (cur->view_src != nullptr) { + return; + } + const int n_layer = model.hparams.n_layer; const int n_gpu_layers = model.n_gpu_layers; @@ -5558,7 +5602,7 @@ static struct ggml_cgraph * llama_build_graph( // should we offload the final norm? 
yes if we are not computing embeddings const bool offload_emb = lctx.embedding.empty(); - static const std::unordered_map> k_offload_func_name = { + static const std::unordered_map> k_offload_func_name = { { OFFLOAD_FUNC_NOP, "CPU" }, #ifdef GGML_USE_CUBLAS { OFFLOAD_FUNC, "GPU (CUDA)" }, @@ -5577,187 +5621,121 @@ static struct ggml_cgraph * llama_build_graph( #endif // GGML_USE_CUBLAS }; - static const std::unordered_map k_offload_func = { - { "KQ_mask", OFFLOAD_FUNC_KQ }, - { "KQ_pos", OFFLOAD_FUNC_KQ }, - { "K_shift", OFFLOAD_FUNC_KQ }, - { "K_shifted", OFFLOAD_FUNC_KQ }, + // check the global map for what offload function to use for this tensor + llm_offload_func_e func_e = k_offload_func_trie.find(cur->name); - { "inp_norm", OFFLOAD_FUNC_NR }, - { "inp_norm_w", OFFLOAD_FUNC_NR }, - { "inp_norm_wb", OFFLOAD_FUNC_NR }, - - { "rms_norm_0", OFFLOAD_FUNC }, - - { "attn_norm_0", OFFLOAD_FUNC }, - { "attn_norm_0_w", OFFLOAD_FUNC }, - { "attn_norm_0_wb", OFFLOAD_FUNC }, - - { "attn_norm_2", OFFLOAD_FUNC }, - { "attn_norm_2_w", OFFLOAD_FUNC }, - { "attn_norm_2_wb", OFFLOAD_FUNC }, - - { "wqkv", OFFLOAD_FUNC_KQ }, - { "bqkv", OFFLOAD_FUNC_KQ }, - { "wqkv_clamped", OFFLOAD_FUNC_KQ }, - - { "tmpk", OFFLOAD_FUNC_KQ }, - { "tmpq", OFFLOAD_FUNC_KQ }, - { "tmpv", OFFLOAD_FUNC_V }, - { "tmpkqv", OFFLOAD_FUNC_KQ }, // ?? - { "Kcur", OFFLOAD_FUNC_KQ }, - { "Qcur", OFFLOAD_FUNC_KQ }, - { "Vcur", OFFLOAD_FUNC_V }, - { "Vcur_0", OFFLOAD_FUNC_V }, - { "Vcur_1", OFFLOAD_FUNC_V }, - - { "krot", OFFLOAD_FUNC_KQ }, - { "qrot", OFFLOAD_FUNC_KQ }, - { "kpass", OFFLOAD_FUNC_KQ }, - { "qpass", OFFLOAD_FUNC_KQ }, - { "krotated", OFFLOAD_FUNC_KQ }, - { "qrotated", OFFLOAD_FUNC_KQ }, - - { "k", OFFLOAD_FUNC_KQ }, - { "v", OFFLOAD_FUNC_V }, - - { "Q", OFFLOAD_FUNC_KQ }, - { "K", OFFLOAD_FUNC_KQ }, - { "KQ", OFFLOAD_FUNC_KQ }, - { "KQ_scaled", OFFLOAD_FUNC_KQ }, - { "KQ_scaled_alibi", OFFLOAD_FUNC_KQ }, - { "KQ_masked", OFFLOAD_FUNC_KQ }, - { "KQ_soft_max", OFFLOAD_FUNC_V }, - { "V", OFFLOAD_FUNC_V }, - { "KQV", OFFLOAD_FUNC_V }, - { "KQV_merged", OFFLOAD_FUNC_V }, - { "KQV_merged_contiguous", OFFLOAD_FUNC_V }, - - { "result_wo", OFFLOAD_FUNC }, - { "result_wo_b", OFFLOAD_FUNC }, - { "inpL_+_result_wo", OFFLOAD_FUNC }, - - { "inpFF", OFFLOAD_FUNC }, - - { "rms_norm_1", OFFLOAD_FUNC }, - { "ffn_norm", OFFLOAD_FUNC }, - { "ffn_norm_0", OFFLOAD_FUNC }, - { "ffn_norm_0_w", OFFLOAD_FUNC }, - { "ffn_norm_0_wb", OFFLOAD_FUNC }, - - { "result_w3", OFFLOAD_FUNC }, - { "result_w3_b", OFFLOAD_FUNC }, - { "result_w2", OFFLOAD_FUNC }, - { "result_w2_b", OFFLOAD_FUNC }, - { "result_w1", OFFLOAD_FUNC }, - - { "silu", OFFLOAD_FUNC }, - { "gelu", OFFLOAD_FUNC }, - { "relu", OFFLOAD_FUNC }, - { "sqr(relu)", OFFLOAD_FUNC }, - - { "silu_x_result_w3", OFFLOAD_FUNC }, - { "inpFF_+_result_w2", OFFLOAD_FUNC }, - { "inpL_+_inpFF_+_result_w2", OFFLOAD_FUNC }, - - { "rms_norm_2", OFFLOAD_FUNC_NR }, - { "out_norm_0", OFFLOAD_FUNC_NR }, - { "out_norm_0_w", OFFLOAD_FUNC_NR }, - - { "result_norm", OFFLOAD_FUNC_EMB }, - { "result_output", OFFLOAD_FUNC_OUT }, - }; - - static llama_offload_trie k_offload_func_trie(k_offload_func); - - std::unordered_map ofn; - - for (int i = 0; i < result->n_nodes; ++i) { - struct ggml_tensor * cur = result->nodes[i]; - - // view tensors are not offloaded - if (cur->view_src != nullptr) { - continue; + if (func_e == OFFLOAD_FUNC_NOP) { + // if a tensor hasn't been offloaded, we warn the user + if (worst_case) { + LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, + cur->name, 
"https://github.com/ggerganov/llama.cpp/pull/3837"); } - offload_func_e func_e = k_offload_func_trie.find(cur->name); + return; + } - if (func_e == OFFLOAD_FUNC_NOP) { - // if a tensor hasn't been offloaded, we warn the user - if (worst_case) { - LLAMA_LOG_WARN("%s: node %4d %32s: not offloaded (ref: %s)\n", __func__, - i, cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); + // count the number of layers and respect the provided n_gpu_layers + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: + break; + case OFFLOAD_FUNC: + if (n_gpu_layers < n_layer) { + if (offload_n[cur->name]++ < i_gpu_start) { + func_e = OFFLOAD_FUNC_NOP; + } } + break; + case OFFLOAD_FUNC_NR: + if (n_gpu_layers <= n_layer + 0) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_V: + if (n_gpu_layers <= n_layer + 1) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_KQ: + if (n_gpu_layers <= n_layer + 2) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_EMB: + if (!offload_emb || n_gpu_layers < n_layer) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + default: GGML_ASSERT(false); + } - continue; - } - - // count the number of layers and respect the provided n_gpu_layers - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: break; - case OFFLOAD_FUNC: - if (n_gpu_layers < n_layer) { - if (ofn[cur->name]++ < i_gpu_start) { - func_e = OFFLOAD_FUNC_NOP; - } - } - break; - case OFFLOAD_FUNC_NR: - if (n_gpu_layers <= n_layer + 0) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - case OFFLOAD_FUNC_V: - if (n_gpu_layers <= n_layer + 1) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - case OFFLOAD_FUNC_KQ: - if (n_gpu_layers <= n_layer + 2) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - case OFFLOAD_FUNC_EMB: - if (!offload_emb || n_gpu_layers < n_layer) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - default: GGML_ASSERT(false); - } - - offload_func_t func = ggml_offload_nop; + offload_func_t func = ggml_offload_nop; + // this is needed for compatibility with Metal for example #ifdef GGML_USE_CUBLAS - static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; + static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; #else - static offload_func_t ggml_offload_gpu = ggml_offload_nop; + static offload_func_t ggml_offload_gpu = ggml_offload_nop; #endif - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; - case OFFLOAD_FUNC: - case OFFLOAD_FUNC_KQ: - case OFFLOAD_FUNC_V: - case OFFLOAD_FUNC_NR: - case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; - default: GGML_ASSERT(false); - } - - // apply offload function to the tensor - func(cur); - - if (worst_case) { - LLAMA_LOG_INFO("%s: node %4d %32s: %s\n", __func__, i, cur->name, k_offload_func_name.at(func_e).c_str()); - } + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; + case OFFLOAD_FUNC: + case OFFLOAD_FUNC_KQ: + case OFFLOAD_FUNC_V: + case OFFLOAD_FUNC_NR: + case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; + default: GGML_ASSERT(false); } + + // apply offload function to the tensor + func(cur); + + if (worst_case) { + LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); + } + }; + + struct ggml_cgraph * result = NULL; + + switch (model.arch) { + case LLM_ARCH_LLAMA: + { + result = llm_build_llama(lctx, batch, cb, worst_case); + } break; + case LLM_ARCH_BAICHUAN: + { + result = llm_build_baichaun(lctx, batch, cb, 
worst_case); + } break; + case LLM_ARCH_FALCON: + { + result = llm_build_falcon(lctx, batch, cb, worst_case); + } break; + case LLM_ARCH_STARCODER: + { + result = llm_build_starcoder(lctx, batch, cb, worst_case); + } break; + case LLM_ARCH_PERSIMMON: + { + result = llm_build_persimmon(lctx, batch, cb, worst_case); + } break; + case LLM_ARCH_REFACT: + { + result = llm_build_refact(lctx, batch, cb, worst_case); + } break; + case LLM_ARCH_BLOOM: + { + result = llm_build_bloom(lctx, batch, cb, worst_case); + } break; + case LLM_ARCH_MPT: + { + result = llm_build_mpt(lctx, batch, cb, worst_case); + } break; + default: + GGML_ASSERT(false); } - //auto t_end = std::chrono::high_resolution_clock::now(); - - //printf("offload time: %f ms\n", std::chrono::duration(t_end - t_start).count()); - return result; } From 8925cf9ef8216621de37d41b787d25305c9d556d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 13:22:15 +0200 Subject: [PATCH 16/20] llama : add layer index to all tensor names --- llama.cpp | 723 +++++++++++++++++++++++++++--------------------------- 1 file changed, 365 insertions(+), 358 deletions(-) diff --git a/llama.cpp b/llama.cpp index fcb75716f..83816e33a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3090,7 +3090,7 @@ static bool llama_model_load( return true; } -using llm_build_cb = std::function; +using llm_build_cb = std::function; static struct ggml_cgraph * llm_build_llama( llama_context & lctx, @@ -3144,7 +3144,7 @@ static struct ggml_cgraph * llm_build_llama( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens", -1); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -3154,24 +3154,24 @@ static struct ggml_cgraph * llm_build_llama( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - cb(inpL, "inp_embd"); + cb(inpL, "inp_embd", -1); // inp_pos - contains the positions struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos"); + cb(inp_pos, "inp_pos", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift"); + cb(K_shift, "K_shift", -1); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -3182,7 +3182,7 @@ static struct ggml_cgraph * llm_build_llama( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - cb(tmp, "K_shifted"); + cb(tmp, "K_shifted", il); ggml_build_forward_expand(gf, tmp); } } @@ -3193,45 +3193,45 @@ static struct ggml_cgraph * llm_build_llama( // norm { cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0"); + cb(cur, "rms_norm_0", il); // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0"); + cb(cur, "attn_norm_0", il); } // self-attention { // compute Q and K and RoPE them struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk"); + cb(tmpk, "tmpk", il); struct 
ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq"); + cb(tmpq, "tmpq", il); struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - cb(Kcur, "Kcur"); + cb(Kcur, "Kcur", il); struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - cb(Qcur, "Qcur"); + cb(Qcur, "Qcur", il); // store key and value to memory { // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv"); + cb(tmpv, "tmpv", il); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur"); + cb(Vcur, "Vcur", il); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k"); + cb(k, "k", il); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v"); + cb(v, "v", il); // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -3239,7 +3239,7 @@ static struct ggml_cgraph * llm_build_llama( } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q"); + cb(Q, "Q", il); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -3247,24 +3247,24 @@ static struct ggml_cgraph * llm_build_llama( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K"); + cb(K, "K", il); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ"); + cb(KQ, "KQ", il); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_kv, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled", il); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked", il); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max", il); // split cached V into n_head heads struct ggml_tensor * V = @@ -3273,11 +3273,11 @@ static struct ggml_cgraph * llm_build_llama( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V"); + cb(V, "V", il); #if 1 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV"); + cb(KQV, "KQV", il); #else // make V contiguous in memory to speed up the matmul, however we waste time on the copy // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation @@ -3288,59 +3288,59 @@ static struct ggml_cgraph * llm_build_llama( // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged", il); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous", il); // 
projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo"); + cb(cur, "result_wo", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF"); + cb(inpFF, "inpFF", il); // feed-forward network { // norm { cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1"); + cb(cur, "rms_norm_1", il); // cur = cur*ffn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm"); + cb(cur, "ffn_norm", il); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(tmp, "result_w3"); + cb(tmp, "result_w3", il); cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); - cb(cur, "result_w1"); + cb(cur, "result_w1", il); // SILU activation cur = ggml_silu(ctx0, cur); - cb(cur, "silu"); + cb(cur, "silu", il); cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3"); + cb(cur, "silu_x_result_w3", il); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2"); + cb(cur, "result_w2", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2", il); // input for next layer inpL = cur; @@ -3351,16 +3351,16 @@ static struct ggml_cgraph * llm_build_llama( // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2"); + cb(cur, "rms_norm_2", -1); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm"); + cb(cur, "result_norm", -1); } // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3419,7 +3419,7 @@ static struct ggml_cgraph * llm_build_baichaun( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens", -1); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -3429,24 +3429,24 @@ static struct ggml_cgraph * llm_build_baichaun( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - cb(inpL, "inp_embd"); + cb(inpL, "inp_embd", -1); // inp_pos - contains the positions struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos"); + cb(inp_pos, "inp_pos", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift"); + cb(K_shift, "K_shift", -1); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -3457,7 +3457,7 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - cb(tmp, "K_shifted"); + cb(tmp, "K_shifted", il); ggml_build_forward_expand(gf, tmp); } } @@ -3468,21 +3468,21 @@ static struct ggml_cgraph * llm_build_baichaun( // norm { cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0"); + cb(cur, "rms_norm_0", il); // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0"); + cb(cur, 
"attn_norm_0", il); } // self-attention { // compute Q and K and RoPE them struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk"); + cb(tmpk, "tmpk", il); struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq"); + cb(tmpq, "tmpq", il); struct ggml_tensor * Kcur; struct ggml_tensor * Qcur; @@ -3499,27 +3499,27 @@ static struct ggml_cgraph * llm_build_baichaun( GGML_ASSERT(false); } - cb(Kcur, "Kcur"); + cb(Kcur, "Kcur", il); - cb(Qcur, "Qcur"); + cb(Qcur, "Qcur", il); // store key and value to memory { // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv"); + cb(tmpv, "tmpv", il); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur"); + cb(Vcur, "Vcur", il); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k"); + cb(k, "k", il); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v"); + cb(v, "v", il); // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -3527,7 +3527,7 @@ static struct ggml_cgraph * llm_build_baichaun( } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q"); + cb(Q, "Q", il); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -3535,16 +3535,16 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K"); + cb(K, "K", il); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ"); + cb(KQ, "KQ", il); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled", il); struct ggml_tensor * KQ_masked; struct ggml_tensor * KQ_scaled_alibi; @@ -3556,7 +3556,7 @@ static struct ggml_cgraph * llm_build_baichaun( case MODEL_13B: // TODO: replace with ggml_add() KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - cb(KQ_scaled_alibi, "KQ_scaled_alibi"); + cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); break; default: @@ -3565,7 +3565,7 @@ static struct ggml_cgraph * llm_build_baichaun( // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max", il); // split cached V into n_head heads struct ggml_tensor * V = @@ -3574,66 +3574,66 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V"); + cb(V, "V", il); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV"); + cb(KQV, "KQV", il); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged", il); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, 
n_tokens); - cb(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous", il); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo"); + cb(cur, "result_wo", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF"); + cb(inpFF, "inpFF", il); // feed-forward network { // norm { cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1"); + cb(cur, "rms_norm_1", il); // cur = cur*ffn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm"); + cb(cur, "ffn_norm", il); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(tmp, "result_w3"); + cb(tmp, "result_w3", il); cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); - cb(cur, "result_w1"); + cb(cur, "result_w1", il); // SILU activation cur = ggml_silu(ctx0, cur); - cb(cur, "silu"); + cb(cur, "silu", il); cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3"); + cb(cur, "silu_x_result_w3", il); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2"); + cb(cur, "result_w2", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2", il); // input for next layer inpL = cur; @@ -3644,16 +3644,16 @@ static struct ggml_cgraph * llm_build_baichaun( // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2"); + cb(cur, "rms_norm_2", -1); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm"); + cb(cur, "result_norm", -1); } // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3715,7 +3715,7 @@ static struct ggml_cgraph * llm_build_falcon( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens", -1); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -3725,24 +3725,24 @@ static struct ggml_cgraph * llm_build_falcon( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - cb(inpL, "inp_embd"); + cb(inpL, "inp_embd", -1); // inp_pos - contains the positions struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos"); + cb(inp_pos, "inp_pos", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask", -1); // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift"); + cb(K_shift, "K_shift", -1); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -3753,7 +3753,7 @@ static struct ggml_cgraph * llm_build_falcon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 2, 0, freq_base, freq_scale); - cb(tmp, "K_shifted"); + cb(tmp, "K_shifted", il); ggml_build_forward_expand(gf, tmp); } } @@ -3765,23 +3765,23 @@ static struct ggml_cgraph * llm_build_falcon( // TODO: refactor into common function (shared with LLaMA) { attn_norm = ggml_norm(ctx0, inpL, norm_eps); - cb(attn_norm, "attn_norm_0"); + cb(attn_norm, 
"attn_norm_0", il); attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - cb(attn_norm, "attn_norm_0_w"); + cb(attn_norm, "attn_norm_0_w", il); attn_norm = ggml_add(ctx0, attn_norm, model.layers[il].attn_norm_b); - cb(attn_norm, "attn_norm_0_wb"); + cb(attn_norm, "attn_norm_0_wb", il); if (model.layers[il].attn_norm_2) { // Falcon-40B cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_2"); + cb(cur, "attn_norm_2", il); cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm_2); - cb(cur, "attn_norm_2_w"); + cb(cur, "attn_norm_2_w", il); cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_2_b); - cb(cur, "attn_norm_2_wb"); + cb(cur, "attn_norm_2_wb", il); } else { // Falcon 7B cur = attn_norm; } @@ -3789,7 +3789,7 @@ static struct ggml_cgraph * llm_build_falcon( // compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv"); + cb(cur, "wqkv", il); // Note that the strides for Kcur, Vcur are set up so that the // resulting views are misaligned with the tensor's storage @@ -3809,50 +3809,50 @@ static struct ggml_cgraph * llm_build_falcon( wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0)); - cb(tmpq, "tmpq"); + cb(tmpq, "tmpq", il); struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head)); - cb(tmpk, "tmpk"); + cb(tmpk, "tmpk", il); struct ggml_tensor * tmpv = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * (n_head + n_head_kv)); - cb(tmpv, "tmpv"); + cb(tmpv, "tmpv", il); // using mode = 2 for neox mode struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); - cb(Qcur, "Qcur"); + cb(Qcur, "Qcur", il); struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); - cb(Kcur, "Kcur"); + cb(Kcur, "Kcur", il); { struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); - cb(Vcur, "Vcur_0"); + cb(Vcur, "Vcur_0", il); Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur_1"); + cb(Vcur, "Vcur_1", il); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k"); + cb(k, "k", il); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v"); + cb(v, "v", il); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q"); + cb(Q, "Q", il); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -3860,19 +3860,19 @@ static struct ggml_cgraph * llm_build_falcon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K"); + cb(K, "K", il); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ"); + cb(KQ, "KQ", il); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled", il); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked", il); struct 
ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max", il); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -3880,19 +3880,19 @@ static struct ggml_cgraph * llm_build_falcon( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V"); + cb(V, "V", il); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV"); + cb(KQV, "KQV", il); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged", il); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous", il); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo"); + cb(cur, "result_wo", il); } struct ggml_tensor * attn_out = cur; @@ -3902,20 +3902,20 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * inpFF = attn_norm; cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - cb(cur, "result_w3"); + cb(cur, "result_w3", il); cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu"); + cb(cur, "gelu", il); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2"); + cb(cur, "result_w2", il); } cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2", il); cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_inpFF_+_result_w2"); + cb(cur, "inpL_+_inpFF_+_result_w2", il); // input for next layer inpL = cur; @@ -3926,17 +3926,17 @@ static struct ggml_cgraph * llm_build_falcon( // norm { cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0"); + cb(cur, "out_norm_0", -1); cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w"); + cb(cur, "out_norm_0_w", -1); cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm"); + cb(cur, "result_norm", -1); } cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -3993,7 +3993,7 @@ static struct ggml_cgraph * llm_build_starcoder( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens", -1); embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -4003,75 +4003,75 @@ static struct ggml_cgraph * llm_build_starcoder( embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - cb(embd, "inp_embd"); + cb(embd, "inp_embd", -1); // inp_pos - contains the positions struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos"); + cb(inp_pos, "inp_pos", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask", -1); pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos); inpL = ggml_add(ctx0, embd, pos); - cb(inpL, "inpL"); + cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { { // Norm cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0"); + cb(cur, "attn_norm_0", il); cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w"); + cb(cur, "attn_norm_0_w", il); cur = 
ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb"); + cb(cur, "attn_norm_0_wb", il); } { // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv"); + cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv"); + cb(cur, "bqkv", il); struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - cb(tmpq, "tmpq"); - cb(tmpk, "tmpk"); - cb(tmpv, "tmpv"); + cb(tmpq, "tmpq", il); + cb(tmpk, "tmpk", il); + cb(tmpv, "tmpv", il); struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); struct ggml_tensor * Kcur = tmpk; { struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv); - cb(Vcur, "Vcur"); + cb(Vcur, "Vcur", il); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k"); + cb(k, "k", il); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v"); + cb(v, "v", il); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q"); + cb(Q, "Q", il); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -4079,24 +4079,24 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K"); + cb(K, "K", il); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ"); + cb(KQ, "KQ", il); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled", il); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked", il); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max", il); // split cached V into n_head heads struct ggml_tensor * V = @@ -4105,25 +4105,25 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V"); + cb(V, "V", il); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV"); + cb(KQV, "KQV", il); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged", il); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous", il); } // Projection cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - cb(cur, "result_wo"); + cb(cur, "result_wo", il); // Add the input cur = ggml_add(ctx0, cur, inpL); - 
cb(cur, "inpL_+_result_wo"); + cb(cur, "inpL_+_result_wo", il); struct ggml_tensor * inpFF = cur; @@ -4132,28 +4132,28 @@ static struct ggml_cgraph * llm_build_starcoder( // Norm { cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0"); + cb(cur, "ffn_norm_0", il); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w"); + cb(cur, "ffn_norm_0_w", il); cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb"); + cb(cur, "ffn_norm_0_wb", il); } cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); - cb(cur, "result_w3"); + cb(cur, "result_w3", il); // GELU activation cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu"); + cb(cur, "gelu", il); // Projection cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2"); + cb(cur, "result_w2", il); cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b"); + cb(cur, "result_w2_b", il); } inpL = ggml_add(ctx0, cur, inpFF); @@ -4163,17 +4163,17 @@ static struct ggml_cgraph * llm_build_starcoder( // Output Norm { cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "out_norm_0"); + cb(cur, "out_norm_0", -1); cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w"); + cb(cur, "out_norm_0_w", -1); cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm"); + cb(cur, "result_norm", -1); } cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); ggml_free(ctx0); @@ -4229,27 +4229,27 @@ static struct ggml_cgraph * llm_build_persimmon( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens", -1); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - cb(inpL, "imp_embd"); + cb(inpL, "imp_embd", -1); struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_pos, "inp_pos"); + cb(inp_pos, "inp_pos", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale", -1); struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift"); + cb(K_shift, "K_shift", -1); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -4262,7 +4262,7 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il) ), K_shift, n_rot, 2, 0, freq_base, freq_scale); - cb(tmp, "K_shifted"); + cb(tmp, "K_shifted", il); ggml_build_forward_expand(gf, tmp); } } @@ -4272,31 +4272,31 @@ static struct ggml_cgraph * llm_build_persimmon( { cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0"); + cb(cur, "attn_norm_0", il); cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w"); + cb(cur, "attn_norm_0_w", il); cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb"); + cb(cur, "attn_norm_0_wb", il); } // self attention { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv"); + cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv"); + cb(cur, "bqkv", il); // split qkv GGML_ASSERT(n_head_kv == 
n_head); struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); - cb(tmpqkv, "tmpqkv"); + cb(tmpqkv, "tmpqkv", il); struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); - cb(tmpqkv_perm, "tmpqkv"); + cb(tmpqkv_perm, "tmpqkv", il); struct ggml_tensor * tmpq = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, @@ -4304,7 +4304,7 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, 0 ); - cb(tmpq, "tmpq"); + cb(tmpq, "tmpq", il); struct ggml_tensor * tmpk = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, @@ -4312,26 +4312,26 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens ); - cb(tmpk, "tmpk"); + cb(tmpk, "tmpk", il); // Q/K Layernorm tmpq = ggml_norm(ctx0, tmpq, norm_eps); - cb(tmpq, "tmpq"); + cb(tmpq, "tmpq", il); tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); - cb(tmpq, "tmpq"); + cb(tmpq, "tmpq", il); tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - cb(tmpq, "tmpq"); + cb(tmpq, "tmpq", il); tmpk = ggml_norm(ctx0, tmpk, norm_eps); - cb(tmpk, "tmpk"); + cb(tmpk, "tmpk", il); tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - cb(tmpk, "tmpk"); + cb(tmpk, "tmpk", il); tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); - cb(tmpk, "tmpk"); + cb(tmpk, "tmpk", il); // RoPE the first n_rot of q/k, pass the other half, and concat. struct ggml_tensor * qrot = ggml_view_3d( @@ -4340,7 +4340,7 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpq) * n_embd_head * n_head, 0 ); - cb(qrot, "qrot"); + cb(qrot, "qrot", il); struct ggml_tensor * krot = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, @@ -4348,7 +4348,7 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpk) * n_embd_head * n_head, 0 ); - cb(krot, "krot"); + cb(krot, "krot", il); // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( @@ -4357,7 +4357,7 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpq) * n_embd_head * n_head, ggml_element_size(tmpq) * n_rot ); - cb(qpass, "qpass"); + cb(qpass, "qpass", il); struct ggml_tensor * kpass = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, @@ -4365,43 +4365,43 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpk) * n_embd_head * n_head, ggml_element_size(tmpk) * n_rot ); - cb(kpass, "kpass"); + cb(kpass, "kpass", il); struct ggml_tensor * qrotated = ggml_rope_custom( ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale ); - cb(qrotated, "qrotated"); + cb(qrotated, "qrotated", il); struct ggml_tensor * krotated = ggml_rope_custom( ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale ); - cb(krotated, "krotated"); + cb(krotated, "krotated", il); // ggml currently only supports concatenation on dim=2 // so we need to permute qrot, qpass, concat, then permute back. 
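                // (the (2, 1, 0, 3) permutes below move the rotary dimension of qrot/qpass and
                //  krot/kpass to dim=2, ggml_concat joins the rotated and pass-through halves
                //  there, and the results are permuted back into Qcur/Kcur afterwards)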
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); - cb(qrotated, "qrotated"); + cb(qrotated, "qrotated", il); krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3)); - cb(krotated, "krotated"); + cb(krotated, "krotated", il); qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); - cb(qpass, "qpass"); + cb(qpass, "qpass", il); kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); - cb(kpass, "kpass"); + cb(kpass, "kpass", il); struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass); - cb(Qcur, "Qcur"); + cb(Qcur, "Qcur", il); struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass); - cb(Kcur, "Kcur"); + cb(Kcur, "Kcur", il); struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); - cb(Q, "Q"); + cb(Q, "Q", il); Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); - cb(Kcur, "Kcur"); + cb(Kcur, "Kcur", il); { struct ggml_tensor * tmpv = ggml_view_3d( @@ -4410,22 +4410,22 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 ); - cb(tmpv, "tmpv"); + cb(tmpv, "tmpv", il); // store K, V in cache struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur"); + cb(Vcur, "Vcur", il); struct ggml_tensor * k = ggml_view_1d( ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) ); - cb(k, "k"); + cb(k, "k", il); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v"); + cb(v, "v", il); // important: storing RoPE-ed version of K in the KV cache! 
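                    // (this is also why a context shift has to re-rotate the cached K: the K_shift /
                    //  "K_shifted" pass at the top of this function does exactly that whenever
                    //  do_rope_shift is set)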
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -4436,19 +4436,19 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K"); + cb(K, "K", il); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ"); + cb(KQ, "KQ", il); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled", il); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked", il); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max", il); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -4456,62 +4456,62 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V"); + cb(V, "V", il); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV"); + cb(KQV, "KQV", il); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged", il); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous", il); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo"); + cb(cur, "result_wo", il); cur = ggml_add(ctx0, cur, model.layers[il].bo); - cb(cur, "result_wo_b"); + cb(cur, "result_wo_b", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); - cb(inpFF, "inpFF"); + cb(inpFF, "inpFF", il); { // MLP { // Norm cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0"); + cb(cur, "ffn_norm_0", il); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w"); + cb(cur, "ffn_norm_0_w", il); cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb"); + cb(cur, "ffn_norm_0_wb", il); } cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3"); + cb(cur, "result_w3", il); cur = ggml_add(ctx0, cur, model.layers[il].b3); - cb(cur, "result_w3_b"); + cb(cur, "result_w3_b", il); cur = ggml_relu(ctx0, cur); - cb(cur, "relu"); + cb(cur, "relu", il); cur = ggml_sqr(ctx0, cur); - cb(cur, "sqr(relu)"); + cb(cur, "sqr(relu)", il); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2"); + cb(cur, "result_w2", il); cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b"); + cb(cur, "result_w2_b", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2", il); inpL = cur; } @@ -4520,17 +4520,17 @@ static struct ggml_cgraph * llm_build_persimmon( { cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0"); + cb(cur, "out_norm_0", -1); cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w"); + cb(cur, "out_norm_0_w", -1); cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm"); + cb(cur, "result_norm", -1); } cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4585,7 +4585,7 @@ static struct ggml_cgraph * llm_build_refact( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens"); + cb(inp_tokens, 
"inp_tokens", -1); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -4595,15 +4595,15 @@ static struct ggml_cgraph * llm_build_refact( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - cb(inpL, "inp_embd"); + cb(inpL, "inp_embd", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask", -1); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; @@ -4611,52 +4611,52 @@ static struct ggml_cgraph * llm_build_refact( // norm { cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0"); + cb(cur, "rms_norm_0", il); // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0"); + cb(cur, "attn_norm_0", il); } // self-attention { // compute Q and K struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk"); + cb(tmpk, "tmpk", il); struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq"); + cb(tmpq, "tmpq", il); struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur"); + cb(Kcur, "Kcur", il); struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - cb(Qcur, "Qcur"); + cb(Qcur, "Qcur", il); // store key and value to memory { // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv"); + cb(tmpv, "tmpv", il); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur"); + cb(Vcur, "Vcur", il); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k"); + cb(k, "k", il); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v"); + cb(v, "v", il); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q"); + cb(Q, "Q", il); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -4664,27 +4664,27 @@ static struct ggml_cgraph * llm_build_refact( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K"); + cb(K, "K", il); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ"); + cb(KQ, "KQ", il); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_kv, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled", il); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - cb(KQ_scaled_alibi, "KQ_scaled_alibi"); + cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked", il); // KQ = soft_max(KQ_masked) struct 
ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max", il); // split cached V into n_head heads struct ggml_tensor * V = @@ -4693,66 +4693,66 @@ static struct ggml_cgraph * llm_build_refact( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V"); + cb(V, "V", il); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV"); + cb(KQV, "KQV", il); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged", il); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous", il); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo"); + cb(cur, "result_wo", il); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF"); + cb(inpFF, "inpFF", il); // feed-forward network { // norm { cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1"); + cb(cur, "rms_norm_1", il); // cur = cur*ffn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm"); + cb(cur, "ffn_norm", il); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(tmp, "result_w3"); + cb(tmp, "result_w3", il); cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); - cb(cur, "result_w1"); + cb(cur, "result_w1", il); // SILU activation cur = ggml_silu(ctx0, cur); - cb(cur, "silu"); + cb(cur, "silu", il); cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3"); + cb(cur, "silu_x_result_w3", il); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2"); + cb(cur, "result_w2", il); } cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2", il); // input for next layer inpL = cur; @@ -4763,16 +4763,16 @@ static struct ggml_cgraph * llm_build_refact( // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2"); + cb(cur, "rms_norm_2", -1); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm"); + cb(cur, "result_norm", -1); } // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -4830,7 +4830,7 @@ static struct ggml_cgraph * llm_build_bloom( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens", -1); embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -4840,56 +4840,56 @@ static struct ggml_cgraph * llm_build_bloom( embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - cb(embd, "inp_embd"); + cb(embd, "inp_embd", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask", -1); // norm { inpL = ggml_norm(ctx0, embd, norm_eps); - cb(inpL, "inp_norm"); + cb(inpL, "inp_norm", -1); inpL = ggml_mul(ctx0, inpL, model.tok_norm); - 
cb(inpL, "inp_norm_w"); + cb(inpL, "inp_norm_w", -1); inpL = ggml_add (ctx0, inpL, model.tok_norm_b); - cb(inpL, "inp_norm_wb"); + cb(inpL, "inp_norm_wb", -1); } for (int il = 0; il < n_layer; ++il) { { // Norm cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0"); + cb(cur, "attn_norm_0", il); cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w"); + cb(cur, "attn_norm_0_w", il); cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb"); + cb(cur, "attn_norm_0_wb", il); } { // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv"); + cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - cb(cur, "bqkv"); + cb(cur, "bqkv", il); struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - cb(tmpq, "tmpq"); - cb(tmpk, "tmpk"); - cb(tmpv, "tmpv"); + cb(tmpq, "tmpq", il); + cb(tmpk, "tmpk", il); + cb(tmpv, "tmpv", il); struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); struct ggml_tensor * Kcur = tmpk; @@ -4897,15 +4897,15 @@ static struct ggml_cgraph * llm_build_bloom( // store key and value to memory { struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur"); + cb(Vcur, "Vcur", il); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k"); + cb(k, "k", il); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v"); + cb(v, "v", il); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); @@ -4917,7 +4917,7 @@ static struct ggml_cgraph * llm_build_bloom( Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), 0, 2, 1, 3); - cb(Q, "Q"); + cb(Q, "Q", il); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -4925,27 +4925,27 @@ static struct ggml_cgraph * llm_build_bloom( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K"); + cb(K, "K", il); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ"); + cb(KQ, "KQ", il); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled", il); struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); - cb(KQ_scaled_alibi, "KQ_scaled_alibi"); + cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked", il); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max", il); // split 
cached V into n_head heads struct ggml_tensor * V = @@ -4954,30 +4954,30 @@ static struct ggml_cgraph * llm_build_bloom( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V"); + cb(V, "V", il); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV"); + cb(KQV, "KQV", il); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged", il); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous", il); } // Projection cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo"); + cb(cur, "result_wo", il); cur = ggml_add(ctx0, cur, model.layers[il].bo); - cb(cur, "result_wo_b"); + cb(cur, "result_wo_b", il); // Add the input cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_result_wo"); + cb(cur, "inpL_+_result_wo", il); struct ggml_tensor * inpFF = cur; @@ -4986,49 +4986,49 @@ static struct ggml_cgraph * llm_build_bloom( // Norm { cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0"); + cb(cur, "ffn_norm_0", il); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w"); + cb(cur, "ffn_norm_0_w", il); cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb"); + cb(cur, "ffn_norm_0_wb", il); } cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3"); + cb(cur, "result_w3", il); cur = ggml_add(ctx0, cur, model.layers[il].b3); - cb(cur, "result_w3_b"); + cb(cur, "result_w3_b", il); cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu"); + cb(cur, "gelu", il); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2"); + cb(cur, "result_w2", il); cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b"); + cb(cur, "result_w2_b", il); } inpL = ggml_add(ctx0, cur, inpFF); - cb(inpL, "inpFF_+_result_w2"); + cb(inpL, "inpFF_+_result_w2", il); } // Output Norm { cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "out_norm_0"); + cb(cur, "out_norm_0", -1); cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w"); + cb(cur, "out_norm_0_w", -1); cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm"); + cb(cur, "result_norm", -1); } cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5085,7 +5085,7 @@ static struct ggml_cgraph * llm_build_mpt( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens", -1); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -5095,15 +5095,15 @@ static struct ggml_cgraph * llm_build_mpt( inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } - cb(inpL, "inp_embd"); + cb(inpL, "inp_embd", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - cb(KQ_scale, "KQ_scale"); + cb(KQ_scale, "KQ_scale", -1); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - cb(KQ_mask, "KQ_mask"); + cb(KQ_mask, "KQ_mask", -1); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -5112,10 +5112,10 @@ static 
struct ggml_cgraph * llm_build_mpt( // TODO: refactor into common function (shared with LLaMA) { attn_norm = ggml_norm(ctx0, inpL, norm_eps); - cb(attn_norm, "attn_norm_0"); + cb(attn_norm, "attn_norm_0", il); attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - cb(attn_norm, "attn_norm_0_w"); + cb(attn_norm, "attn_norm_0_w", il); if (1) { cur = attn_norm; @@ -5124,11 +5124,11 @@ static struct ggml_cgraph * llm_build_mpt( // compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - cb(cur, "wqkv"); + cb(cur, "wqkv", il); if (clamp_kqv > 0.0f) { cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv); - cb(cur, "wqkv_clamped"); + cb(cur, "wqkv_clamped", il); } const size_t wsize = ggml_type_size(cur->type); @@ -5138,43 +5138,43 @@ static struct ggml_cgraph * llm_build_mpt( wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0); - cb(Qcur, "Qcur"); + cb(Qcur, "Qcur", il); struct ggml_tensor * Kcur = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head); - cb(Kcur, "Kcur"); + cb(Kcur, "Kcur", il); struct ggml_tensor * tmpv = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * (n_head + n_head_kv)); - cb(tmpv, "tmpv"); + cb(tmpv, "tmpv", il); { struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); - cb(Vcur, "Vcur"); + cb(Vcur, "Vcur", il); Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur"); + cb(Vcur, "Vcur", il); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k"); + cb(k, "k", il); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v"); + cb(v, "v", il); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q"); + cb(Q, "Q", il); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -5182,23 +5182,23 @@ static struct ggml_cgraph * llm_build_mpt( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K"); + cb(K, "K", il); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ"); + cb(KQ, "KQ", il); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled", il); // TODO: replace with ggml_add() struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); - cb(KQ_scaled_alibi, "KQ_scaled_alibi"); + cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked", il); struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max", il); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -5206,24 +5206,24 @@ static struct ggml_cgraph * llm_build_mpt( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V"); + cb(V, "V", il); struct ggml_tensor * KQV = 
ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV"); + cb(KQV, "KQV", il); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged", il); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous", il); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo"); + cb(cur, "result_wo", il); } // Add the input cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_result_wo"); + cb(cur, "inpL_+_result_wo", il); struct ggml_tensor * attn_out = cur; @@ -5232,24 +5232,24 @@ static struct ggml_cgraph * llm_build_mpt( // Norm { cur = ggml_norm(ctx0, attn_out, norm_eps); - cb(cur, "ffn_norm_0"); + cb(cur, "ffn_norm_0", il); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w"); + cb(cur, "ffn_norm_0_w", il); } cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3"); + cb(cur, "result_w3", il); cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu"); + cb(cur, "gelu", il); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2"); + cb(cur, "result_w2", il); } cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpL_+_inpFF_+_result_w2"); + cb(cur, "inpL_+_inpFF_+_result_w2", il); // input for next layer inpL = cur; @@ -5260,14 +5260,14 @@ static struct ggml_cgraph * llm_build_mpt( // norm { cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0"); + cb(cur, "out_norm_0", -1); cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm"); + cb(cur, "result_norm", -1); } cur = ggml_mul_mat(ctx0, model.output, cur); - cb(cur, "result_output"); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); @@ -5360,10 +5360,13 @@ struct llm_offload_trie { }; static const std::unordered_map k_offload_map = { - { "inp_pos", OFFLOAD_FUNC_KQ }, - { "KQ_mask", OFFLOAD_FUNC_KQ }, - { "K_shift", OFFLOAD_FUNC_KQ }, - { "K_shifted", OFFLOAD_FUNC_KQ }, + //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel + //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel + { "inp_pos", OFFLOAD_FUNC_NR }, + + { "KQ_mask", OFFLOAD_FUNC_NR }, + { "K_shift", OFFLOAD_FUNC_NR }, + { "K_shifted", OFFLOAD_FUNC_NR }, { "inp_norm", OFFLOAD_FUNC_NR }, { "inp_norm_w", OFFLOAD_FUNC_NR }, @@ -5472,8 +5475,12 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_K_shift = false; // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
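    // the callback now also receives the layer index: cb(cur, "Qcur", il) names the tensor "Qcur-<il>" via
    // ggml_format_name, while graph-wide tensors pass il == -1 and keep their bare name (e.g. "inp_embd",
    // "result_norm"); the offload table is then queried with the bare name instead of cur->name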
- llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name) { - ggml_set_name(cur, name); + llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { + if (il >= 0) { + ggml_format_name(cur, "%s-%d", name, il); + } else { + ggml_set_name(cur, name); + } // // allocate input tensors and set input data @@ -5622,7 +5629,7 @@ static struct ggml_cgraph * llama_build_graph( }; // check the global map for what offload function to use for this tensor - llm_offload_func_e func_e = k_offload_func_trie.find(cur->name); + llm_offload_func_e func_e = k_offload_func_trie.find(name); if (func_e == OFFLOAD_FUNC_NOP) { // if a tensor hasn't been offloaded, we warn the user @@ -5641,7 +5648,7 @@ static struct ggml_cgraph * llama_build_graph( break; case OFFLOAD_FUNC: if (n_gpu_layers < n_layer) { - if (offload_n[cur->name]++ < i_gpu_start) { + if (offload_n[name]++ < i_gpu_start) { func_e = OFFLOAD_FUNC_NOP; } } From 761087932b9d76efc4f9b92fc207faddfa4f55e7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 13:26:23 +0200 Subject: [PATCH 17/20] llama : add functional header --- llama.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 83816e33a..d80062082 100644 --- a/llama.cpp +++ b/llama.cpp @@ -62,7 +62,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -71,11 +73,10 @@ #include #include #include +#include #include #include #include -#include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data From 79ad73441790d4fcf7e1f1b8e8eed6cc7a6d6ca7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 13:27:53 +0200 Subject: [PATCH 18/20] llama : comment ggml-ci --- llama.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llama.cpp b/llama.cpp index d80062082..f30a98ba1 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5277,6 +5277,11 @@ static struct ggml_cgraph * llm_build_mpt( return gf; } +// +// tensor offloading helpers +// +// TODO: will be removed with backend v2 + enum llm_offload_func_e { OFFLOAD_FUNC_NOP, OFFLOAD_FUNC, From 210e6e5d0264f18dac942634d67fe86c97d05b25 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 29 Oct 2023 13:39:04 +0200 Subject: [PATCH 19/20] llama : remove obsolete map for layer counting --- llama.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index f30a98ba1..bad25de4b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5469,9 +5469,6 @@ static struct ggml_cgraph * llama_build_graph( // check if we should build the worst-case graph (for memory measurement) const bool worst_case = ggml_allocr_is_measure(lctx.alloc); - // count the number of times a tensor with a given name has been offloaded - std::unordered_map offload_n; - // keep track of the input that has already been allocated bool alloc_inp_tokens = false; bool alloc_inp_embd = false; @@ -5654,7 +5651,7 @@ static struct ggml_cgraph * llama_build_graph( break; case OFFLOAD_FUNC: if (n_gpu_layers < n_layer) { - if (offload_n[name]++ < i_gpu_start) { + if (il < i_gpu_start) { func_e = OFFLOAD_FUNC_NOP; } } From 5baefef4972ac04b05a4777619f77f330f05dc2f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 31 Oct 2023 19:23:12 +0200 Subject: [PATCH 20/20] llama : add llm_build helper functions (#3848) * llama : add llm_build_norm helper function ggml-ci * llama : add llm_build_ffn helper function (#3849) ggml-ci * llama : add llm_build_k_shift helper ggml-ci * llama : fix offloading 
after recent changes * llama : add llm_build_kv_store helper ggml-ci * llama : remove obsolete offload names * llama : fix llm_build_k_shift to use n_head_kv instead of n_head * llama : simplify falcon Q, K, V computation * llama : remove obsolete comments in build graphs * llama : add llm_build_kqv helper ggml-ci * llama : minor * llama : add LLAMA_OFFLOAD_DEBUG + fix starcoder offloading * llama : fix input allocation logic * llama : update offload functions for KQ tensors * llama : normalize tensor names ggml-ci * llama : enable warning about not offloaded tensors * llama : remove extra ; + deduplicate gate_b logic * llama : add llm_build_inp_embd helper --- llama.cpp | 2255 ++++++++++++++++++++--------------------------------- 1 file changed, 845 insertions(+), 1410 deletions(-) diff --git a/llama.cpp b/llama.cpp index bad25de4b..f3db4dc21 100644 --- a/llama.cpp +++ b/llama.cpp @@ -972,7 +972,7 @@ struct llama_mlock { typedef void (*offload_func_t)(struct ggml_tensor * tensor); -static void ggml_offload_nop(struct ggml_tensor * tensor) { // don't offload by default +static void ggml_offload_nop(struct ggml_tensor * tensor) { (void) tensor; } @@ -1116,13 +1116,13 @@ struct llama_layer { struct ggml_tensor * ffn_norm_b; // ff - struct ggml_tensor * w1; // ffn_gate - struct ggml_tensor * w2; // ffn_down - struct ggml_tensor * w3; // ffn_up + struct ggml_tensor * ffn_gate; // w1 + struct ggml_tensor * ffn_down; // w2 + struct ggml_tensor * ffn_up; // w3 // ff bias - struct ggml_tensor * b2; // ffn_down - struct ggml_tensor * b3; // ffn_up + struct ggml_tensor * ffn_down_b; // b2 + struct ggml_tensor * ffn_up_b; // b3 }; struct llama_kv_cell { @@ -1228,8 +1228,8 @@ struct llama_model { llama_hparams hparams = {}; llama_vocab vocab; - struct ggml_tensor * tok_embeddings; - struct ggml_tensor * pos_embeddings; + struct ggml_tensor * tok_embd; + struct ggml_tensor * pos_embd; struct ggml_tensor * tok_norm; struct ggml_tensor * tok_norm_b; @@ -2484,7 +2484,7 @@ static void llm_load_tensors( case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2538,21 +2538,21 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) 
+ + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; case LLM_ARCH_BAICHUAN: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { ggml_backend_type backend_norm; ggml_backend_type backend_output; @@ -2604,15 +2604,15 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += - ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + - ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) + + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; @@ -2620,7 +2620,7 @@ static void llm_load_tensors( { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2683,21 +2683,21 @@ static void llm_load_tensors( layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3); + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up); } } } break; case LLM_ARCH_STARCODER: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.pos_embd = ml.create_tensor(ctx, 
tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); // output { @@ -2756,11 +2756,11 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2768,14 +2768,14 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2) + - ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3); + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b) + + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b); } } } break; case LLM_ARCH_PERSIMMON: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { ggml_backend_type backend_norm; @@ -2816,31 +2816,31 @@ static void llm_load_tensors( const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; const ggml_backend_type backend_split = int(i) < i_gpu_start ? 
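    // layers with index below i_gpu_start stay on the CPU; the remaining layers are offloaded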
GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); + layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); } } break; case LLM_ARCH_BLOOM: { // TODO: CPU-only for now - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, 
"bias"), {n_embd}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); + model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); // output { @@ -2899,11 +2899,11 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2911,14 +2911,14 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + - ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) + - ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2); + ggml_nbytes(layer.ffn_up) + ggml_nbytes(layer.ffn_up_b) + + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_down_b); } } } break; case LLM_ARCH_MPT: { - model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); // output { @@ -2969,8 +2969,8 @@ static void llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -2978,8 +2978,8 @@ static void llm_load_tensors( ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + - ggml_nbytes(layer.w2) + - ggml_nbytes(layer.w3); + ggml_nbytes(layer.ffn_down) + + ggml_nbytes(layer.ffn_up); } } } break; @@ -3093,6 +3093,352 @@ static bool llama_model_load( using llm_build_cb = std::function; +enum llm_rope_type { + LLM_ROPE, + LLM_ROPE_NEOX, + LLM_ROPE_GLM, +}; + +static struct ggml_tensor * llm_build_inp_embd( + struct ggml_context * ctx, + const llama_batch & batch, + struct ggml_tensor * tok_embd, + int64_t n_embd, + int32_t n_tokens, + const llm_build_cb & cb) { + struct ggml_tensor * inpL; + + if (batch.token) { + struct 
ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens); + cb(inp_tokens, "inp_tokens", -1); + + inpL = ggml_get_rows(ctx, tok_embd, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); + } + + return inpL; +} + +// Persimmon: n_rot = n_embd_head/2 +// Other: n_rot = n_embd_head +static void llm_build_k_shift( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + int64_t n_rot, + llm_rope_type type, + const llm_build_cb & cb) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_layer = hparams.n_layer; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t n_embd_head = hparams.n_embd_head(); + + const int64_t n_ctx = lctx.cparams.n_ctx; + + const float freq_base = cparams.rope_freq_base; + const float freq_scale = cparams.rope_freq_scale; + + GGML_ASSERT(n_embd_head % n_rot == 0); + + struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); + cb(K_shift, "K_shift", -1); + + int rope_type = 0; + + switch (type) { + case LLM_ROPE: rope_type = 0; break; + case LLM_ROPE_NEOX: rope_type = 2; break; + case LLM_ROPE_GLM: rope_type = 4; break; + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * tmp = + // we rotate only the first n_rot dimensions + ggml_rope_custom_inplace(ctx, + ggml_view_3d(ctx, kv_self.k, + n_rot, n_head_kv, n_ctx, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), + K_shift, n_rot, rope_type, 0, freq_base, freq_scale); + cb(tmp, "K_shifted", il); + ggml_build_forward_expand(graph, tmp); + } +} + +static void llm_build_kv_store( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_cgraph * graph, + struct ggml_tensor * k_cur, + struct ggml_tensor * v_cur, + int32_t n_tokens, + int32_t kv_head, + const llm_build_cb & cb, + int64_t il) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_gqa, n_tokens)); + //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed + cb(v_cur_t, "v_cur_t", il); + + struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv_self.k, n_tokens*n_embd_gqa, + (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + cb(k_cache_view, "k_cache_view", il); + + struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + cb(v_cache_view, "v_cache_view", il); + + // important: storing RoPE-ed version of K in the KV cache! 
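    // the two copies below append the current K (already RoPE-ed by the caller) and the transposed V
    // into layer il's slice of the cache, starting at cell kv_head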
+ ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); +} + +enum llm_norm_type { + LLM_NORM, + LLM_NORM_RMS, +}; + +static struct ggml_tensor * llm_build_norm( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * mw, + struct ggml_tensor * mb, + llm_norm_type type, + float eps, + const llm_build_cb & cb, + int il) { + switch (type) { + case LLM_NORM: cur = ggml_norm (ctx, cur, eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, eps); break; + } + + if (mw || mb) { + cb(cur, "norm", il); + } + + if (mw) { + cur = ggml_mul(ctx, cur, mw); + if (mb) { + cb(cur, "norm_w", il); + } + } + + if (mb) { + cur = ggml_add(ctx, cur, mb); + } + + return cur; +} + +enum llm_ffn_op_type { + LLM_FFN_SILU, + LLM_FFN_GELU, + LLM_FFN_RELU, + LLM_FFN_RELU_SQR, +}; + +enum llm_ffn_gate_type { + LLM_FFN_SEQ, + LLM_FFN_PAR, // ffn_gate is parallel to ffn_up +}; + +static struct ggml_tensor * llm_build_ffn( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * up, + struct ggml_tensor * up_b, + struct ggml_tensor * gate, + struct ggml_tensor * gate_b, + struct ggml_tensor * down, + struct ggml_tensor * down_b, + llm_ffn_op_type type_op, + llm_ffn_gate_type type_gate, + const llm_build_cb & cb, + int il) { + struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur); + cb(tmp, "ffn_up", il); + + if (up_b) { + tmp = ggml_add(ctx, tmp, up_b); + cb(tmp, "ffn_up_b", il); + } + + if (gate) { + switch (type_gate) { + case LLM_FFN_SEQ: + { + cur = ggml_mul_mat(ctx, gate, tmp); + cb(cur, "ffn_gate", il); + } break; + case LLM_FFN_PAR: + { + cur = ggml_mul_mat(ctx, gate, cur); + cb(cur, "ffn_gate", il); + } break; + } + + if (gate_b) { + cur = ggml_add(ctx, cur, gate_b); + cb(cur, "ffn_gate_b", il); + } + } else { + cur = tmp; + } + + switch (type_op) { + case LLM_FFN_SILU: + { + cur = ggml_silu(ctx, cur); + cb(cur, "ffn_silu", il); + } break; + case LLM_FFN_GELU: + { + cur = ggml_gelu(ctx, cur); + cb(cur, "ffn_gelu", il); + } break; + case LLM_FFN_RELU: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + } break; + case LLM_FFN_RELU_SQR: + { + cur = ggml_relu(ctx, cur); + cb(cur, "ffn_relu", il); + + cur = ggml_sqr(ctx, cur); + cb(cur, "ffn_sqr(relu)", il); + } break; + } + + if (type_gate == LLM_FFN_PAR) { + cur = ggml_mul(ctx, cur, tmp); + cb(cur, "ffn_gate_par", il); + } + + cur = ggml_mul_mat(ctx, down, cur); + if (down_b) { + cb(cur, "ffn_down", il); + } + + if (down_b) { + cur = ggml_add(ctx, cur, down_b); + } + + return cur; +} + +// if max_alibi_bias > 0 then apply ALiBi +static struct ggml_tensor * llm_build_kqv( + const llama_context & lctx, + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * wo, + struct ggml_tensor * wo_b, + struct ggml_tensor * q_cur, + struct ggml_tensor * kq_scale, + struct ggml_tensor * kq_mask, + int32_t n_tokens, + int32_t n_kv, + float alibi_bias_max, + const llm_build_cb & cb, + int il) { + const auto & model = lctx.model; + const auto & kv_self = lctx.kv_self; + const auto & cparams = lctx.cparams; + + const auto & hparams = model.hparams; + + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_embd = hparams.n_embd; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3); + cb(q, "q", il); + + 
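    // remaining steps: view the cached K for layer il, compute KQ = K*Q, scale by kq_scale, optionally add
    // ALiBi, add the KV mask, soft-max, multiply by the cached V view, merge the heads and project with wo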
struct ggml_tensor * k = + ggml_view_3d(ctx, kv_self.k, + n_embd_head, n_kv, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + cb(k, "k", il); + + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); + + kq = ggml_scale(ctx, kq, kq_scale); + cb(kq, "kq_scaled", il); + + if (alibi_bias_max > 0.0f) { + // TODO: n_head or n_head_kv + // TODO: K-shift is likely not working + // TODO: change to ggml_add + kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, alibi_bias_max); + cb(kq, "kq_scaled_alibi", il); + } + + kq = ggml_add(ctx, kq, kq_mask); + cb(kq, "kq_masked", il); + + kq = ggml_soft_max(ctx, kq); + cb(kq, "kq_soft_max", il); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx, kv_self.v, + n_kv, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx, kqv_merged, n_embd, n_tokens); + cb(cur, "kqv_merged_cont", il); + + cur = ggml_mul_mat(ctx, wo, cur); + if (wo_b) { + cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx, cur, wo_b); + } + + return cur; +} + static struct ggml_cgraph * llm_build_llama( llama_context & lctx, const llama_batch & batch, @@ -3112,7 +3458,6 @@ static struct ggml_cgraph * llm_build_llama( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3143,18 +3488,7 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions @@ -3171,177 +3505,64 @@ static struct ggml_cgraph * llm_build_llama( // shift the entire K-cache if needed if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb); } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0", il); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0", il); - } + cur = 
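    // attention norm, via the shared RMS-norm helper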
llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); // self-attention { // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk", il); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq", il); - - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv", il); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Qcur, "Qcur", il); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Kcur, "Kcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - // important: storing RoPE-ed version of K in the KV cache! 
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - -#if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1", il); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm", il); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - cb(tmp, "result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - cb(cur, "result_w1", il); - - // SILU activation - cur = ggml_silu(ctx0, cur); - cb(cur, "silu", il); - - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + 
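    // LLaMA-style FFN: SILU(gate(x)) * up(x) followed by the down projection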
LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -3349,15 +3570,10 @@ static struct ggml_cgraph * llm_build_llama( cur = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2", -1); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); @@ -3389,7 +3605,6 @@ static struct ggml_cgraph * llm_build_baichaun( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -3418,18 +3633,7 @@ static struct ggml_cgraph * llm_build_baichaun( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions @@ -3446,195 +3650,74 @@ static struct ggml_cgraph * llm_build_baichaun( // shift the entire K-cache if needed if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE, cb); } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0", il); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); // self-attention { - // compute Q and K and RoPE them - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk", il); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq", il); - - struct ggml_tensor * Kcur; - struct ggml_tensor * Qcur; - switch (model.type) { - case MODEL_7B: - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); - break; - case MODEL_13B: - Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, 
n_tokens); - Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, n_tokens); - break; - default: - GGML_ASSERT(false); - } - - cb(Kcur, "Kcur", il); - + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv", il); - - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_masked; - struct ggml_tensor * KQ_scaled_alibi; + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); switch (model.type) { case MODEL_7B: - KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); break; case MODEL_13B: - // TODO: replace with ggml_add() - KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd/n_head, n_head, n_tokens); break; default: GGML_ASSERT(false); } + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); + // apply ALiBi for 13B model + const float alibi_bias_max = model.type == MODEL_13B ? 
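    // the 13B variant uses ALiBi with a bias max of 8; a negative value disables ALiBi in llm_build_kqv
    // (the 7B variant relies on RoPE instead)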
8.0f : -1.0f; - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, alibi_bias_max, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1", il); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm", il); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - cb(tmp, "result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - cb(cur, "result_w1", il); - - // SILU activation - cur = ggml_silu(ctx0, cur); - cb(cur, "silu", il); - - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -3642,15 +3725,10 @@ static struct ggml_cgraph * llm_build_baichaun( cur = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2", -1); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); @@ -3714,18 +3792,7 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // inp_pos - contains the positions @@ -3742,181 +3809,77 @@ static struct ggml_cgraph * llm_build_falcon( // shift the entire K-cache if needed if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, 
n_head_kv, n_ctx, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), - K_shift, n_embd_head, 2, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_embd_head, LLM_ROPE_NEOX, cb); } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; + attn_norm = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(attn_norm, "attn_norm", il); + // self-attention - // TODO: refactor into common function (shared with LLaMA) { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - cb(attn_norm, "attn_norm_0", il); - - attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - cb(attn_norm, "attn_norm_0_w", il); - - attn_norm = ggml_add(ctx0, attn_norm, model.layers[il].attn_norm_b); - cb(attn_norm, "attn_norm_0_wb", il); - - if (model.layers[il].attn_norm_2) { // Falcon-40B - cur = ggml_norm(ctx0, inpL, norm_eps); + if (model.layers[il].attn_norm_2) { + // Falcon-40B + cur = llm_build_norm(ctx0, attn_norm, + model.layers[il].attn_norm_2, + model.layers[il].attn_norm_2_b, + LLM_NORM, norm_eps, cb, il); cb(cur, "attn_norm_2", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm_2); - cb(cur, "attn_norm_2_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_2_b); - cb(cur, "attn_norm_2_wb", il); - } else { // Falcon 7B + } else { cur = attn_norm; } - // compute QKV - cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); - // Note that the strides for Kcur, Vcur are set up so that the - // resulting views are misaligned with the tensor's storage - // (by applying the K/V offset we shift the tensor's original - // view to stick out behind the viewed QKV tensor's allocated - // memory, so to say). This is ok because no actual accesses - // happen to that out-of-range memory, but it can require some - // trickery when trying to accurately dump these views for - // debugging. 
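The hand-rolled K-cache shift loops removed in the hunks above are consolidated into the new llm_build_k_shift() helper (called here with LLM_ROPE_NEOX, and with LLM_ROPE in the LLaMA/Baichuan builders). Its definition is added earlier in the patch and is not visible in these hunks; the following is only a rough reconstruction from the removed loops and the call sites. The signature, the rope-type enum name, and the parameter order are inferred from usage, not copied from the actual definition:

    // sketch: shared K-cache RoPE shift, reconstructed from the removed per-model loops
    static void llm_build_k_shift(
             llama_context & lctx,
        struct ggml_context * ctx0,
         struct ggml_cgraph * gf,
                    int64_t   n_rot,
              llm_rope_type   type,   // LLM_ROPE or LLM_ROPE_NEOX (enum name inferred from the call sites)
         const llm_build_cb & cb) {
        const auto & model   = lctx.model;
        const auto & kv_self = lctx.kv_self;
        const auto & cparams = lctx.cparams;
        const auto & hparams = model.hparams;

        const int64_t n_layer     = hparams.n_layer;
        const int64_t n_head_kv   = hparams.n_head_kv;
        const int64_t n_embd_head = hparams.n_embd_head();
        const int64_t n_embd_gqa  = hparams.n_embd_gqa();
        const int64_t n_ctx       = cparams.n_ctx;

        const float freq_base  = cparams.rope_freq_base;
        const float freq_scale = cparams.rope_freq_scale;

        const int rope_mode = type == LLM_ROPE_NEOX ? 2 : 0; // mode 2 == neox, as in the removed Falcon loop

        struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
        cb(K_shift, "K_shift", -1);

        for (int il = 0; il < n_layer; ++il) {
            // rotate the first n_rot dims of every cached K row in place by the per-cell delta
            struct ggml_tensor * tmp =
                ggml_rope_custom_inplace(ctx0,
                    ggml_view_3d(ctx0, kv_self.k,
                        n_embd_head, n_head_kv, n_ctx,
                        ggml_element_size(kv_self.k)*n_embd_head,
                        ggml_element_size(kv_self.k)*n_embd_gqa,
                        ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
                    K_shift, n_rot, rope_mode, 0, freq_base, freq_scale);
            cb(tmp, "K_shifted", il);
            ggml_build_forward_expand(gf, tmp);
        }
    }

Note that the K_shift data itself (kv_self.cells[i].delta) is no longer filled here; that now happens in the per-tensor callback of llama_build_graph shown at the end of this patch.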
+ struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - const size_t wsize = ggml_type_size(cur->type); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for - // non-contiguous views is added for the rope operator - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0)); - cb(tmpq, "tmpq", il); - - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head)); - cb(tmpk, "tmpk", il); - - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - cb(tmpv, "tmpv", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // using mode = 2 for neox mode - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); cb(Qcur, "Qcur", il); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, Kcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); cb(Kcur, "Kcur", il); - { - struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); - cb(Vcur, "Vcur_0", il); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur_1", il); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, 
kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, attn_norm, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * attn_out = cur; + struct ggml_tensor * ffn_inp = cur; // feed forward { - struct ggml_tensor * inpFF = attn_norm; - - cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - cb(cur, "result_w3", il); - - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, attn_norm, // !! use the attn norm, not the result + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpFF_+_result_w2", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_inpFF_+_result_w2", il); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -3925,16 +3888,11 @@ static struct ggml_cgraph * llm_build_falcon( cur = inpL; // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -3963,7 +3921,6 @@ static struct ggml_cgraph * llm_build_starcoder( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -3988,23 +3945,11 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * embd; struct ggml_tensor * pos; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } - cb(embd, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); // inp_pos - contains the positions struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -4018,160 +3963,74 @@ static struct ggml_cgraph * llm_build_starcoder( struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); - pos = ggml_get_rows(ctx0, 
model.pos_embeddings, inp_pos); + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); - inpL = ggml_add(ctx0, embd, pos); + inpL = ggml_add(ctx0, inpL, pos); cb(inpL, "inpL", -1); for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); + + // self-attention { - // Norm - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb", il); - } - - { - // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - cb(tmpq, "tmpq", il); - cb(tmpk, "tmpk", il); - cb(tmpv, "tmpv", il); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - struct ggml_tensor * Kcur = tmpk; + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv); - cb(Vcur, "Vcur", il); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = 
ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); + cur = llm_build_kqv(lctx, ctx0, cur, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - cb(cur, "result_wo", il); - - // Add the input - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_result_wo", il); - - struct ggml_tensor * inpFF = cur; + // add the input + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // FF { - // Norm - { - cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0", il); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb", il); - } - - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); - cb(cur, "result_w3", il); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - // Projection - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - inpL = ggml_add(ctx0, cur, inpFF); - + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } - // Output Norm - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4195,18 +4054,18 @@ static struct ggml_cgraph * llm_build_persimmon( GGML_ASSERT(!!kv_self.ctx); const auto & cparams = lctx.cparams; + const int64_t n_embd = hparams.n_embd; const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_head = hparams.n_head; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - const size_t n_rot = n_embd_head / 2; + const int64_t n_rot = n_embd_head / 2; const float freq_base = cparams.rope_freq_base; const float freq_scale = cparams.rope_freq_scale; - const float norm_eps = hparams.f_norm_eps; + const float norm_eps = hparams.f_norm_eps; const int32_t 
n_tokens = batch.n_tokens; const int32_t n_kv = worst_case ? n_ctx : kv_self.n; @@ -4215,6 +4074,7 @@ static struct ggml_cgraph * llm_build_persimmon( const bool do_rope_shift = worst_case || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; + struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.data, @@ -4228,14 +4088,7 @@ static struct ggml_cgraph * llm_build_persimmon( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "imp_embd", -1); struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -4249,38 +4102,17 @@ static struct ggml_cgraph * llm_build_persimmon( cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - cb(K_shift, "K_shift", -1); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * tmp = - // we rotate only the first n_rot dimensions. - ggml_rope_custom_inplace(ctx0, - ggml_view_3d(ctx0, kv_self.k, - n_rot, n_head, n_ctx, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il) - ), - K_shift, n_rot, 2, 0, freq_base, freq_scale); - cb(tmp, "K_shifted", il); - ggml_build_forward_expand(gf, tmp); - } + llm_build_k_shift(lctx, ctx0, gf, n_rot, LLM_ROPE_NEOX, cb); } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * residual = inpL; - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); // self attention { @@ -4316,22 +4148,16 @@ static struct ggml_cgraph * llm_build_persimmon( cb(tmpk, "tmpk", il); // Q/K Layernorm - tmpq = ggml_norm(ctx0, tmpq, norm_eps); + tmpq = llm_build_norm(ctx0, tmpq, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, norm_eps, cb, il); cb(tmpq, "tmpq", il); - tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); - cb(tmpq, "tmpq", il); - - tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - cb(tmpq, "tmpq", il); - - tmpk = ggml_norm(ctx0, tmpk, norm_eps); - cb(tmpk, "tmpk", il); - - tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - cb(tmpk, "tmpk", il); - - tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); + tmpk = llm_build_norm(ctx0, tmpk, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, norm_eps, cb, il); cb(tmpk, "tmpk", il); // RoPE the first n_rot of q/k, pass the other half, and concat. 
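Throughout these hunks the open-coded normalizations (ggml_norm or ggml_rms_norm, followed by a broadcasted weight multiply and an optional bias add) are replaced by calls to llm_build_norm(). The helper is defined earlier in the patch and does not appear in these hunks; the sketch below is reconstructed from the removed norm blocks and the call sites. The exact signature, the norm-type enum name, and the intermediate callback names are inferred (the names match the new entries "norm", "norm_w", "norm_wb" in the offload map further down):

    // sketch: shared normalization helper, reconstructed from the removed norm blocks
    static struct ggml_tensor * llm_build_norm(
        struct ggml_context * ctx0,
         struct ggml_tensor * cur,
         struct ggml_tensor * mw,     // norm weight
         struct ggml_tensor * mb,     // norm bias, may be NULL (e.g. for RMS norm models)
              llm_norm_type   type,   // LLM_NORM (layer norm) or LLM_NORM_RMS
                      float   eps,
         const llm_build_cb & cb,
                        int   il) {
        switch (type) {
            case LLM_NORM:     cur = ggml_norm    (ctx0, cur, eps); break;
            case LLM_NORM_RMS: cur = ggml_rms_norm(ctx0, cur, eps); break;
        }

        if (mw || mb) {
            cb(cur, "norm", il);
        }

        if (mw) {
            // cur = cur*norm_w (broadcasted)
            cur = ggml_mul(ctx0, cur, mw);
            if (mb) {
                cb(cur, "norm_w", il);
            }
        }

        if (mb) {
            // add the norm bias, used by Falcon/StarCoder/Bloom/Persimmon/MPT
            cur = ggml_add(ctx0, cur, mb);
        }

        // the caller names the result, e.g. cb(cur, "attn_norm", il)
        return cur;
    }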
@@ -4404,131 +4230,55 @@ static struct ggml_cgraph * llm_build_persimmon( Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); cb(Kcur, "Kcur", il); - { - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, - ggml_element_size(tmpqkv_perm) * n_embd_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, - ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 + struct ggml_tensor * Vcur = ggml_view_3d( + ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, + ggml_element_size(tmpqkv_perm) * n_embd_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, + ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 ); - cb(tmpv, "tmpv", il); + cb(Vcur, "Vcur", il); - // store K, V in cache - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * k = ggml_view_1d( - ctx0, kv_self.k, n_tokens*n_embd_gqa, - (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) - ); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bo); - cb(cur, "result_wo_b", il); + // TODO: not tested, could be broken + cur = llm_build_kqv(lctx, ctx0, Q, + model.layers[il].wo, model.layers[il].bo, + Q, KQ_scale, KQ_mask, n_tokens, n_kv, -1.0f, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); + cb(ffn_inp, "ffn_inp", il); + // feed-forward network { - // MLP - { - // Norm - cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0", il); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + 
LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb", il); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b3); - cb(cur, "result_w3_b", il); - - cur = ggml_relu(ctx0, cur); - cb(cur, "relu", il); - - cur = ggml_sqr(ctx0, cur); - cb(cur, "sqr(relu)", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); inpL = cur; } cur = inpL; - { - cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -4559,7 +4309,6 @@ static struct ggml_cgraph * llm_build_refact( const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); const float norm_rms_eps = hparams.f_norm_rms_eps; @@ -4567,8 +4316,6 @@ static struct ggml_cgraph * llm_build_refact( const int32_t n_kv = worst_case ? n_ctx : kv_self.n; const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - // printf("n_kv = %d\n", n_kv); - auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -4584,18 +4331,7 @@ static struct ggml_cgraph * llm_build_refact( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // KQ_scale @@ -4609,151 +4345,56 @@ static struct ggml_cgraph * llm_build_refact( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * inpSA = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - cb(cur, "rms_norm_0", il); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0", il); - } + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "attn_norm", il); // self-attention { - // compute Q and K - struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(tmpk, "tmpk", il); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(tmpq, "tmpq", il); - - struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); - cb(Kcur, "Kcur", il); - - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); cb(Qcur, "Qcur", il); - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(tmpv, "tmpv", il); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - // KQ_scaled = KQ 
/ sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - cb(inpFF, "inpFF", il); + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); // feed-forward network { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - cb(cur, "rms_norm_1", il); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, il); + cb(cur, "ffn_norm", il); - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm", il); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - cb(tmp, "result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - cb(cur, "result_w1", il); - - // SILU activation - cur = ggml_silu(ctx0, cur); - cb(cur, "silu", il); - - cur = ggml_mul(ctx0, cur, tmp); - cb(cur, "silu_x_result_w3", il); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, inpFF); - cb(cur, "inpFF_+_result_w2", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -4761,15 +4402,10 @@ static struct ggml_cgraph * llm_build_refact( cur = inpL; - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - cb(cur, "rms_norm_2", -1); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, NULL, + LLM_NORM_RMS, norm_rms_eps, cb, -1); + cb(cur, "result_norm", -1); // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); @@ -4799,7 +4435,6 @@ static struct ggml_cgraph * llm_build_bloom( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const 
int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -4826,22 +4461,10 @@ static struct ggml_cgraph * llm_build_bloom( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * embd; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } - cb(embd, "inp_embd", -1); + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); + cb(inpL, "inp_embd", -1); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); @@ -4851,182 +4474,74 @@ static struct ggml_cgraph * llm_build_bloom( struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); cb(KQ_mask, "KQ_mask", -1); - // norm - { - inpL = ggml_norm(ctx0, embd, norm_eps); - cb(inpL, "inp_norm", -1); - - inpL = ggml_mul(ctx0, inpL, model.tok_norm); - cb(inpL, "inp_norm_w", -1); - - inpL = ggml_add (ctx0, inpL, model.tok_norm_b); - cb(inpL, "inp_norm_wb", -1); - } + inpL = llm_build_norm(ctx0, inpL, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(inpL, "inp_norm", -1); for (int il = 0; il < n_layer; ++il) { + cur = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + model.layers[il].attn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "attn_norm", il); + + // self-attention { - // Norm - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "attn_norm_0", il); - - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - cb(cur, "attn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - cb(cur, "attn_norm_0_wb", il); - } - - { - // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); cb(cur, "bqkv", il); - struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); - struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - cb(tmpq, "tmpq", il); - cb(tmpk, "tmpk", il); - cb(tmpv, "tmpv", il); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - struct ggml_tensor * Kcur = tmpk; + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - // store key and value to memory - { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); + llm_build_kv_store(lctx, 
ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), - 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, model.layers[il].bo, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, 8.0f, cb, il); + cb(cur, "kqv_out", il); } - // Projection - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", il); - - cur = ggml_add(ctx0, cur, model.layers[il].bo); - cb(cur, "result_wo_b", il); - // Add the input - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_result_wo", il); - - struct ggml_tensor * inpFF = cur; + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // FF { - // Norm - { - cur = ggml_norm(ctx0, inpFF, norm_eps); - cb(cur, "ffn_norm_0", il); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - - cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); - cb(cur, "ffn_norm_0_wb", il); - } - - cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3", il); - - cur = ggml_add(ctx0, cur, 
model.layers[il].b3); - cb(cur, "result_w3_b", il); - - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); - - cur = ggml_add(ctx0, cur, model.layers[il].b2); - cb(cur, "result_w2_b", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - inpL = ggml_add(ctx0, cur, inpFF); - cb(inpL, "inpFF_+_result_w2", il); + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); } - // Output Norm - { - cur = ggml_norm(ctx0, inpL, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "out_norm_0_w", -1); - - cur = ggml_add(ctx0, cur, model.output_norm_b); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, inpL, + model.output_norm, + model.output_norm_b, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -5055,7 +4570,6 @@ static struct ggml_cgraph * llm_build_mpt( const int64_t n_layer = hparams.n_layer; const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head = hparams.n_embd_head(); const int64_t n_embd_gqa = hparams.n_embd_gqa(); @@ -5084,18 +4598,7 @@ static struct ggml_cgraph * llm_build_mpt( struct ggml_tensor * cur; struct ggml_tensor * inpL; - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - cb(inp_tokens, "inp_tokens", -1); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - } + inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb); cb(inpL, "inp_embd", -1); // KQ_scale @@ -5109,20 +4612,15 @@ static struct ggml_cgraph * llm_build_mpt( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; + attn_norm = llm_build_norm(ctx0, inpL, + model.layers[il].attn_norm, + NULL, + LLM_NORM, norm_eps, cb, il); + cb(attn_norm, "attn_norm", il); + // self-attention - // TODO: refactor into common function (shared with LLaMA) { - attn_norm = ggml_norm(ctx0, inpL, norm_eps); - cb(attn_norm, "attn_norm_0", il); - - attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - cb(attn_norm, "attn_norm_0_w", il); - - if (1) { - cur = attn_norm; - } - - // compute QKV + cur = attn_norm; cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); cb(cur, "wqkv", il); @@ -5132,125 +4630,46 @@ static struct ggml_cgraph * llm_build_mpt( cb(cur, "wqkv_clamped", il); } - const size_t wsize = ggml_type_size(cur->type); + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - struct ggml_tensor * Qcur = ggml_view_3d( - ctx0, cur, n_embd_head, n_head, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - 0); cb(Qcur, "Qcur", il); - - struct ggml_tensor * Kcur = ggml_view_3d( - ctx0, cur, 
n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * n_head); cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); - struct ggml_tensor * tmpv = ggml_view_3d( - ctx0, cur, n_embd_head, n_head_kv, n_tokens, - wsize * n_embd_head, - wsize * n_embd_head * (n_head + 2 * n_head_kv), - wsize * n_embd_head * (n_head + n_head_kv)); - cb(tmpv, "tmpv", il); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - { - struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); - cb(Vcur, "Vcur", il); + llm_build_kv_store(lctx, ctx0, gf, Kcur, Vcur, n_tokens, kv_head, cb, il); - Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); - cb(Vcur, "Vcur", il); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - cb(k, "k", il); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - cb(v, "v", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - cb(Q, "Q", il); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - cb(K, "K", il); - - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - cb(KQ, "KQ", il); - - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - cb(KQ_scaled, "KQ_scaled", il); - - // TODO: replace with ggml_add() - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); - cb(KQ_scaled_alibi, "KQ_scaled_alibi", il); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - cb(KQ_masked, "KQ_masked", il); - - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - cb(KQ_soft_max, "KQ_soft_max", il); - - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - cb(V, "V", il); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - cb(KQV, "KQV", il); - - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - cb(KQV_merged, "KQV_merged", il); - - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - cb(cur, "KQV_merged_contiguous", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - cb(cur, "result_wo", il); + cur = llm_build_kqv(lctx, ctx0, Qcur, + model.layers[il].wo, NULL, + Qcur, KQ_scale, KQ_mask, n_tokens, n_kv, max_alibi_bias, cb, il); + cb(cur, "kqv_out", il); } // Add the input - cur = ggml_add(ctx0, cur, inpL); - cb(cur, "inpL_+_result_wo", il); - - struct ggml_tensor * attn_out = cur; + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); + cb(ffn_inp, "ffn_inp", il); // feed forward { - // Norm - { - cur = ggml_norm(ctx0, attn_out, norm_eps); - cb(cur, "ffn_norm_0", il); + cur = llm_build_norm(ctx0, ffn_inp, + model.layers[il].ffn_norm, + NULL, + LLM_NORM, norm_eps, cb, il); + cb(cur, "ffn_norm", il); - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - cb(cur, "ffn_norm_0_w", il); - } - - cur = 
ggml_mul_mat(ctx0, model.layers[il].w3, cur); - cb(cur, "result_w3", il); - - cur = ggml_gelu(ctx0, cur); - cb(cur, "gelu", il); - - cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - cb(cur, "result_w2", il); + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + NULL, NULL, + model.layers[il].ffn_down, NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); } - cur = ggml_add(ctx0, cur, attn_out); - cb(cur, "inpL_+_inpFF_+_result_w2", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); // input for next layer inpL = cur; @@ -5258,14 +4677,11 @@ static struct ggml_cgraph * llm_build_mpt( cur = inpL; - // norm - { - cur = ggml_norm(ctx0, cur, norm_eps); - cb(cur, "out_norm_0", -1); - - cur = ggml_mul(ctx0, cur, model.output_norm); - cb(cur, "result_norm", -1); - } + cur = llm_build_norm(ctx0, cur, + model.output_norm, + NULL, + LLM_NORM, norm_eps, cb, -1); + cb(cur, "result_norm", -1); cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); @@ -5292,6 +4708,7 @@ enum llm_offload_func_e { OFFLOAD_FUNC_OUT, }; +// TODO: will be removed with backend v2 struct llm_offload_trie { struct node { ~node() { @@ -5365,28 +4782,28 @@ struct llm_offload_trie { node * root = nullptr; }; +// TODO: will be removed with backend v2 static const std::unordered_map k_offload_map = { //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - { "inp_pos", OFFLOAD_FUNC_NR }, + { "pos_embd", OFFLOAD_FUNC_NR }, - { "KQ_mask", OFFLOAD_FUNC_NR }, - { "K_shift", OFFLOAD_FUNC_NR }, - { "K_shifted", OFFLOAD_FUNC_NR }, + { "inp_pos", OFFLOAD_FUNC_KQ }, // this is often used for KQ ops (e.g. rope) + { "KQ_scale", OFFLOAD_FUNC_KQ }, + { "KQ_mask", OFFLOAD_FUNC_KQ }, + { "K_shift", OFFLOAD_FUNC_KQ }, + { "K_shifted", OFFLOAD_FUNC_KQ }, { "inp_norm", OFFLOAD_FUNC_NR }, { "inp_norm_w", OFFLOAD_FUNC_NR }, { "inp_norm_wb", OFFLOAD_FUNC_NR }, - { "rms_norm_0", OFFLOAD_FUNC }, - - { "attn_norm_0", OFFLOAD_FUNC }, - { "attn_norm_0_w", OFFLOAD_FUNC }, - { "attn_norm_0_wb", OFFLOAD_FUNC }, + { "norm", OFFLOAD_FUNC }, + { "norm_w", OFFLOAD_FUNC }, + { "norm_wb", OFFLOAD_FUNC }, + { "attn_norm", OFFLOAD_FUNC }, { "attn_norm_2", OFFLOAD_FUNC }, - { "attn_norm_2_w", OFFLOAD_FUNC }, - { "attn_norm_2_wb", OFFLOAD_FUNC }, { "wqkv", OFFLOAD_FUNC_KQ }, { "bqkv", OFFLOAD_FUNC_KQ }, @@ -5395,12 +4812,9 @@ static const std::unordered_map k_offload_map { "tmpk", OFFLOAD_FUNC_KQ }, { "tmpq", OFFLOAD_FUNC_KQ }, { "tmpv", OFFLOAD_FUNC_V }, - { "tmpkqv", OFFLOAD_FUNC_KQ }, // ?? 
{ "Kcur", OFFLOAD_FUNC_KQ }, { "Qcur", OFFLOAD_FUNC_KQ }, { "Vcur", OFFLOAD_FUNC_V }, - { "Vcur_0", OFFLOAD_FUNC_V }, - { "Vcur_1", OFFLOAD_FUNC_V }, { "krot", OFFLOAD_FUNC_KQ }, { "qrot", OFFLOAD_FUNC_KQ }, @@ -5409,51 +4823,38 @@ static const std::unordered_map k_offload_map { "krotated", OFFLOAD_FUNC_KQ }, { "qrotated", OFFLOAD_FUNC_KQ }, + { "q", OFFLOAD_FUNC_KQ }, { "k", OFFLOAD_FUNC_KQ }, + { "kq", OFFLOAD_FUNC_KQ }, + { "kq_scaled", OFFLOAD_FUNC_KQ }, + { "kq_scaled_alibi", OFFLOAD_FUNC_KQ }, + { "kq_masked", OFFLOAD_FUNC_KQ }, + { "kq_soft_max", OFFLOAD_FUNC_V }, { "v", OFFLOAD_FUNC_V }, + { "kqv", OFFLOAD_FUNC_V }, + { "kqv_merged", OFFLOAD_FUNC_V }, + { "kqv_merged_cont", OFFLOAD_FUNC_V }, + { "kqv_wo", OFFLOAD_FUNC_V }, + { "kqv_out", OFFLOAD_FUNC_V }, - { "Q", OFFLOAD_FUNC_KQ }, - { "K", OFFLOAD_FUNC_KQ }, - { "KQ", OFFLOAD_FUNC_KQ }, - { "KQ_scaled", OFFLOAD_FUNC_KQ }, - { "KQ_scaled_alibi", OFFLOAD_FUNC_KQ }, - { "KQ_masked", OFFLOAD_FUNC_KQ }, - { "KQ_soft_max", OFFLOAD_FUNC_V }, - { "V", OFFLOAD_FUNC_V }, - { "KQV", OFFLOAD_FUNC_V }, - { "KQV_merged", OFFLOAD_FUNC_V }, - { "KQV_merged_contiguous", OFFLOAD_FUNC_V }, - - { "result_wo", OFFLOAD_FUNC }, - { "result_wo_b", OFFLOAD_FUNC }, - { "inpL_+_result_wo", OFFLOAD_FUNC }, - - { "inpFF", OFFLOAD_FUNC }, - - { "rms_norm_1", OFFLOAD_FUNC }, + { "ffn_inp", OFFLOAD_FUNC }, { "ffn_norm", OFFLOAD_FUNC }, - { "ffn_norm_0", OFFLOAD_FUNC }, - { "ffn_norm_0_w", OFFLOAD_FUNC }, - { "ffn_norm_0_wb", OFFLOAD_FUNC }, - { "result_w3", OFFLOAD_FUNC }, - { "result_w3_b", OFFLOAD_FUNC }, - { "result_w2", OFFLOAD_FUNC }, - { "result_w2_b", OFFLOAD_FUNC }, - { "result_w1", OFFLOAD_FUNC }, + { "ffn_up", OFFLOAD_FUNC }, + { "ffn_up_b", OFFLOAD_FUNC }, + { "ffn_gate", OFFLOAD_FUNC }, + { "ffn_gate_b", OFFLOAD_FUNC }, + { "ffn_gate_par", OFFLOAD_FUNC }, + { "ffn_down", OFFLOAD_FUNC }, + { "ffn_down_b", OFFLOAD_FUNC }, + { "ffn_out", OFFLOAD_FUNC }, - { "silu", OFFLOAD_FUNC }, - { "gelu", OFFLOAD_FUNC }, - { "relu", OFFLOAD_FUNC }, - { "sqr(relu)", OFFLOAD_FUNC }, + { "ffn_silu", OFFLOAD_FUNC }, + { "ffn_gelu", OFFLOAD_FUNC }, + { "ffn_relu", OFFLOAD_FUNC }, + { "ffn_sqr(relu)", OFFLOAD_FUNC }, - { "silu_x_result_w3", OFFLOAD_FUNC }, - { "inpFF_+_result_w2", OFFLOAD_FUNC }, - { "inpL_+_inpFF_+_result_w2", OFFLOAD_FUNC }, - - { "rms_norm_2", OFFLOAD_FUNC_NR }, - { "out_norm_0", OFFLOAD_FUNC_NR }, - { "out_norm_0_w", OFFLOAD_FUNC_NR }, + { "l_out", OFFLOAD_FUNC }, { "result_norm", OFFLOAD_FUNC_EMB }, { "result_output", OFFLOAD_FUNC_OUT }, @@ -5477,7 +4878,16 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; +#ifdef GGML_USE_CUBLAS + const bool do_offload = true; +#else + const bool do_offload = true; // TODO: set to false after finishing refactoring +#endif + + int n_non_view = 0; // number of non-view tensors that have been processed by the callback + // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
+ // TODO: will be removed with backend v2 llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); @@ -5488,11 +4898,12 @@ static struct ggml_cgraph * llama_build_graph( // // allocate input tensors and set input data // + // TODO: will be removed with backend v2 - if (batch.token && !alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { + if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { ggml_allocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { const int64_t n_tokens = cur->ne[0]; memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); @@ -5501,10 +4912,10 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_tokens = true; } - if (batch.embd && !alloc_inp_embd && strcmp(name, "inp_embd") == 0) { + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { ggml_allocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; @@ -5514,10 +4925,10 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_embd = true; } - if (batch.pos && !alloc_inp_pos && strcmp(name, "inp_pos") == 0) { + if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { ggml_allocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { const int64_t n_tokens = cur->ne[0]; int32_t * data = (int32_t *) cur->data; @@ -5584,23 +4995,23 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_K_shift = true; } - // - // offload layers - // - // TODO: this code will be obsoleted with backend v2 - -#ifdef GGML_USE_CUBLAS - const bool do_offload = true; -#else - const bool do_offload = false; -#endif - - if (!do_offload) { + // view tensors are not processed further + if (cur->view_src != nullptr) { return; } - // view tensors are not offloaded - if (cur->view_src != nullptr) { + if (cur->op != GGML_OP_NONE) { + n_non_view++; + } + + // + // offload layers + // + // TODO: will be removed with backend v2 + +//#define LLAMA_OFFLOAD_DEBUG + + if (!do_offload) { return; } @@ -5614,20 +5025,19 @@ static struct ggml_cgraph * llama_build_graph( static const std::unordered_map> k_offload_func_name = { { OFFLOAD_FUNC_NOP, "CPU" }, + { OFFLOAD_FUNC_OUT, "CPU" }, #ifdef GGML_USE_CUBLAS { OFFLOAD_FUNC, "GPU (CUDA)" }, { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, - { OFFLOAD_FUNC_OUT, "GPU (CUDA) OUT" }, #else { OFFLOAD_FUNC, "CPU" }, { OFFLOAD_FUNC_KQ, "CPU" }, { OFFLOAD_FUNC_V, "CPU" }, { OFFLOAD_FUNC_NR, "CPU" }, { OFFLOAD_FUNC_EMB, "CPU" }, - { OFFLOAD_FUNC_OUT, "CPU" }, #endif // GGML_USE_CUBLAS }; @@ -5635,11 +5045,13 @@ static struct ggml_cgraph * llama_build_graph( llm_offload_func_e func_e = k_offload_func_trie.find(name); if (func_e == OFFLOAD_FUNC_NOP) { +#ifdef LLAMA_OFFLOAD_DEBUG // if a tensor hasn't been offloaded, we warn the user if (worst_case) { LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); } +#endif return; } @@ -5702,9 +5114,11 @@ static struct ggml_cgraph * llama_build_graph( // apply offload function to the tensor func(cur); +#ifdef LLAMA_OFFLOAD_DEBUG if (worst_case) { LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, 
k_offload_func_name.at(func_e).c_str()); } +#endif }; struct ggml_cgraph * result = NULL; @@ -5746,6 +5160,27 @@ static struct ggml_cgraph * llama_build_graph( GGML_ASSERT(false); } + if (worst_case) { + int n_non_view_total = 0; + + for (int i = 0; i < result->n_nodes; ++i) { + if (result->nodes[i]->view_src == nullptr) { + n_non_view_total++; + } + } + + LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); + + if (n_non_view != n_non_view_total) { + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); + LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__); + LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); + LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); + LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); + } + } + return result; }
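For reference when reading the builders above: every per-model MLP block is now routed through llm_build_ffn(). Like the other helpers, its definition is added earlier in the patch and is not part of these hunks; the following reconstruction from the removed FFN code and the call sites is a sketch only, with the signature, the enum names, and the intermediate tensor names inferred from usage (the names match the new "ffn_*" entries in the offload map):

    // sketch: shared feed-forward helper, reconstructed from the removed FFN blocks
    static struct ggml_tensor * llm_build_ffn(
        struct ggml_context * ctx0,
         struct ggml_tensor * cur,
         struct ggml_tensor * up,    struct ggml_tensor * up_b,
         struct ggml_tensor * gate,  struct ggml_tensor * gate_b,
         struct ggml_tensor * down,  struct ggml_tensor * down_b,
            llm_ffn_op_type   type_op,    // LLM_FFN_SILU, LLM_FFN_GELU or LLM_FFN_RELU_SQR
          llm_ffn_gate_type   type_gate,  // LLM_FFN_PAR (gated, LLaMA-style) or LLM_FFN_SEQ
         const llm_build_cb & cb,
                        int   il) {
        struct ggml_tensor * tmp = ggml_mul_mat(ctx0, up, cur);
        cb(tmp, "ffn_up", il);

        if (up_b) {
            tmp = ggml_add(ctx0, tmp, up_b);
            cb(tmp, "ffn_up_b", il);
        }

        if (gate) {
            switch (type_gate) {
                case LLM_FFN_SEQ: cur = ggml_mul_mat(ctx0, gate, tmp); break;
                case LLM_FFN_PAR: cur = ggml_mul_mat(ctx0, gate, cur); break; // gate branches off the input
            }
            cb(cur, "ffn_gate", il);
            if (gate_b) {
                cur = ggml_add(ctx0, cur, gate_b);
                cb(cur, "ffn_gate_b", il);
            }
        } else {
            cur = tmp;
        }

        switch (type_op) {
            case LLM_FFN_SILU:     cur = ggml_silu(ctx0, cur); cb(cur, "ffn_silu", il); break;
            case LLM_FFN_GELU:     cur = ggml_gelu(ctx0, cur); cb(cur, "ffn_gelu", il); break;
            case LLM_FFN_RELU_SQR: cur = ggml_relu(ctx0, cur); cb(cur, "ffn_relu", il);
                                   cur = ggml_sqr (ctx0, cur); cb(cur, "ffn_sqr(relu)", il); break;
        }

        if (type_gate == LLM_FFN_PAR) {
            // e.g. SiLU(gate(x)) * up(x), as in the removed LLaMA/Baichuan/Refact blocks
            cur = ggml_mul(ctx0, cur, tmp);
            cb(cur, "ffn_gate_par", il);
        }

        cur = ggml_mul_mat(ctx0, down, cur);
        cb(cur, "ffn_down", il);

        if (down_b) {
            cur = ggml_add(ctx0, cur, down_b);
        }

        return cur;
    }

Passing NULL for unused weights and biases lets the one helper cover the gated-SiLU MLP (LLaMA, Baichuan, Refact), the sequential GELU MLP (Falcon, StarCoder, Bloom, MPT) and the squared-ReLU MLP used by Persimmon, which is exactly how the call sites above use it.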