diff --git a/ggml-metal.m b/ggml-metal.m index 2380c4310..bc881395a 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -238,14 +238,17 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { // load kernels { NSError * error = nil; -#define GGML_METAL_ADD_KERNEL(name) \ - ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ - ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ + + /* GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \ (int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \ (int) ctx->pipeline_##name.threadExecutionWidth); \ + */ +#define GGML_METAL_ADD_KERNEL(name) \ + ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ + ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ if (error) { \ - GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ + GGML_METAL_LOG_ERROR("%s: error: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ return NULL; \ } diff --git a/ggml.h b/ggml.h index 08bff5511..aa4b23e70 100644 --- a/ggml.h +++ b/ggml.h @@ -709,7 +709,7 @@ extern "C" { // Context tensor enumeration and lookup GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx); GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor); - GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + GGML_API struct ggml_tensor * ggml_get_tensor (struct ggml_context * ctx, const char * name); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); diff --git a/llama.cpp b/llama.cpp index 3d431ee7b..fcb75716f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -971,7 +971,7 @@ struct llama_mlock { typedef void (*offload_func_t)(struct ggml_tensor * tensor); -static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default +static void ggml_offload_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } @@ -3008,10 +3008,10 @@ static void llm_load_tensors( #ifdef GGML_USE_CUBLAS const int max_backend_supported_layers = hparams.n_layer + 3; - const int max_offloadable_layers = hparams.n_layer + 3; -#elif defined(GGML_USE_CLBLAST) + const int max_offloadable_layers = hparams.n_layer + 3; +#elif GGML_USE_CLBLAST const int max_backend_supported_layers = hparams.n_layer + 1; - const int max_offloadable_layers = hparams.n_layer + 1; + const int max_offloadable_layers = hparams.n_layer + 1; #endif // GGML_USE_CUBLAS LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); @@ -3090,9 +3090,13 @@ static bool llama_model_load( return true; } +using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name)>; + static struct ggml_cgraph * llm_build_llama( - llama_context & lctx, - const llama_batch & batch) { + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -3115,13 +3119,11 @@ static struct ggml_cgraph * llm_build_llama( const float freq_scale = cparams.rope_freq_scale; const float norm_rms_eps = hparams.f_norm_rms_eps; -
const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; //printf("n_kv = %d\n", n_kv); @@ -3142,12 +3144,7 @@ static struct ggml_cgraph * llm_build_llama( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -3156,169 +3153,85 @@ static struct ggml_cgraph * llm_build_llama( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } } + cb(inpL, "inp_embd"); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if 
(!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } + cb(KQ_mask, "KQ_mask"); // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } + cb(K_shift, "K_shift"); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = - ggml_rope_custom_inplace(ctx0, + ggml_rope_custom_inplace(ctx0, ggml_view_3d(ctx0, kv_self.k, n_embd_head, n_head_kv, n_ctx, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(tmp); + cb(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; // norm { cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); + cb(cur, "rms_norm_0"); // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); + cb(cur, "attn_norm_0"); } // self-attention { // compute Q and K and RoPE them struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); + cb(tmpk, "tmpk"); struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); + cb(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); + struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); + struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + cb(Qcur, "Qcur"); // store key and value to memory { // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); + cb(tmpv, "tmpv"); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, 
n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); + cb(v, "v"); // important: storing RoPE-ed version of K in the KV cache! ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -3326,8 +3239,7 @@ static struct ggml_cgraph * llm_build_llama( } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -3335,29 +3247,24 @@ static struct ggml_cgraph * llm_build_llama( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); + cb(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_kv, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads struct ggml_tensor * V = @@ -3366,13 +3273,11 @@ static struct ggml_cgraph * llm_build_llama( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); + cb(V, "V"); #if 1 struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); #else // make V contiguous in memory to speed up the matmul, however we waste time on the copy // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation @@ -3383,71 +3288,59 @@ static struct ggml_cgraph * llm_build_llama( // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + cb(inpFF, "inpFF"); // feed-forward network { // norm { cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); + cb(cur, "rms_norm_1"); // cur = cur*ffn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); + cb(cur, "ffn_norm"); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); + cb(tmp, 
"result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); + cb(cur, "result_w1"); // SILU activation cur = ggml_silu(ctx0, cur); - offload_func(cur); - ggml_set_name(cur, "silu"); + cb(cur, "silu"); cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); + cb(cur, "silu_x_result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); + cb(cur, "result_w2"); } cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2"); // input for next layer inpL = cur; @@ -3458,18 +3351,16 @@ static struct ggml_cgraph * llm_build_llama( // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); + cb(cur, "rms_norm_2"); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); @@ -3480,7 +3371,9 @@ static struct ggml_cgraph * llm_build_llama( static struct ggml_cgraph * llm_build_baichaun( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -3503,13 +3396,11 @@ static struct ggml_cgraph * llm_build_baichaun( const float freq_scale = cparams.rope_freq_scale; const float norm_rms_eps = hparams.f_norm_rms_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; @@ -3528,12 +3419,7 @@ static struct ggml_cgraph * llm_build_baichaun( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -3542,89 +3428,25 @@ static struct ggml_cgraph * llm_build_baichaun( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } } + cb(inpL, "inp_embd"); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } + cb(KQ_mask, "KQ_mask"); // shift the entire K-cache if needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - 
ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } + cb(K_shift, "K_shift"); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -3635,53 +3457,39 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 0, 0, freq_base, freq_scale); - offload_func_kq(tmp); + cb(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - struct ggml_tensor * inpSA = inpL; // norm { cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); + cb(cur, "rms_norm_0"); // cur = cur*attn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); + cb(cur, "attn_norm_0"); } // self-attention { // compute Q and K and RoPE them struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); + cb(tmpk, "tmpk"); struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); + cb(tmpq, "tmpq"); struct ggml_tensor * Kcur; struct ggml_tensor * Qcur; switch (model.type) { case MODEL_7B: - Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); - Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); + Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, n_embd_head, 0, 0, freq_base, freq_scale); break; case MODEL_13B: Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, n_tokens); @@ -3691,33 +3499,27 @@ static struct ggml_cgraph * llm_build_baichaun( GGML_ASSERT(false); } - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); + cb(Kcur, "Kcur"); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); + cb(Qcur, "Qcur"); // store key and value to memory { // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); + cb(tmpv, "tmpv"); struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); + cb(v, "v"); // important: storing RoPE-ed version of K in 
the KV cache! ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -3725,8 +3527,7 @@ static struct ggml_cgraph * llm_build_baichaun( } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -3734,19 +3535,16 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); + cb(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked; struct ggml_tensor * KQ_scaled_alibi; @@ -3758,7 +3556,7 @@ static struct ggml_cgraph * llm_build_baichaun( case MODEL_13B: // TODO: replace with ggml_add() KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + cb(KQ_scaled_alibi, "KQ_scaled_alibi"); KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); break; default: @@ -3767,8 +3565,7 @@ static struct ggml_cgraph * llm_build_baichaun( // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads struct ggml_tensor * V = @@ -3777,80 +3574,66 @@ static struct ggml_cgraph * llm_build_baichaun( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); // projection (no bias) cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); } struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + cb(inpFF, "inpFF"); // feed-forward network { // norm { cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); + cb(cur, "rms_norm_1"); // cur = cur*ffn_norm(broadcasted) cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); + cb(cur, "ffn_norm"); } struct ggml_tensor * tmp = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); + cb(tmp, "result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); + cb(cur, "result_w1"); // SILU activation cur = ggml_silu(ctx0, cur); - 
offload_func(cur); - ggml_set_name(cur, "silu"); + cb(cur, "silu"); cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); + cb(cur, "silu_x_result_w3"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); + cb(cur, "result_w2"); } cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); + cb(cur, "inpFF_+_result_w2"); // input for next layer inpL = cur; @@ -3861,363 +3644,16 @@ static struct ggml_cgraph * llm_build_baichaun( // norm { cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); + cb(cur, "rms_norm_2"); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } // lm_head cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); - - ggml_build_forward_expand(gf, cur); - - ggml_free(ctx0); - - return gf; -} - -static struct ggml_cgraph * llm_build_refact( - llama_context & lctx, - const llama_batch & batch) { - const auto & model = lctx.model; - const auto & hparams = model.hparams; - const auto & cparams = lctx.cparams; - - const auto & kv_self = lctx.kv_self; - - GGML_ASSERT(!!kv_self.ctx); - - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = cparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); - - const float norm_rms_eps = hparams.f_norm_rms_eps; - - const int n_gpu_layers = model.n_gpu_layers; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? 
n_ctx - n_tokens : kv_self.head; - - // printf("n_kv = %d\n", n_kv); - - auto & buf_compute = lctx.buf_compute; - - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute.size, - /*.mem_buffer =*/ buf_compute.data, - /*.no_alloc =*/ true, - }; - - struct ggml_context * ctx0 = ggml_init(params); - - ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * cur; - struct ggml_tensor * inpL; - - if (batch.token) { - struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); - } else { -#ifdef GGML_USE_MPI - GGML_ASSERT(false && "not implemented"); -#endif - - inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } - } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - // KQ_scale - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } - - // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - for (int il = 0; il < n_layer; ++il) { - ggml_format_name(inpL, "layer_inp_%d", il); - - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - struct ggml_tensor * inpSA = inpL; - - // norm - { - cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_0"); - - // cur = cur*attn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); - ggml_set_name(cur, "attention_norm_0"); - } - - // self-attention - { - // compute Q and K - struct ggml_tensor * tmpk = 
ggml_mul_mat(ctx0, model.layers[il].wk, cur); - offload_func_kq(tmpk); - ggml_set_name(tmpk, "tmpk"); - - struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - offload_func_kq(tmpq); - ggml_set_name(tmpq, "tmpq"); - - struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); - offload_func_kq(Kcur); - ggml_set_name(Kcur, "Kcur"); - - struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); - offload_func_kq(Qcur); - ggml_set_name(Qcur, "Qcur"); - - // store key and value to memory - { - // compute the transposed [n_tokens, n_embd] V matrix - - struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - offload_func_v(tmpv); - ggml_set_name(tmpv, "tmpv"); - - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); - - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); - - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); - - struct ggml_tensor * K = - ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_kv, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); - - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); - - // KQ_scaled = KQ / sqrt(n_embd_head) - // KQ_scaled shape [n_kv, n_tokens, n_head, 1] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // KQ_masked = mask_past(KQ_scaled) - struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); - - struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); - - // KQ = soft_max(KQ_masked) - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // split cached V into n_head heads - struct ggml_tensor * V = - ggml_view_3d(ctx0, kv_self.v, - n_kv, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); - -#if 1 - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); -#else - // make V contiguous in memory to speed up the matmul, however we waste time on the copy - // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation - // is there a better way? 
- struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_ctx, n_embd_head, n_head)); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); -#endif - - // KQV_merged = KQV.permute(0, 2, 1, 3) - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); - - // cur = KQV_merged.contiguous().view(n_embd, n_tokens) - cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); - - // projection (no bias) - cur = ggml_mul_mat(ctx0, - model.layers[il].wo, - cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); - } - - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); - - // feed-forward network - { - // norm - { - cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); - offload_func(cur); - ggml_set_name(cur, "rms_norm_1"); - - // cur = cur*ffn_norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); - ggml_set_name(cur, "ffn_norm"); - } - - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model.layers[il].w3, - cur); - offload_func(tmp); - ggml_set_name(tmp, "result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w1, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w1"); - - // SILU activation - cur = ggml_silu(ctx0, cur); - offload_func(cur); - ggml_set_name(cur, "silu"); - - cur = ggml_mul(ctx0, cur, tmp); - offload_func(cur); - ggml_set_name(cur, "silu_x_result_w3"); - - cur = ggml_mul_mat(ctx0, - model.layers[il].w2, - cur); - offload_func(cur); - ggml_set_name(cur, "result_w2"); - } - - cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_result_w2"); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - { - cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_2"); - - // cur = cur*norm(broadcasted) - cur = ggml_mul(ctx0, cur, model.output_norm); - // offload_func_nr(cur); // TODO CPU + GPU mirrored backend - ggml_set_name(cur, "result_norm"); - } - - // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); @@ -4228,7 +3664,9 @@ static struct ggml_cgraph * llm_build_refact( static struct ggml_cgraph * llm_build_falcon( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -4251,13 +3689,11 @@ static struct ggml_cgraph * llm_build_falcon( const float freq_scale = cparams.rope_freq_scale; const float norm_eps = hparams.f_norm_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n", // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift); @@ -4279,12 +3715,7 @@ static struct ggml_cgraph * llm_build_falcon( if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -4293,89 +3724,25 @@ static struct ggml_cgraph * llm_build_falcon( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } } + cb(inpL, "inp_embd"); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - // KQ_pos - contains the positions - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } + cb(KQ_mask, "KQ_mask"); // shift the entire K-cache if 
needed if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } + cb(K_shift, "K_shift"); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -4386,7 +3753,7 @@ static struct ggml_cgraph * llm_build_falcon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il), K_shift, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(tmp); + cb(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } @@ -4394,35 +3761,27 @@ static struct ggml_cgraph * llm_build_falcon( for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - // self-attention // TODO: refactor into common function (shared with LLaMA) { attn_norm = ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); + cb(attn_norm, "attn_norm_0"); - attn_norm = ggml_add(ctx0, - ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm), - model.layers[il].attn_norm_b); - offload_func(attn_norm->src[0]); - offload_func(attn_norm); + attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); + cb(attn_norm, "attn_norm_0_w"); + + attn_norm = ggml_add(ctx0, attn_norm, model.layers[il].attn_norm_b); + cb(attn_norm, "attn_norm_0_wb"); if (model.layers[il].attn_norm_2) { // Falcon-40B cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); + cb(cur, "attn_norm_2"); - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].attn_norm_2), - model.layers[il].attn_norm_2_b); - offload_func(cur->src[0]); - offload_func(cur); + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm_2); + cb(cur, "attn_norm_2_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_2_b); + cb(cur, "attn_norm_2_wb"); } else { // Falcon 7B cur = attn_norm; } @@ -4430,7 +3789,7 @@ static struct ggml_cgraph * llm_build_falcon( // compute QKV cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + cb(cur, "wqkv"); // Note that the strides for Kcur, Vcur are set up so that the // resulting views are misaligned with the tensor's storage @@ -4450,50 +3809,50 @@ static struct ggml_cgraph * llm_build_falcon( wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0)); - offload_func_kq(tmpq); + cb(tmpq, "tmpq"); struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head)); - offload_func_kq(tmpk); + cb(tmpk, "tmpk"); struct ggml_tensor * tmpv = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_v(tmpv); + cb(tmpv, "tmpv"); // using mode = 2 for neox mode - struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Qcur); - struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, KQ_pos, n_embd_head, 2, 0, freq_base, freq_scale); - offload_func_kq(Kcur); + struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, tmpq, inp_pos, n_embd_head, 2, 
0, freq_base, freq_scale); + cb(Qcur, "Qcur"); + + struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, tmpk, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale); + cb(Kcur, "Kcur"); { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); - ggml_set_name(Vcur, "Vcur"); + struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); + cb(Vcur, "Vcur_0"); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); + cb(Vcur, "Vcur_1"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); + cb(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -4501,24 +3860,19 @@ static struct ggml_cgraph * llm_build_falcon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); + cb(K, "K"); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -4526,24 +3880,19 @@ static struct ggml_cgraph * llm_build_falcon( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); } struct ggml_tensor * attn_out = cur; @@ -4553,18 +3902,20 @@ static struct ggml_cgraph * llm_build_falcon( struct ggml_tensor * inpFF = attn_norm; cur = ggml_mul_mat(ctx0, model.layers[il].w3, inpFF); - offload_func(cur); + cb(cur, "result_w3"); cur = ggml_gelu(ctx0, cur); - offload_func(cur); + cb(cur, "gelu"); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); + cb(cur, "result_w2"); } 
cur = ggml_add(ctx0, cur, attn_out); - offload_func(cur); + cb(cur, "inpFF_+_result_w2"); + cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); + cb(cur, "inpL_+_inpFF_+_result_w2"); // input for next layer inpL = cur; @@ -4575,16 +3926,17 @@ static struct ggml_cgraph * llm_build_falcon( // norm { cur = ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); + cb(cur, "out_norm_0"); - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.output_norm), - model.output_norm_b); - ggml_set_name(cur, "result_norm"); + cur = ggml_mul(ctx0, cur, model.output_norm); + cb(cur, "out_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.output_norm_b); + cb(cur, "result_norm"); } cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); @@ -4595,7 +3947,9 @@ static struct ggml_cgraph * llm_build_falcon( static struct ggml_cgraph * llm_build_starcoder( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -4616,11 +3970,9 @@ static struct ggml_cgraph * llm_build_starcoder( const float norm_eps = hparams.f_norm_eps; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -4635,165 +3987,91 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * token; - struct ggml_tensor * position; + struct ggml_tensor * embd; + struct ggml_tensor * pos; struct ggml_tensor * inpL; if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_tokens, "inp_tokens"); - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { #ifdef GGML_USE_MPI GGML_ASSERT(false && "not implemented"); #endif - token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, token); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token)); - } + embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } + cb(embd, "inp_embd"); - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = 
ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - - { - // Compute position embeddings. - struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_allocr_alloc(lctx.alloc, inp_positions); - if (!ggml_allocr_is_measure(lctx.alloc)) { - for (int i = 0; i < n_tokens; ++i) { - ((int32_t *) inp_positions->data)[i] = batch.pos[i]; - } - } - ggml_set_name(inp_positions, "inp_positions"); - - position = ggml_get_rows(ctx0, model.pos_embeddings, inp_positions); - } + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); - offload_func_kq(KQ_mask); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); + cb(KQ_mask, "KQ_mask"); - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; + pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos); - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - inpL = ggml_add(ctx0, token, position); - ggml_set_name(inpL, "inpL"); + inpL = ggml_add(ctx0, embd, pos); + cb(inpL, "inpL"); for (int il = 0; il < n_layer; ++il) { - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - { // Norm cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); + cb(cur, "attn_norm_0"); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); - offload_func(cur); + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); + cb(cur, "attn_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); + cb(cur, "attn_norm_0_wb"); } { // Self Attention cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + cb(cur, "wqkv"); cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - offload_func_kq(cur); + cb(cur, "bqkv"); struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); - ggml_set_name(tmpq, "tmpq"); - ggml_set_name(tmpk, "tmpk"); - ggml_set_name(tmpv, "tmpv"); - - offload_func_kq(tmpq); - offload_func_kq(tmpk); - offload_func_v (tmpv); + cb(tmpq, "tmpq"); + cb(tmpk, "tmpk"); + cb(tmpv, "tmpv"); struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); struct ggml_tensor * Kcur = tmpk; { struct ggml_tensor * Vcur = 
ggml_transpose(ctx0, tmpv); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); + cb(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -4801,29 +4079,24 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); + cb(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head heads struct ggml_tensor * V = @@ -4832,28 +4105,25 @@ static struct ggml_cgraph * llm_build_starcoder( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); } // Projection cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); - offload_func(cur); + cb(cur, "result_wo"); // Add the input cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); + cb(cur, "inpL_+_result_wo"); struct ggml_tensor * inpFF = cur; @@ -4862,22 +4132,28 @@ static struct ggml_cgraph * llm_build_starcoder( // Norm { cur = ggml_norm(ctx0, inpFF, norm_eps); - offload_func_nr(cur); + cb(cur, "ffn_norm_0"); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); - offload_func_nr(cur); + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + cb(cur, "ffn_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); + cb(cur, "ffn_norm_0_wb"); } cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), 
model.layers[il].b3); - offload_func(cur); + cb(cur, "result_w3"); // GELU activation cur = ggml_gelu(ctx0, cur); - offload_func(cur); + cb(cur, "gelu"); // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); - offload_func(cur); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); + cb(cur, "result_w2"); + + cur = ggml_add(ctx0, cur, model.layers[il].b2); + cb(cur, "result_w2_b"); } inpL = ggml_add(ctx0, cur, inpFF); @@ -4887,14 +4163,17 @@ static struct ggml_cgraph * llm_build_starcoder( // Output Norm { cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func_nr(cur); + cb(cur, "out_norm_0"); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); - ggml_set_name(cur, "result_norm"); + cur = ggml_mul(ctx0, cur, model.output_norm); + cb(cur, "out_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.output_norm_b); + cb(cur, "result_norm"); } cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); ggml_free(ctx0); @@ -4904,7 +4183,9 @@ static struct ggml_cgraph * llm_build_starcoder( static struct ggml_cgraph * llm_build_persimmon( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; @@ -4926,14 +4207,11 @@ static struct ggml_cgraph * llm_build_persimmon( const float freq_scale = cparams.rope_freq_scale; const float norm_eps = hparams.f_norm_eps; - const int n_gpu_layers = model.n_gpu_layers; - - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; - const bool do_rope_shift = ggml_allocr_is_measure(lctx.alloc) || kv_self.has_shift; + const bool do_rope_shift = worst_case || kv_self.has_shift; auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { /*.mem_size =*/ buf_compute.size, /*.mem_buffer =*/ buf_compute.data, /*.no_alloc =*/ true, }; struct ggml_context * ctx0 = ggml_init(params); ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; struct ggml_tensor * inpL; if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_tokens, "inp_tokens"); - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } } - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; + cb(inpL, "inp_embd"); + + struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_pos, "inp_pos"); + // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd_head))); - } - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + cb(KQ_scale, "KQ_scale"); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); + cb(KQ_mask, "KQ_mask"); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } - - struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - offload_func_kq(KQ_pos); - ggml_set_name(KQ_pos, "KQ_pos"); - ggml_allocr_alloc(lctx.alloc, KQ_pos); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) KQ_pos->data; - for (int i = 0; i < n_tokens; ++i) { - data[i] = batch.pos[i]; - } - } if (do_rope_shift) { struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - offload_func_kq(K_shift); - ggml_set_name(K_shift, "K_shift"); - ggml_allocr_alloc(lctx.alloc, K_shift); - if (!ggml_allocr_is_measure(lctx.alloc)) { - int * data = (int *) K_shift->data; - for (int i = 0; i < n_ctx; ++i) { - data[i] = kv_self.cells[i].delta; - } - } + cb(K_shift, "K_shift"); + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions. 
@@ -5030,65 +4262,76 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.k)*(n_embd_head*n_ctx*il) ), K_shift, n_rot, 2, 0, freq_base, freq_scale); - offload_func_kq(tmp); + cb(tmp, "K_shifted"); ggml_build_forward_expand(gf, tmp); } } - for (int il=0; il < n_layer; ++il) { + + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * residual = inpL; - offload_func_t offload_func = llama_nop; + { cur = ggml_norm(ctx0, inpL, norm_eps); - offload_func(cur); + cb(cur, "attn_norm_0"); + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); - offload_func(cur); + cb(cur, "attn_norm_0_w"); + cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); - offload_func(cur); - ggml_format_name(cur, "input_layernorm_%d", il); + cb(cur, "attn_norm_0_wb"); } + // self attention { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + cb(cur, "wqkv"); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); - offload_func_kq(cur); + cb(cur, "bqkv"); // split qkv GGML_ASSERT(n_head_kv == n_head); - ggml_set_name(cur, format("qkv_%d", il).c_str()); + struct ggml_tensor * tmpqkv = ggml_reshape_4d(ctx0, cur, n_embd_head, 3, n_head, n_tokens); - offload_func_kq(tmpqkv); + cb(tmpqkv, "tmpqkv"); + struct ggml_tensor * tmpqkv_perm = ggml_cont(ctx0, ggml_permute(ctx0, tmpqkv, 0, 3, 1, 2)); - offload_func_kq(tmpqkv_perm); - ggml_format_name(tmpqkv_perm, "tmpqkv_perm_%d", il); + cb(tmpqkv_perm, "tmpqkv"); + struct ggml_tensor * tmpq = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, ggml_element_size(tmpqkv_perm) * n_embd_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, 0 ); - offload_func_kq(tmpq); + cb(tmpq, "tmpq"); + struct ggml_tensor * tmpk = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, ggml_element_size(tmpqkv_perm) * n_embd_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens ); - offload_func_kq(tmpk); + cb(tmpk, "tmpk"); + // Q/K Layernorm tmpq = ggml_norm(ctx0, tmpq, norm_eps); - offload_func_kq(tmpq); - tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); - offload_func_kq(tmpq); - tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); - offload_func_kq(tmpq); + cb(tmpq, "tmpq"); + + tmpq = ggml_mul(ctx0, tmpq, model.layers[il].attn_q_norm); + cb(tmpq, "tmpq"); + + tmpq = ggml_add(ctx0, tmpq, model.layers[il].attn_q_norm_b); + cb(tmpq, "tmpq"); tmpk = ggml_norm(ctx0, tmpk, norm_eps); - offload_func_v(tmpk); + cb(tmpk, "tmpk"); + tmpk = ggml_mul(ctx0, tmpk, model.layers[il].attn_k_norm); - offload_func_v(tmpk); + cb(tmpk, "tmpk"); + tmpk = ggml_add(ctx0, tmpk, model.layers[il].attn_k_norm_b); - offload_func_v(tmpk); + cb(tmpk, "tmpk"); // RoPE the first n_rot of q/k, pass the other half, and concat. 
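Reviewer note: for readers unfamiliar with partial rotary embeddings, the views built below feed only the first n_rot features of each head through ggml_rope_custom and pass the remaining features through untouched. A scalar reference sketch of that idea (illustration only, with adjacent-pair rotation for simplicity; ggml's NeoX-style pairing differs in detail):

#include <cmath>
#include <vector>

// Illustrative scalar version of partial RoPE: rotate the first n_rot
// features of one head in pairs, leave features [n_rot, head.size()) as-is.
static void rope_partial_ref(std::vector<float> & head, int n_rot, int pos,
                             float freq_base, float freq_scale) {
    for (int i = 0; i + 1 < n_rot; i += 2) {
        const float theta = freq_scale * pos * std::pow(freq_base, -float(i)/n_rot);
        const float c = std::cos(theta), s = std::sin(theta);
        const float x0 = head[i + 0], x1 = head[i + 1];
        head[i + 0] = x0*c - x1*s;
        head[i + 1] = x0*s + x1*c;
    }
    // the "pass" part (features n_rot and above) is concatenated back unchanged
}
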
struct ggml_tensor * qrot = ggml_view_3d( @@ -5097,16 +4340,15 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpq) * n_embd_head * n_head, 0 ); - offload_func_kq(qrot); - ggml_format_name(qrot, "qrot_%d", il); + cb(qrot, "qrot"); + struct ggml_tensor * krot = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, 0 ); - offload_func_kq(krot); - ggml_format_name(krot, "krot_%d", il); + cb(krot, "krot"); // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( @@ -5115,47 +4357,52 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpq) * n_embd_head * n_head, ggml_element_size(tmpq) * n_rot ); - offload_func_kq(qpass); - ggml_format_name(qpass, "qpass_%d", il); + cb(qpass, "qpass"); + struct ggml_tensor * kpass = ggml_view_3d( ctx0, tmpk, n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, ggml_element_size(tmpk) * n_rot ); - offload_func_kq(kpass); - ggml_format_name(kpass, "kpass_%d", il); + cb(kpass, "kpass"); - struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale + struct ggml_tensor * qrotated = ggml_rope_custom( + ctx0, qrot, inp_pos, n_rot, 2, 0, freq_base, freq_scale ); - offload_func_kq(qrotated); + cb(qrotated, "qrotated"); + struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, KQ_pos, n_rot, 2, 0, freq_base, freq_scale + ctx0, krot, inp_pos, n_rot, 2, 0, freq_base, freq_scale ); - offload_func_kq(krotated); + cb(krotated, "krotated"); + // ggml currently only supports concatenation on dim=2 // so we need to permute qrot, qpass, concat, then permute back. 
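Reviewer note: since ggml_concat() only joins along dim 2, the code below permutes, makes the data contiguous, concatenates, and permutes back. A hypothetical helper (not part of this change) expressing the same pattern for joining along ne[0]:

// Hypothetical helper: concatenate a and b along their first dimension ne[0]
// by routing through ggml_concat(), which joins along dim 2. This mirrors the
// permute -> cont -> concat -> permute sequence used below for qrot/qpass.
static struct ggml_tensor * concat_dim0(struct ggml_context * ctx,
                                        struct ggml_tensor  * a,
                                        struct ggml_tensor  * b) {
    a = ggml_cont(ctx, ggml_permute(ctx, a, 2, 1, 0, 3)); // swap ne[0] and ne[2]
    b = ggml_cont(ctx, ggml_permute(ctx, b, 2, 1, 0, 3));

    struct ggml_tensor * t = ggml_concat(ctx, a, b);      // join along dim 2

    return ggml_cont(ctx, ggml_permute(ctx, t, 2, 1, 0, 3)); // restore the layout
}
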
qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3)); - offload_func_kq(qrotated); + cb(qrotated, "qrotated"); + krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3)); - offload_func_kq(krotated); + cb(krotated, "krotated"); qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3)); - offload_func_kq(qpass); + cb(qpass, "qpass"); + kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3)); - offload_func_kq(kpass); + cb(kpass, "kpass"); struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass); - offload_func_kq(Qcur); + cb(Qcur, "Qcur"); + struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass); - offload_func_kq(Kcur); + cb(Kcur, "Kcur"); struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 1, 2, 0, 3)); - offload_func_kq(Q); + cb(Q, "Q"); Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3)); - offload_func_kq(Kcur); + cb(Kcur, "Kcur"); + { struct ggml_tensor * tmpv = ggml_view_3d( ctx0, tmpqkv_perm, n_embd_head, n_head, n_tokens, @@ -5163,24 +4410,22 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(tmpqkv_perm) * n_embd_head * n_head, ggml_element_size(tmpqkv_perm) * n_embd_head * n_head * n_tokens * 2 ); - offload_func_v(tmpv); + cb(tmpv, "tmpv"); + // store K, V in cache struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d( ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head) ); - offload_func_kq(k); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); - ggml_set_name(v, "v"); + cb(v, "v"); // important: storing RoPE-ed version of K in the KV cache! 
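Reviewer note: the k/v views above address the cache by layer and cell; the byte offset for the K cache view follows from its assumed per-layer layout of n_ctx cells with n_embd_gqa key elements each, stored layer after layer. A small illustration (hypothetical helper, same expression as the ggml_view_1d() of kv_self.k used in these builders):

// Hypothetical helper: byte offset of cell `kv_head` in layer `il` of kv_self.k,
// assuming n_ctx cells per layer and n_embd_gqa key elements per cell.
static size_t k_cache_byte_offset(size_t elem_size, int64_t n_embd_gqa,
                                  int64_t n_ctx, int il, int kv_head) {
    return elem_size*n_embd_gqa*(il*n_ctx + kv_head);
}

With n_tokens cells written starting at that offset, the copy of the (already RoPE-ed) Kcur lands directly after the keys of the preceding cells of the same layer.
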
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); @@ -5191,25 +4436,19 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - - offload_func_kq(K); - ggml_format_name(K, "K_%d", il); + cb(K, "K"); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - offload_func_kq(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -5217,91 +4456,336 @@ static struct ggml_cgraph * llm_build_persimmon( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); + cb(cur, "result_wo"); + cur = ggml_add(ctx0, cur, model.layers[il].bo); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo_b"); } struct ggml_tensor * inpFF = ggml_add(ctx0, residual, cur); - offload_func(inpFF); - ggml_set_name(inpFF, "inpFF"); + cb(inpFF, "inpFF"); + { // MLP { // Norm cur = ggml_norm(ctx0, inpFF, norm_eps); - offload_func(cur); - cur = ggml_add(ctx0, - ggml_mul(ctx0, cur, model.layers[il].ffn_norm), - model.layers[il].ffn_norm_b - ); - ggml_set_name(cur, "ffn_norm"); - offload_func(cur); + cb(cur, "ffn_norm_0"); + + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + cb(cur, "ffn_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); + cb(cur, "ffn_norm_0_wb"); } + cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(cur); + cb(cur, "result_w3"); cur = ggml_add(ctx0, cur, model.layers[il].b3); - offload_func(cur); - ggml_set_name(cur, "result_ffn_up"); + cb(cur, "result_w3_b"); - cur = ggml_sqr(ctx0, ggml_relu(ctx0, cur)); - ggml_set_name(cur, "result_ffn_act"); - offload_func(cur); - offload_func(cur->src[0]); + cur = ggml_relu(ctx0, cur); + cb(cur, "relu"); + + cur = ggml_sqr(ctx0, cur); + cb(cur, "sqr(relu)"); cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); - cur = ggml_add(ctx0, - cur, - model.layers[il].b2); - offload_func(cur); - ggml_set_name(cur, "outFF"); + cb(cur, "result_w2"); + + cur = ggml_add(ctx0, cur, model.layers[il].b2); + cb(cur, "result_w2_b"); } + cur = ggml_add(ctx0, cur, inpFF); - offload_func(cur); - ggml_set_name(cur, "inpFF_+_outFF"); + cb(cur, "inpFF_+_result_w2"); + inpL = cur; } + cur = inpL; + { cur = ggml_norm(ctx0, cur, norm_eps); - 
offload_func_nr(cur); + cb(cur, "out_norm_0"); + cur = ggml_mul(ctx0, cur, model.output_norm); - offload_func_nr(cur); + cb(cur, "out_norm_0_w"); cur = ggml_add(ctx0, cur, model.output_norm_b); - // offload_func_nr(cur); - - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } + cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); + ggml_build_forward_expand(gf, cur); + ggml_free(ctx0); + + return gf; +} + +static struct ggml_cgraph * llm_build_refact( + llama_context & lctx, + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { + const auto & model = lctx.model; + const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + const float norm_rms_eps = hparams.f_norm_rms_eps; + + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; + + // printf("n_kv = %d\n", n_kv); + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_tokens, "inp_tokens"); + + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + } + cb(inpL, "inp_embd"); + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + cb(KQ_scale, "KQ_scale"); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + cb(KQ_mask, "KQ_mask"); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + { + cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps); + cb(cur, "rms_norm_0"); + + // cur = cur*attn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm); + cb(cur, "attn_norm_0"); + } + + // self-attention + { + // compute Q and K + struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(tmpk, "tmpk"); + + struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(tmpq, "tmpq"); + + struct ggml_tensor * Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur"); + + struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur"); + + // store key and value to memory + { + // compute the transposed [n_tokens, n_embd] V matrix + + struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(tmpv, "tmpv"); + + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, n_tokens)); + cb(Vcur, "Vcur"); + + struct ggml_tensor * k = 
ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + cb(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + cb(v, "v"); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + cb(Q, "Q"); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_kv, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + cb(K, "K"); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + cb(KQ, "KQ"); + + // KQ_scaled = KQ / sqrt(n_embd_head) + // KQ_scaled shape [n_kv, n_tokens, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); + cb(KQ_scaled, "KQ_scaled"); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ 0, n_head, 8); + cb(KQ_scaled_alibi, "KQ_scaled_alibi"); + + struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); + cb(KQ_masked, "KQ_masked"); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + cb(KQ_soft_max, "KQ_soft_max"); + + // split cached V into n_head heads + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_kv, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + cb(V, "V"); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + cb(KQV, "KQV"); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + cb(KQV_merged, "KQV_merged"); + + // cur = KQV_merged.contiguous().view(n_embd, n_tokens) + cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); + cb(cur, "KQV_merged_contiguous"); + + // projection (no bias) + cur = ggml_mul_mat(ctx0, + model.layers[il].wo, + cur); + cb(cur, "result_wo"); + } + + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); + cb(inpFF, "inpFF"); + + // feed-forward network + { + // norm + { + cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps); + cb(cur, "rms_norm_1"); + + // cur = cur*ffn_norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + cb(cur, "ffn_norm"); + } + + struct ggml_tensor * tmp = ggml_mul_mat(ctx0, + model.layers[il].w3, + cur); + cb(tmp, "result_w3"); + + cur = ggml_mul_mat(ctx0, + model.layers[il].w1, + cur); + cb(cur, "result_w1"); + + // SILU activation + cur = ggml_silu(ctx0, cur); + cb(cur, "silu"); + + cur = ggml_mul(ctx0, cur, tmp); + cb(cur, "silu_x_result_w3"); + + cur = ggml_mul_mat(ctx0, + model.layers[il].w2, + cur); + cb(cur, "result_w2"); + } + + cur = ggml_add(ctx0, cur, inpFF); + cb(cur, "inpFF_+_result_w2"); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + // norm + { + cur = ggml_rms_norm(ctx0, cur, norm_rms_eps); + cb(cur, "rms_norm_2"); + + // cur = cur*norm(broadcasted) + cur = ggml_mul(ctx0, cur, model.output_norm); + cb(cur, "result_norm"); + } + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output"); + + ggml_build_forward_expand(gf, cur); + + ggml_free(ctx0); + return gf; } static struct 
ggml_cgraph * llm_build_bloom( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -5323,8 +4807,8 @@ static struct ggml_cgraph * llm_build_bloom( const float norm_eps = hparams.f_norm_eps; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -5341,99 +4825,87 @@ static struct ggml_cgraph * llm_build_bloom( ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * cur; - struct ggml_tensor * token; + struct ggml_tensor * embd; struct ggml_tensor * inpL; if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + cb(inp_tokens, "inp_tokens"); - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - } - ggml_set_name(inp_tokens, "inp_tokens"); - - token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { #ifdef GGML_USE_MPI GGML_ASSERT(false && "not implemented"); #endif - token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, token); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token)); - } + embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); } + cb(embd, "inp_embd"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } + cb(KQ_mask, "KQ_mask"); // norm { - inpL = ggml_norm(ctx0, token, norm_eps); - inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b); - } + inpL = ggml_norm(ctx0, embd, norm_eps); + cb(inpL, "inp_norm"); - ggml_set_name(inpL, "inpL"); + inpL = ggml_mul(ctx0, inpL, model.tok_norm); + cb(inpL, "inp_norm_w"); + + inpL = ggml_add (ctx0, inpL, model.tok_norm_b); + cb(inpL, "inp_norm_wb"); + } for (int il = 0; il < n_layer; ++il) { { // Norm cur = ggml_norm(ctx0, inpL, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); + cb(cur, "attn_norm_0"); + + cur = 
ggml_mul(ctx0, cur, model.layers[il].attn_norm); + cb(cur, "attn_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].attn_norm_b); + cb(cur, "attn_norm_0_wb"); } { // Self Attention - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv"); - struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd); - struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv"); - struct ggml_tensor * Qcur = tmpq; + struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(tmpq, "tmpq"); + cb(tmpk, "tmpk"); + cb(tmpv, "tmpv"); + + struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens); struct ggml_tensor * Kcur = tmpk; // store key and value to memory { struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - ggml_set_name(Vcur, "Vcur"); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + cb(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); @@ -5445,7 +4917,7 @@ static struct ggml_cgraph * llm_build_bloom( Qcur, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), 0, 2, 1, 3); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -5453,27 +4925,27 @@ static struct ggml_cgraph * llm_build_bloom( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - ggml_set_name(K, "K"); + cb(K, "K"); // K * Q struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); // KQ_scaled = KQ / sqrt(n_embd_head) // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + cb(KQ_scaled_alibi, "KQ_scaled_alibi"); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); // KQ = soft_max(KQ_masked) struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); // split cached V into n_head 
heads struct ggml_tensor * V = @@ -5482,25 +4954,30 @@ static struct ggml_cgraph * llm_build_bloom( ggml_element_size(kv_self.v)*n_ctx, ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); // KQV_merged = KQV.permute(0, 2, 1, 3) struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, n_tokens) cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); } // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); + cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + cb(cur, "result_wo"); + + cur = ggml_add(ctx0, cur, model.layers[il].bo); + cb(cur, "result_wo_b"); // Add the input cur = ggml_add(ctx0, cur, inpL); + cb(cur, "inpL_+_result_wo"); struct ggml_tensor * inpFF = cur; @@ -5509,30 +4986,49 @@ static struct ggml_cgraph * llm_build_bloom( // Norm { cur = ggml_norm(ctx0, inpFF, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); + cb(cur, "ffn_norm_0"); + + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + cb(cur, "ffn_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.layers[il].ffn_norm_b); + cb(cur, "ffn_norm_0_wb"); } - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); + cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); + cb(cur, "result_w3"); + + cur = ggml_add(ctx0, cur, model.layers[il].b3); + cb(cur, "result_w3_b"); - // GELU activation cur = ggml_gelu(ctx0, cur); + cb(cur, "gelu"); - // Projection - cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); + cb(cur, "result_w2"); + + cur = ggml_add(ctx0, cur, model.layers[il].b2); + cb(cur, "result_w2_b"); } inpL = ggml_add(ctx0, cur, inpFF); + cb(inpL, "inpFF_+_result_w2"); } // Output Norm { cur = ggml_norm(ctx0, inpL, norm_eps); - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); + cb(cur, "out_norm_0"); + + cur = ggml_mul(ctx0, cur, model.output_norm); + cb(cur, "out_norm_0_w"); + + cur = ggml_add(ctx0, cur, model.output_norm_b); + cb(cur, "result_norm"); } - ggml_set_name(cur, "result_norm"); cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); @@ -5543,7 +5039,9 @@ static struct ggml_cgraph * llm_build_bloom( static struct ggml_cgraph * llm_build_mpt( llama_context & lctx, - const llama_batch & batch) { + const llama_batch & batch, + const llm_build_cb & cb, + bool worst_case) { const auto & model = lctx.model; const auto & hparams = model.hparams; const auto & cparams = lctx.cparams; @@ -5564,11 +5062,9 @@ static struct ggml_cgraph * llm_build_mpt( const float clamp_kqv = hparams.f_clamp_kqv; const float max_alibi_bias = hparams.f_max_alibi_bias; - const int n_gpu_layers = model.n_gpu_layers; - const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? 
n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head; auto & buf_compute = lctx.buf_compute; @@ -5587,17 +5083,9 @@ static struct ggml_cgraph * llm_build_mpt( struct ggml_tensor * cur; struct ggml_tensor * inpL; - //int warmup = 0; if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inp_tokens); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); - //warmup = ((uint32_t*) inp_tokens->data)[0] == 0; - } - - ggml_set_name(inp_tokens, "inp_tokens"); + cb(inp_tokens, "inp_tokens"); inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); } else { @@ -5606,84 +5094,28 @@ static struct ggml_cgraph * llm_build_mpt( #endif inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(lctx.alloc, inpL); - if (!ggml_allocr_is_measure(lctx.alloc)) { - memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); - } } - - const int i_gpu_start = n_layer - n_gpu_layers; - (void) i_gpu_start; - - // offload functions set the tensor output backend to GPU - // tensors are GPU-accelerated if any input or the output has been offloaded - offload_func_t offload_func_nr = llama_nop; // nr = non-repeating - offload_func_t offload_func_kq = llama_nop; - offload_func_t offload_func_v = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (n_gpu_layers > n_layer) { - offload_func_nr = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 1) { - offload_func_v = ggml_cuda_assign_buffers_no_alloc; - } - if (n_gpu_layers > n_layer + 2) { - offload_func_kq = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS + cb(inpL, "inp_embd"); // KQ_scale struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - ggml_allocr_alloc(lctx.alloc, KQ_scale); - if (!ggml_allocr_is_measure(lctx.alloc)) { - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - } + cb(KQ_scale, "KQ_scale"); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - offload_func_kq(KQ_mask); - ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(lctx.alloc, KQ_mask); - if (!ggml_allocr_is_measure(lctx.alloc)) { - float * data = (float *) KQ_mask->data; - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const llama_pos pos = batch.pos[j]; - const llama_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - } + cb(KQ_mask, "KQ_mask"); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; - offload_func_t offload_func = llama_nop; - -#ifdef GGML_USE_CUBLAS - if (il >= i_gpu_start) { - offload_func = ggml_cuda_assign_buffers_no_alloc; - } -#endif // GGML_USE_CUBLAS - // self-attention // TODO: refactor into common function (shared with LLaMA) { attn_norm = ggml_norm(ctx0, inpL, norm_eps); - offload_func(attn_norm); + cb(attn_norm, "attn_norm_0"); attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); - offload_func(attn_norm); + cb(attn_norm, "attn_norm_0_w"); if (1) { cur = attn_norm; @@ -5692,11 +5124,11 @@ static struct ggml_cgraph * llm_build_mpt( // compute QKV cur = 
ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); - offload_func_kq(cur); + cb(cur, "wqkv"); if (clamp_kqv > 0.0f) { cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv); - offload_func_kq(cur); + cb(cur, "wqkv_clamped"); } const size_t wsize = ggml_type_size(cur->type); @@ -5706,47 +5138,43 @@ static struct ggml_cgraph * llm_build_mpt( wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), 0); - offload_func_kq(Qcur); + cb(Qcur, "Qcur"); struct ggml_tensor * Kcur = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * n_head); - offload_func_kq(Kcur); + cb(Kcur, "Kcur"); struct ggml_tensor * tmpv = ggml_view_3d( ctx0, cur, n_embd_head, n_head_kv, n_tokens, wsize * n_embd_head, wsize * n_embd_head * (n_head + 2 * n_head_kv), wsize * n_embd_head * (n_head + n_head_kv)); - offload_func_kq(Kcur); - - ggml_set_name(Qcur, "Qcur"); - ggml_set_name(Kcur, "Kcur"); + cb(tmpv, "tmpv"); { - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); - offload_func_v(Vcur); - offload_func_v(Vcur->src[0]->src[0]); - ggml_set_name(Vcur, "Vcur"); + struct ggml_tensor * Vcur = ggml_cont(ctx0, tmpv); + cb(Vcur, "Vcur"); + + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)); + cb(Vcur, "Vcur"); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); - offload_func_kq(k); - ggml_set_name(k, "k"); + cb(k, "k"); struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); - offload_func_v(v); + cb(v, "v"); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); - offload_func_kq(Q); - ggml_set_name(Q, "Q"); + cb(Q, "Q"); struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, @@ -5754,30 +5182,23 @@ static struct ggml_cgraph * llm_build_mpt( ggml_element_size(kv_self.k)*n_embd_gqa, ggml_element_size(kv_self.k)*n_embd_head, ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); - offload_func_kq(K); - ggml_set_name(K, "K"); + cb(K, "K"); struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - offload_func_kq(KQ); - ggml_set_name(KQ, "KQ"); + cb(KQ, "KQ"); struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - offload_func_kq(KQ_scaled); - ggml_set_name(KQ_scaled, "KQ_scaled"); + cb(KQ_scaled, "KQ_scaled"); // TODO: replace with ggml_add() - struct ggml_tensor * KQ_scaled_alibi = - ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); - offload_func_kq(KQ_scaled_alibi); - ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); + cb(KQ_scaled_alibi, "KQ_scaled_alibi"); struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); - offload_func_kq(KQ_masked); - ggml_set_name(KQ_masked, "KQ_masked"); + cb(KQ_masked, "KQ_masked"); struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - offload_func_v(KQ_soft_max); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); + cb(KQ_soft_max, "KQ_soft_max"); struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, @@ -5785,29 +5206,24 @@ static struct ggml_cgraph * llm_build_mpt( ggml_element_size(kv_self.v)*n_ctx, 
ggml_element_size(kv_self.v)*n_ctx*n_embd_head, ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); - offload_func_v(V); - ggml_set_name(V, "V"); + cb(V, "V"); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - offload_func_v(KQV); - ggml_set_name(KQV, "KQV"); + cb(KQV, "KQV"); struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - offload_func_v(KQV_merged); - ggml_set_name(KQV_merged, "KQV_merged"); + cb(KQV_merged, "KQV_merged"); cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); - offload_func_v(cur); - ggml_set_name(cur, "KQV_merged_contiguous"); + cb(cur, "KQV_merged_contiguous"); cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); - offload_func(cur); - ggml_set_name(cur, "result_wo"); + cb(cur, "result_wo"); } // Add the input cur = ggml_add(ctx0, cur, inpL); - offload_func(cur); + cb(cur, "inpL_+_result_wo"); struct ggml_tensor * attn_out = cur; @@ -5816,23 +5232,25 @@ static struct ggml_cgraph * llm_build_mpt( // Norm { cur = ggml_norm(ctx0, attn_out, norm_eps); - offload_func(cur); + cb(cur, "ffn_norm_0"); cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); - offload_func(cur); + cb(cur, "ffn_norm_0_w"); } cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); - offload_func(cur); + cb(cur, "result_w3"); cur = ggml_gelu(ctx0, cur); - offload_func(cur); + cb(cur, "gelu"); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); - offload_func(cur); + cb(cur, "result_w2"); } cur = ggml_add(ctx0, cur, attn_out); - offload_func(cur); + cb(cur, "inpL_+_inpFF_+_result_w2"); + // input for next layer inpL = cur; } @@ -5842,14 +5260,14 @@ static struct ggml_cgraph * llm_build_mpt( // norm { cur = ggml_norm(ctx0, cur, norm_eps); - offload_func_nr(cur); + cb(cur, "out_norm_0"); cur = ggml_mul(ctx0, cur, model.output_norm); - ggml_set_name(cur, "result_norm"); + cb(cur, "result_norm"); } cur = ggml_mul_mat(ctx0, model.output, cur); - ggml_set_name(cur, "result_output"); + cb(cur, "result_output"); ggml_build_forward_expand(gf, cur); @@ -5858,45 +5276,461 @@ static struct ggml_cgraph * llm_build_mpt( return gf; } +enum llm_offload_func_e { + OFFLOAD_FUNC_NOP, + OFFLOAD_FUNC, + OFFLOAD_FUNC_KQ, + OFFLOAD_FUNC_V, + OFFLOAD_FUNC_NR, + OFFLOAD_FUNC_EMB, + OFFLOAD_FUNC_OUT, +}; + +struct llm_offload_trie { + struct node { + ~node() { + for (int i = 0; i < 256; ++i) { + if (children[i]) { + delete children[i]; + } + } + } + + node * children[256] = { nullptr }; + llm_offload_func_e func = OFFLOAD_FUNC_NOP; + }; + + llm_offload_trie() { + root = new node; + } + + llm_offload_trie(const std::unordered_map<std::string, llm_offload_func_e> & map) { + root = new node; + + for (const auto & kv : map) { + add(kv.first, kv.second); + } + } + + ~llm_offload_trie() { + delete root; + } + + void add(const char * name, llm_offload_func_e func) { + node * cur = root; + + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; + + if (!c) { + break; + } + + if (!cur->children[c]) { + cur->children[c] = new node; + } + + cur = cur->children[c]; + } + + cur->func = func; + } + + llm_offload_func_e find(const char * name) const { + const node * cur = root; + + for (int i = 0; ; ++i) { + const uint8_t c = name[i]; + + if (!c) { + break; + } + + if (!cur->children[c]) { + return OFFLOAD_FUNC_NOP; + } + + cur = cur->children[c]; + } + + return cur->func; + } + + node * root = nullptr; +}; + +static const std::unordered_map<std::string, llm_offload_func_e> k_offload_map = { + { "inp_pos", OFFLOAD_FUNC_KQ }, + { "KQ_mask", OFFLOAD_FUNC_KQ }, + { "K_shift", OFFLOAD_FUNC_KQ }, + { "K_shifted", OFFLOAD_FUNC_KQ }, + + { "inp_norm", OFFLOAD_FUNC_NR }, + { 
"inp_norm_w", OFFLOAD_FUNC_NR }, + { "inp_norm_wb", OFFLOAD_FUNC_NR }, + + { "rms_norm_0", OFFLOAD_FUNC }, + + { "attn_norm_0", OFFLOAD_FUNC }, + { "attn_norm_0_w", OFFLOAD_FUNC }, + { "attn_norm_0_wb", OFFLOAD_FUNC }, + + { "attn_norm_2", OFFLOAD_FUNC }, + { "attn_norm_2_w", OFFLOAD_FUNC }, + { "attn_norm_2_wb", OFFLOAD_FUNC }, + + { "wqkv", OFFLOAD_FUNC_KQ }, + { "bqkv", OFFLOAD_FUNC_KQ }, + { "wqkv_clamped", OFFLOAD_FUNC_KQ }, + + { "tmpk", OFFLOAD_FUNC_KQ }, + { "tmpq", OFFLOAD_FUNC_KQ }, + { "tmpv", OFFLOAD_FUNC_V }, + { "tmpkqv", OFFLOAD_FUNC_KQ }, // ?? + { "Kcur", OFFLOAD_FUNC_KQ }, + { "Qcur", OFFLOAD_FUNC_KQ }, + { "Vcur", OFFLOAD_FUNC_V }, + { "Vcur_0", OFFLOAD_FUNC_V }, + { "Vcur_1", OFFLOAD_FUNC_V }, + + { "krot", OFFLOAD_FUNC_KQ }, + { "qrot", OFFLOAD_FUNC_KQ }, + { "kpass", OFFLOAD_FUNC_KQ }, + { "qpass", OFFLOAD_FUNC_KQ }, + { "krotated", OFFLOAD_FUNC_KQ }, + { "qrotated", OFFLOAD_FUNC_KQ }, + + { "k", OFFLOAD_FUNC_KQ }, + { "v", OFFLOAD_FUNC_V }, + + { "Q", OFFLOAD_FUNC_KQ }, + { "K", OFFLOAD_FUNC_KQ }, + { "KQ", OFFLOAD_FUNC_KQ }, + { "KQ_scaled", OFFLOAD_FUNC_KQ }, + { "KQ_scaled_alibi", OFFLOAD_FUNC_KQ }, + { "KQ_masked", OFFLOAD_FUNC_KQ }, + { "KQ_soft_max", OFFLOAD_FUNC_V }, + { "V", OFFLOAD_FUNC_V }, + { "KQV", OFFLOAD_FUNC_V }, + { "KQV_merged", OFFLOAD_FUNC_V }, + { "KQV_merged_contiguous", OFFLOAD_FUNC_V }, + + { "result_wo", OFFLOAD_FUNC }, + { "result_wo_b", OFFLOAD_FUNC }, + { "inpL_+_result_wo", OFFLOAD_FUNC }, + + { "inpFF", OFFLOAD_FUNC }, + + { "rms_norm_1", OFFLOAD_FUNC }, + { "ffn_norm", OFFLOAD_FUNC }, + { "ffn_norm_0", OFFLOAD_FUNC }, + { "ffn_norm_0_w", OFFLOAD_FUNC }, + { "ffn_norm_0_wb", OFFLOAD_FUNC }, + + { "result_w3", OFFLOAD_FUNC }, + { "result_w3_b", OFFLOAD_FUNC }, + { "result_w2", OFFLOAD_FUNC }, + { "result_w2_b", OFFLOAD_FUNC }, + { "result_w1", OFFLOAD_FUNC }, + + { "silu", OFFLOAD_FUNC }, + { "gelu", OFFLOAD_FUNC }, + { "relu", OFFLOAD_FUNC }, + { "sqr(relu)", OFFLOAD_FUNC }, + + { "silu_x_result_w3", OFFLOAD_FUNC }, + { "inpFF_+_result_w2", OFFLOAD_FUNC }, + { "inpL_+_inpFF_+_result_w2", OFFLOAD_FUNC }, + + { "rms_norm_2", OFFLOAD_FUNC_NR }, + { "out_norm_0", OFFLOAD_FUNC_NR }, + { "out_norm_0_w", OFFLOAD_FUNC_NR }, + + { "result_norm", OFFLOAD_FUNC_EMB }, + { "result_output", OFFLOAD_FUNC_OUT }, +}; + +static llm_offload_trie k_offload_func_trie(k_offload_map); + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch) { const auto & model = lctx.model; + // check if we should build the worst-case graph (for memory measurement) + const bool worst_case = ggml_allocr_is_measure(lctx.alloc); + + // count the number of times a tensor with a given name has been offloaded + std::unordered_map offload_n; + + // keep track of the input that has already been allocated + bool alloc_inp_tokens = false; + bool alloc_inp_embd = false; + bool alloc_inp_pos = false; + bool alloc_inp_KQ_scale = false; + bool alloc_inp_KQ_mask = false; + bool alloc_inp_K_shift = false; + + // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
+ llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name) { + ggml_set_name(cur, name); + + // + // allocate input tensors and set input data + // + + if (batch.token && !alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; + + memcpy(cur->data, batch.token, n_tokens*ggml_element_size(cur)); + } + + alloc_inp_tokens = true; + } + + if (batch.embd && !alloc_inp_embd && strcmp(name, "inp_embd") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + + memcpy(cur->data, batch.embd, n_tokens*n_embd*ggml_element_size(cur)); + } + + alloc_inp_embd = true; + } + + if (batch.pos && !alloc_inp_pos && strcmp(name, "inp_pos") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_tokens = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_tokens; ++i) { + data[i] = batch.pos[i]; + } + } + + alloc_inp_pos = true; + } + + if (!alloc_inp_KQ_scale && strcmp(name, "KQ_scale") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_embd_head = model.hparams.n_embd_head(); + ggml_set_f32(cur, 1.0f/sqrtf(float(n_embd_head))); + } + + alloc_inp_KQ_scale = true; + } + + if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_kv = cur->ne[0]; + const int64_t n_tokens = cur->ne[1]; + + float * data = (float *) cur->data; + memset(data, 0, ggml_nbytes(cur)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_kv; ++i) { + if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + + alloc_inp_KQ_mask = true; + } + + if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { + ggml_allocr_alloc(lctx.alloc, cur); + + if (!ggml_allocr_is_measure(lctx.alloc)) { + const int64_t n_ctx = cur->ne[0]; + + int32_t * data = (int32_t *) cur->data; + + for (int i = 0; i < n_ctx; ++i) { + data[i] = lctx.kv_self.cells[i].delta; + } + } + + alloc_inp_K_shift = true; + } + + // + // offload layers + // + // TODO: this code will be obsoleted with backend v2 + +#ifdef GGML_USE_CUBLAS + const bool do_offload = true; +#else + const bool do_offload = false; +#endif + + if (!do_offload) { + return; + } + + // view tensors are not offloaded + if (cur->view_src != nullptr) { + return; + } + + const int n_layer = model.hparams.n_layer; + + const int n_gpu_layers = model.n_gpu_layers; + const int i_gpu_start = n_layer - n_gpu_layers; + + // should we offload the final norm? 
yes if we are not computing embeddings + const bool offload_emb = lctx.embedding.empty(); + + static const std::unordered_map<llm_offload_func_e, std::string> k_offload_func_name = { + { OFFLOAD_FUNC_NOP, "CPU" }, +#ifdef GGML_USE_CUBLAS + { OFFLOAD_FUNC, "GPU (CUDA)" }, + { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" }, + { OFFLOAD_FUNC_V, "GPU (CUDA) V" }, + { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, + { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, + { OFFLOAD_FUNC_OUT, "GPU (CUDA) OUT" }, +#else + { OFFLOAD_FUNC, "CPU" }, + { OFFLOAD_FUNC_KQ, "CPU" }, + { OFFLOAD_FUNC_V, "CPU" }, + { OFFLOAD_FUNC_NR, "CPU" }, + { OFFLOAD_FUNC_EMB, "CPU" }, + { OFFLOAD_FUNC_OUT, "CPU" }, +#endif // GGML_USE_CUBLAS + }; + + // check the global map for what offload function to use for this tensor + llm_offload_func_e func_e = k_offload_func_trie.find(cur->name); + + if (func_e == OFFLOAD_FUNC_NOP) { + // if a tensor hasn't been offloaded, we warn the user + if (worst_case) { + LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, + cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); + } + + return; + } + + // count the number of layers and respect the provided n_gpu_layers + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: + break; + case OFFLOAD_FUNC: + if (n_gpu_layers < n_layer) { + if (offload_n[cur->name]++ < i_gpu_start) { + func_e = OFFLOAD_FUNC_NOP; + } + } + break; + case OFFLOAD_FUNC_NR: + if (n_gpu_layers <= n_layer + 0) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_V: + if (n_gpu_layers <= n_layer + 1) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_KQ: + if (n_gpu_layers <= n_layer + 2) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + case OFFLOAD_FUNC_EMB: + if (!offload_emb || n_gpu_layers < n_layer) { + func_e = OFFLOAD_FUNC_NOP; + } + break; + default: GGML_ASSERT(false); + } + + offload_func_t func = ggml_offload_nop; + + // this is needed for compatibility with Metal for example +#ifdef GGML_USE_CUBLAS + static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; +#else + static offload_func_t ggml_offload_gpu = ggml_offload_nop; +#endif + + switch (func_e) { + case OFFLOAD_FUNC_NOP: + case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; + case OFFLOAD_FUNC: + case OFFLOAD_FUNC_KQ: + case OFFLOAD_FUNC_V: + case OFFLOAD_FUNC_NR: + case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; + default: GGML_ASSERT(false); + } + + // apply offload function to the tensor + func(cur); + + if (worst_case) { + LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); + } + }; + struct ggml_cgraph * result = NULL; switch (model.arch) { case LLM_ARCH_LLAMA: { - result = llm_build_llama(lctx, batch); + result = llm_build_llama(lctx, batch, cb, worst_case); } break; case LLM_ARCH_BAICHUAN: { - result = llm_build_baichaun(lctx, batch); + result = llm_build_baichaun(lctx, batch, cb, worst_case); } break; case LLM_ARCH_FALCON: { - result = llm_build_falcon(lctx, batch); + result = llm_build_falcon(lctx, batch, cb, worst_case); } break; case LLM_ARCH_STARCODER: { - result = llm_build_starcoder(lctx, batch); + result = llm_build_starcoder(lctx, batch, cb, worst_case); } break; case LLM_ARCH_PERSIMMON: { - result = llm_build_persimmon(lctx, batch); + result = llm_build_persimmon(lctx, batch, cb, worst_case); } break; case LLM_ARCH_REFACT: { - result = llm_build_refact(lctx, batch); + result = llm_build_refact(lctx, batch, cb, worst_case); } break; case LLM_ARCH_BLOOM: { - result = llm_build_bloom(lctx, batch); + result = 
llm_build_bloom(lctx, batch, cb, worst_case); } break; case LLM_ARCH_MPT: { - result = llm_build_mpt(lctx, batch); + result = llm_build_mpt(lctx, batch, cb, worst_case); } break; default: GGML_ASSERT(false); @@ -6044,11 +5878,13 @@ static int llama_decode_internal( } // If all tensors can be run on the GPU then using more than 1 thread is detrimental. - const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA || + const bool full_offload_supported = + model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_BAICHUAN || - model.arch == LLM_ARCH_FALCON || - model.arch == LLM_ARCH_REFACT || + model.arch == LLM_ARCH_FALCON || + model.arch == LLM_ARCH_REFACT || model.arch == LLM_ARCH_MPT; + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { n_threads = 1; @@ -6094,6 +5930,8 @@ static int llama_decode_internal( //} // extract logits + // TODO: do not compute and extract logits if only embeddings are needed + // need to update the graphs to skip "result_output" { auto & logits_out = lctx.logits; @@ -8684,8 +8522,8 @@ static int llama_apply_lora_from_file_internal( ggml_tensor * dest_t = model_tensors[base_name]; - offload_func_t offload_func = llama_nop; - offload_func_t offload_func_force_inplace = llama_nop; + offload_func_t offload_func = ggml_offload_nop; + offload_func_t offload_func_force_inplace = ggml_offload_nop; #ifdef GGML_USE_CUBLAS if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {