diff --git a/llama.cpp b/llama.cpp
index d6b2d7289..a3a4ba6f6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3548,11 +3548,11 @@ static struct ggml_cgraph * llm_build_llama(
                     model.layers[il].ffn_gate, NULL,
                     model.layers[il].ffn_down, NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-            cb(cur, "ffn_result", il);
+            cb(cur, "ffn_out", il);
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
-        cb(cur, "inpFF_+_result_w2", il);
+        cb(cur, "inpFF_ffn_out", il);
 
         // input for next layer
         inpL = cur;
@@ -3714,11 +3714,11 @@ static struct ggml_cgraph * llm_build_baichaun(
                     model.layers[il].ffn_gate, NULL,
                     model.layers[il].ffn_down, NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-            cb(cur, "ffn_result", il);
+            cb(cur, "ffn_out", il);
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
-        cb(cur, "inpFF_+_result_w2", il);
+        cb(cur, "inpFF_ffn_out", il);
 
         // input for next layer
         inpL = cur;
@@ -3884,14 +3884,14 @@ static struct ggml_cgraph * llm_build_falcon(
                     NULL,                      NULL,
                     model.layers[il].ffn_down, NULL,
                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-            cb(cur, "ffn_result", il);
+            cb(cur, "ffn_out", il);
         }
 
         cur = ggml_add(ctx0, cur, attn_out);
-        cb(cur, "inpFF_+_result_w2", il);
+        cb(cur, "inpFF_ffn_out", il);
 
         cur = ggml_add(ctx0, cur, inpL);
-        cb(cur, "inpL_+_inpFF_+_result_w2", il);
+        cb(cur, "inpL_inpFF_ffn_out", il);
 
         // input for next layer
         inpL = cur;
@@ -3988,6 +3988,7 @@ static struct ggml_cgraph * llm_build_starcoder(
     cb(KQ_mask, "KQ_mask", -1);
 
     pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos);
+    cb(pos, "pos_embd", -1);
 
     inpL = ggml_add(ctx0, embd, pos);
     cb(inpL, "inpL", -1);
@@ -4027,7 +4028,7 @@ static struct ggml_cgraph * llm_build_starcoder(
 
         // Add the input
         cur = ggml_add(ctx0, cur, inpL);
-        cb(cur, "inpL_+_result_wo", il);
+        cb(cur, "inpL_kqv_out", il);
 
         struct ggml_tensor * inpFF = cur;
 
@@ -4044,11 +4045,11 @@ static struct ggml_cgraph * llm_build_starcoder(
                     NULL,                      NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-            cb(cur, "ffn_result", il);
+            cb(cur, "ffn_out", il);
         }
 
         inpL = ggml_add(ctx0, cur, inpFF);
-
+        cb(inpL, "inpL_inpFF_ffn_out", il);
     }
 
     cur = llm_build_norm(ctx0, inpL,
@@ -4294,11 +4295,11 @@ static struct ggml_cgraph * llm_build_persimmon(
                     NULL,                      NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                     LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
-            cb(cur, "ffn_result", il);
+            cb(cur, "ffn_out", il);
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
-        cb(cur, "inpFF_+_result_w2", il);
+        cb(cur, "inpFF_ffn_out", il);
 
         inpL = cur;
     }
@@ -4432,11 +4433,11 @@ static struct ggml_cgraph * llm_build_refact(
                     model.layers[il].ffn_gate, NULL,
                     model.layers[il].ffn_down, NULL,
                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
-            cb(cur, "ffn_result", il);
+            cb(cur, "ffn_out", il);
         }
 
         cur = ggml_add(ctx0, cur, inpFF);
-        cb(cur, "inpFF_+_result_w2", il);
+        cb(cur, "inpFF_ffn_out", il);
 
         // input for next layer
         inpL = cur;
@@ -4569,7 +4570,7 @@ static struct ggml_cgraph * llm_build_bloom(
 
         // Add the input
        cur = ggml_add(ctx0, cur, inpL);
-        cb(cur, "inpL_+_result_wo", il);
+        cb(cur, "inpL_kqv_out", il);
 
         struct ggml_tensor * inpFF = cur;
 
@@ -4586,11 +4587,11 @@ static struct ggml_cgraph * llm_build_bloom(
                     NULL,                      NULL,
                     model.layers[il].ffn_down, model.layers[il].ffn_down_b,
                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-            cb(cur, "ffn_result", il);
+            cb(cur, "ffn_out", il);
         }
 
         inpL = ggml_add(ctx0, cur, inpFF);
-        cb(inpL, "inpFF_+_result_w2", il);
+        cb(inpL, "inpFF_ffn_out", il);
     }
 
     cur = llm_build_norm(ctx0, inpL,
@@ -4717,7 +4718,7 @@ static struct ggml_cgraph * llm_build_mpt(
 
         // Add the input
         cur = ggml_add(ctx0, cur, inpL);
-        cb(cur, "inpL_+_result_wo", il);
+        cb(cur, "inpL_kqv_out", il);
 
         struct ggml_tensor * attn_out = cur;
 
@@ -4734,11 +4735,11 @@ static struct ggml_cgraph * llm_build_mpt(
                     NULL,                      NULL,
                     model.layers[il].ffn_down, NULL,
                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
-            cb(cur, "ffn_result", il);
+            cb(cur, "ffn_out", il);
         }
 
         cur = ggml_add(ctx0, cur, attn_out);
-        cb(cur, "inpL_+_inpFF_+_result_w2", il);
+        cb(cur, "inpL_inpFF_ffn_out", il);
 
         // input for next layer
         inpL = cur;
@@ -4777,6 +4778,7 @@ enum llm_offload_func_e {
     OFFLOAD_FUNC_OUT,
 };
 
+// TODO: will be removed with backend v2
 struct llm_offload_trie {
     struct node {
         ~node() {
@@ -4850,10 +4852,12 @@ struct llm_offload_trie {
     node * root = nullptr;
 };
 
+// TODO: will be removed with backend v2
 static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = {
   //{ "inp_tokens",               OFFLOAD_FUNC_NR  }, // TODO: missing K-quants get_rows kernel
   //{ "inp_embd",                 OFFLOAD_FUNC_NR  }, // TODO: missing K-quants get_rows kernel
     { "inp_pos",                  OFFLOAD_FUNC_NR  },
+    { "pos_embd",                 OFFLOAD_FUNC_NR  },
 
     { "KQ_mask",                  OFFLOAD_FUNC_NR  },
     { "K_shift",                  OFFLOAD_FUNC_NR  },
@@ -4902,7 +4906,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "kqv_wo",                   OFFLOAD_FUNC_V   },
     { "kqv_out",                  OFFLOAD_FUNC_V   },
 
-    { "inpL_+_result_wo",         OFFLOAD_FUNC     },
+    { "inpL_kqv_out",             OFFLOAD_FUNC     },
     { "inpFF",                    OFFLOAD_FUNC     },
 
     { "ffn_norm",                 OFFLOAD_FUNC     },
@@ -4914,15 +4918,15 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "ffn_gate_par",             OFFLOAD_FUNC     },
     { "ffn_down",                 OFFLOAD_FUNC     },
     { "ffn_down_b",               OFFLOAD_FUNC     },
-    { "ffn_result",               OFFLOAD_FUNC     },
+    { "ffn_out",                  OFFLOAD_FUNC     },
 
     { "ffn_silu",                 OFFLOAD_FUNC     },
     { "ffn_gelu",                 OFFLOAD_FUNC     },
     { "ffn_relu",                 OFFLOAD_FUNC     },
     { "ffn_sqr(relu)",            OFFLOAD_FUNC     },
 
-    { "inpFF_+_result_w2",        OFFLOAD_FUNC     },
-    { "inpL_+_inpFF_+_result_w2", OFFLOAD_FUNC     },
+    { "inpFF_ffn_out",            OFFLOAD_FUNC     },
+    { "inpL_inpFF_ffn_out",       OFFLOAD_FUNC     },
 
     { "result_norm",              OFFLOAD_FUNC_EMB },
     { "result_output",            OFFLOAD_FUNC_OUT },
@@ -4946,6 +4950,14 @@ static struct ggml_cgraph * llama_build_graph(
     bool alloc_inp_KQ_mask = false;
     bool alloc_inp_K_shift = false;
 
+#ifdef GGML_USE_CUBLAS
+    const bool do_offload = true;
+#else
+    const bool do_offload = true; // TODO: set to false after finishing refactoring
+#endif
+
+    int n_non_view = 0; // number of non-view tensors that have been processed by the callback
+
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
         if (il >= 0) {
@@ -5053,23 +5065,23 @@ static struct ggml_cgraph * llama_build_graph(
             alloc_inp_K_shift = true;
         }
 
-        //
-        // offload layers
-        //
-        // TODO: this code will be obsoleted with backend v2
-
-#ifdef GGML_USE_CUBLAS
-        const bool do_offload = true;
-#else
-        const bool do_offload = true; // TODO: set to false after finishing refactoring
-#endif
-
-        if (!do_offload) {
+        // view tensors are not processed further
+        if (cur->view_src != nullptr) {
             return;
         }
 
-        // view tensors are not offloaded
-        if (cur->view_src != nullptr) {
+        if (cur->op != GGML_OP_NONE) {
+            n_non_view++;
+        }
+
+        //
+        // offload layers
+        //
+        // TODO: will be removed with backend v2
+
+//#define LLAMA_OFFLOAD_DEBUG
+
+        if (!do_offload) {
             return;
         }
 
@@ -5103,11 +5115,13 @@ static struct ggml_cgraph * llama_build_graph(
         llm_offload_func_e func_e = k_offload_func_trie.find(name);
 
         if (func_e == OFFLOAD_FUNC_NOP) {
+#ifdef LLAMA_OFFLOAD_DEBUG
             // if a tensor hasn't been offloaded, we warn the user
             if (worst_case) {
                 LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
                         cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
             }
+#endif
 
             return;
         }
@@ -5170,9 +5184,11 @@ static struct ggml_cgraph * llama_build_graph(
         // apply offload function to the tensor
         func(cur);
 
+#ifdef LLAMA_OFFLOAD_DEBUG
         if (worst_case) {
             LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str());
         }
+#endif
     };
 
     struct ggml_cgraph * result = NULL;
@@ -5214,6 +5230,29 @@ static struct ggml_cgraph * llama_build_graph(
             GGML_ASSERT(false);
     }
 
+    if (worst_case) {
+        int n_non_view_total = 0;
+
+        for (int i = 0; i < result->n_nodes; ++i) {
+            if (result->nodes[i]->view_src == nullptr) {
+                n_non_view_total++;
+            }
+        }
+
+        LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total);
+
+#ifdef LLAMA_OFFLOAD_DEBUG
+        if (n_non_view != n_non_view_total) {
+            LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
+            LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__);
+            LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__);
+            LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__);
+            LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__);
+            LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
+        }
+#endif
+    }
+
     return result;
 }