From 32661ac8b42fe23bbdc1418914793ab5c535a6ed Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 24 Apr 2024 09:39:22 +0300 Subject: [PATCH] llama : minor / style --- llama.cpp | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/llama.cpp b/llama.cpp index 698ad2367..a4dd00500 100644 --- a/llama.cpp +++ b/llama.cpp @@ -9028,27 +9028,25 @@ struct llm_build_context { return gf; } - struct ggml_cgraph* build_phi3() { - - struct ggml_cgraph* gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + struct ggml_cgraph * build_phi3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - struct ggml_tensor* cur; - struct ggml_tensor* inpL; + struct ggml_tensor * cur; + struct ggml_tensor * inpL; inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); // inp_pos - contains the positions - struct ggml_tensor* inp_pos = build_inp_pos(); + struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor* KQ_mask = build_inp_KQ_mask(); + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { - auto residual = inpL; // self-attention @@ -9059,15 +9057,15 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(attn_norm_output, "attn_norm", il); - struct ggml_tensor* Qcur = nullptr; - struct ggml_tensor* Kcur = nullptr; - struct ggml_tensor* Vcur = nullptr; + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; if (model.layers[il].wqkv) { cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); cb(cur, "wqkv", il); - Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); + Qcur = ggml_cont(ctx0, 
ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); } @@ -9081,7 +9079,7 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_custom( @@ -9120,6 +9118,8 @@ struct llm_build_context { cb(cur, "ffn_norm", il); // FF + // special-case: the up and gate tensors are merged into a single tensor + // TODO: support into llm_build_ffn { struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur); cb(up, "ffn_up", il); @@ -9152,9 +9152,6 @@ struct llm_build_context { cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); - //cur = ggml_add(ctx0, cur, NULL); - //cb(cur, "result_output", -1); - ggml_build_forward_expand(gf, cur); return gf;