From a104abea4884ae6d84eb4f861c715ea94deca394 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Sun, 29 Oct 2023 21:24:25 +0200
Subject: [PATCH] llama : simplify falcon Q, K, V computation

---
 llama.cpp | 35 ++++++-----------------------------
 1 file changed, 6 insertions(+), 29 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index db5fe067c..9ab85c903 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3886,40 +3886,17 @@ static struct ggml_cgraph * llm_build_falcon(
             cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
             cb(cur, "wqkv", il);
 
-            // Note that the strides for Kcur, Vcur are set up so that the
-            // resulting views are misaligned with the tensor's storage
-            // (by applying the K/V offset we shift the tensor's original
-            // view to stick out behind the viewed QKV tensor's allocated
-            // memory, so to say). This is ok because no actual accesses
-            // happen to that out-of-range memory, but it can require some
-            // trickery when trying to accurately dump these views for
-            // debugging.
+            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
 
-            const size_t wsize = ggml_type_size(cur->type);
-
-            // TODO: these 2 ggml_conts are technically not needed, but we add them until CUDA support for
-            //       non-contiguous views is added for the rope operator
-            struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(
-                ctx0, cur, n_embd_head, n_head, n_tokens,
-                wsize * n_embd_head,
-                wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                0));
             cb(Qcur, "Qcur", il);
-
-            struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(
-                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
-                wsize * n_embd_head,
-                wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                wsize * n_embd_head * n_head));
             cb(Kcur, "Kcur", il);
-
-            struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(
-                ctx0, cur, n_embd_head, n_head_kv, n_tokens,
-                wsize * n_embd_head,
-                wsize * n_embd_head * (n_head + 2 * n_head_kv),
-                wsize * n_embd_head * (n_head + n_head_kv)));
             cb(Vcur, "Vcur", il);
 
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
             // using mode = 2 for neox mode
             Qcur = ggml_rope_custom(ctx0, Qcur, inp_pos, n_embd_head, 2, 0, freq_base, freq_scale);
             cb(Qcur, "Qcur", il);
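For readers tracing the change outside of llm_build_falcon, here is a minimal standalone sketch of the new slicing scheme. It is not part of the patch: the sizes are made up for illustration, and it assumes F32 activations, which is what the sizeof(float) factors in the view offsets rely on. It only builds the tensor nodes (no graph is evaluated), using the same ggml calls as the patch plus ggml_init/ggml_new_tensor_2d/ggml_free for setup.

// Standalone sketch (not part of the patch): slicing a fused wqkv output
// into Q, K, V with 2D views, then reshaping Q and K for the rope operator.
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx0 = ggml_init(params);

    // illustrative sizes, not Falcon's real ones
    const int64_t n_head      = 4;
    const int64_t n_head_kv   = 2;                       // GQA: fewer K/V heads
    const int64_t n_embd_head = 2;
    const int64_t n_embd      = n_embd_head * n_head;    // 8
    const int64_t n_embd_gqa  = n_embd_head * n_head_kv; // 4
    const int64_t n_tokens    = 3;

    // stand-in for the wqkv matmul output: each of the n_tokens rows holds
    // one token's concatenated [Q | K | V] features
    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,
            n_embd + 2*n_embd_gqa, n_tokens);

    // 2D views at byte offsets into each row; cur->nb[1] keeps the original
    // row stride, so the views step over the full fused row per token, and
    // ggml_cont makes each slice contiguous (required before reshaping)
    struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd,     n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
    struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
    struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));

    // contiguous slices reshape cleanly to the per-head 3D layout rope expects
    Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
    Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

    (void) Vcur; // V is left 2D here, as in the patch

    ggml_free(ctx0);
    return 0;
}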