llama : use n_embd_v_gqa and n_embd_head_v instead of n_embd_k_gqa and n_embd_head_k when making a view of cached value vectors.

Stanisław Szymczyk 2024-05-17 09:25:36 +02:00
parent f15e933fb1
commit 886f89da69

@@ -6622,6 +6622,7 @@ static struct ggml_tensor * llm_build_kqv(
     const int64_t n_embd_head_k = hparams.n_embd_head_k;
     const int64_t n_embd_k_gqa  = hparams.n_embd_k_gqa();
     const int64_t n_embd_head_v = hparams.n_embd_head_v;
+    const int64_t n_embd_v_gqa  = hparams.n_embd_v_gqa();
 
     struct ggml_tensor * q = ggml_permute(ctx, q_cur, 0, 2, 1, 3);
     cb(q, "q", il);
@@ -6644,8 +6645,8 @@ static struct ggml_tensor * llm_build_kqv(
         struct ggml_tensor * v =
             ggml_view_3d(ctx, kv.v_l[il],
                     n_embd_head_v, n_kv, n_head_kv,
-                    ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa),
+                    ggml_row_size(kv.v_l[il]->type, n_embd_head_v),
                     0);
         cb(v, "v", il);