From 66a6dbf70238da2f48581dd473f8123680eba4a2 Mon Sep 17 00:00:00 2001
From: akawrykow
Date: Tue, 29 Aug 2023 18:06:35 -0700
Subject: [PATCH] wqkv hack

---
 llama.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 44faa8f4c..bee75473d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2008,8 +2008,15 @@ static void llm_load_tensors(
             }
         }

-        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-        layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+        // TODO: For 1B, we need e.g {2048, 6144} and the usual calculation gives us e.g {2048, 2176}.
+        // I think this is because we skip the QKV reshaping in the conversion script (maybe because parallel attention is disabled?)
+        if (model.type == MODEL_1B) {
+            layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
+        } else {
+            layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+        }
+
+        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);

         layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
         layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff},  backend_split);
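
Note on the shapes quoted in the TODO comment: the following is a minimal sketch of the arithmetic, not part of the patch. It assumes n_embd = 2048 (the hidden size quoted above), a per-head dimension of 64, and n_head_kv = 1 in the converted metadata; the head values are assumptions chosen only to reproduce the quoted numbers. Under those assumptions, the generic GQA/MQA formula n_embd + 2*n_embd_gqa gives 2176, while a fused QKV weight stored at full width (three n_embd-wide projections, i.e. without the conversion-time reshape) gives 3*n_embd = 6144, which is why the hack special-cases MODEL_1B.

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Assumed values, chosen to reproduce the numbers in the TODO comment.
        const int64_t n_embd      = 2048; // hidden size quoted in the patch
        const int64_t n_embd_head = 64;   // assumed per-head dimension
        const int64_t n_head_kv   = 1;    // assumed KV head count in the converted metadata
        const int64_t n_embd_gqa  = n_embd_head * n_head_kv;

        // Generic GQA/MQA layout: Q is n_embd wide, K and V are n_embd_gqa wide each.
        const int64_t qkv_gqa  = n_embd + 2 * n_embd_gqa; // 2048 + 2*64 = 2176

        // Fused QKV stored at full width (no reshape): three n_embd-wide projections.
        const int64_t qkv_full = 3 * n_embd;              // 3*2048 = 6144

        printf("usual calculation: {%lld, %lld}\n", (long long) n_embd, (long long) qkv_gqa);
        printf("1B checkpoint:     {%lld, %lld}\n", (long long) n_embd, (long long) qkv_full);
        return 0;
    }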