From 66a6dbf70238da2f48581dd473f8123680eba4a2 Mon Sep 17 00:00:00 2001
From: akawrykow
Date: Tue, 29 Aug 2023 18:06:35 -0700
Subject: [PATCH] wqkv hack

---
 llama.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 44faa8f4c..bee75473d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2008,8 +2008,15 @@ static void llm_load_tensors(
             }
         }

-        layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-        layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+        // TODO: For 1B, we need e.g {2048, 6144} and the usual calculation gives us e.g {2048, 2176}.
+        // I think this is because we skip the QKV reshaping in the conversion script (maybe because parallel attention is disabled?)
+        if (model.type == MODEL_1B) {
+            layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
+        } else {
+            layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
+        }
+
+        layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);

         layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
         layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff},  backend_split);
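
Note on the shapes quoted in the TODO comment: the following is a minimal sketch of the arithmetic, not part of the patch. It assumes n_embd = 2048 (the hidden size quoted above), a per-head dimension of 64, and n_head_kv = 1 in the converted metadata; the head values are assumptions chosen only to reproduce the quoted numbers. Under those assumptions, the generic GQA/MQA formula n_embd + 2*n_embd_gqa gives 2176, while a fused QKV weight stored at full width (three n_embd-wide projections, i.e. without the conversion-time reshape) gives 3*n_embd = 6144, which is why the hack special-cases MODEL_1B.

    #include <cstdint>
    #include <cstdio>

    int main() {
        // Assumed values, chosen to reproduce the numbers in the TODO comment.
        const int64_t n_embd      = 2048; // hidden size quoted in the patch
        const int64_t n_embd_head = 64;   // assumed per-head dimension
        const int64_t n_head_kv   = 1;    // assumed KV head count in the converted metadata
        const int64_t n_embd_gqa  = n_embd_head * n_head_kv;

        // Generic GQA/MQA layout: Q is n_embd wide, K and V are n_embd_gqa wide each.
        const int64_t qkv_gqa  = n_embd + 2 * n_embd_gqa; // 2048 + 2*64 = 2176

        // Fused QKV stored at full width (no reshape): three n_embd-wide projections.
        const int64_t qkv_full = 3 * n_embd;              // 3*2048 = 6144

        printf("usual calculation: {%lld, %lld}\n", (long long) n_embd, (long long) qkv_gqa);
        printf("1B checkpoint:     {%lld, %lld}\n", (long long) n_embd, (long long) qkv_full);
        return 0;
    }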