From f40cd2073a27390f09127b63a81abf5bcf433ff8 Mon Sep 17 00:00:00 2001
From: nopperl <54780682+nopperl@users.noreply.github.com>
Date: Thu, 18 Jul 2024 09:59:30 +0200
Subject: [PATCH] adapt to new lora implementation

---
 src/llama.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 2a1556ea8..b65d589d1 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -13848,13 +13848,13 @@ struct llm_build_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
 
-                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
 
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
 
                 if (model.layers[il].attn_q_norm) {
@@ -13962,7 +13962,7 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);
 
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
         cb(cur, "result_output_with_img_logits", -1);
 
         // TODO: this suppresses the output of image tokens, which is required to enable text-only outputs.
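
For context: llm_build_lora_mm comes from the reworked LoRA adapter support in llama.cpp and replaces plain ggml_mul_mat calls in graph-building code. Conceptually it performs the base matrix multiplication and then accumulates the scaled low-rank adapter delta on top. Below is a minimal sketch of that idea, assuming a single adapter; the names lora_a, lora_b, and scale are illustrative stand-ins, since the upstream helper instead looks adapters up per weight tensor through lctx.

#include "ggml.h"

// Sketch only: the real llm_build_lora_mm(lctx, ctx0, w, cur) iterates over
// the adapters registered in the llama_context and skips weights that have
// no LoRA data. lora_a/lora_b/scale here stand in for one such adapter.
static struct ggml_tensor * lora_mm_sketch(
        struct ggml_context * ctx0,
        struct ggml_tensor  * w,       // base weight, e.g. model.layers[il].wq
        struct ggml_tensor  * lora_a,  // low-rank factor A
        struct ggml_tensor  * lora_b,  // low-rank factor B
        struct ggml_tensor  * cur,     // current activations
        float                 scale) { // adapter scaling, e.g. alpha / rank
    // base projection: same as the old ggml_mul_mat(ctx0, w, cur)
    struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
    // LoRA delta: B * (A * cur), scaled and added onto the base result
    struct ggml_tensor * ax  = ggml_mul_mat(ctx0, lora_a, cur);
    struct ggml_tensor * bax = ggml_mul_mat(ctx0, lora_b, ax);
    return ggml_add(ctx0, res, ggml_scale(ctx0, bax, scale));
}

Because the helper returns a tensor with the same shape as the plain matmul, each call site in the diff above (Qcur/Kcur/Vcur and the lm_head projection) picks up adapter weights without any further changes to the surrounding graph code.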