From 8f0272c9d716a8938b214c6ac6d68533ab8066af Mon Sep 17 00:00:00 2001
From: Lorenzo Toniazzi
Date: Sat, 6 Jul 2024 21:19:52 +0100
Subject: [PATCH] update branch notes

---
 BRANCH_SETUP.md => _BRANCH_SETUP.md | 55 +++++++++++++++++++++++++----
 llama.cpp                           |  7 ++--
 2 files changed, 54 insertions(+), 8 deletions(-)
 rename BRANCH_SETUP.md => _BRANCH_SETUP.md (89%)

diff --git a/BRANCH_SETUP.md b/_BRANCH_SETUP.md
similarity index 89%
rename from BRANCH_SETUP.md
rename to _BRANCH_SETUP.md
index 0b6cdac74..b2d5ab6af 100644
--- a/BRANCH_SETUP.md
+++ b/_BRANCH_SETUP.md
@@ -32,13 +32,14 @@ Run main with base model and lora adapter to hot-swap
 ```bash
 ./main -m ./models/open-llama/ggml-model-f16.gguf \
 --hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
--ngl 0 \
+-ngl 99 \
+-n 128
+```
+```bash
+./main -m ./models/open-llama/ggml-model-f16.gguf \
+-ngl 99 \
 -n 128
 ```
-
-Working but `ggml_metal_get_buffer: error: tensor 'blk.16.attn_v.weight.loraB' buffer is nil`
-
-With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (as in `lora_mul_mat`), but the lora tensors are not moved to the gpu buffer of the base tensors.
 
 # Logic
 
@@ -299,4 +300,46 @@ int main() {
 
 // }
 
-  ```
\ No newline at end of file
+  ```
+
+
+
+  ```bash
+  # Convert base model to gguf
+  python3 convert-hf-to-gguf.py models/open-llama/ && \
+  # Quantize base model
+  ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q4.gguf Q4_K && \
+  # Obtain LoRA adapter
+  ./finetune --model-base models/open-llama/ggml-model-q4.gguf \
+  --checkpoint-in models/open-llama/chk-lora-ggml-model-q4-hot-lora-LATEST.gguf \
+  --checkpoint-out models/open-llama/chk-lora-ggml-model-q4-hot-lora-ITERATION.gguf \
+  --lora-out models/open-llama/lora-ggml-model-q4-hot-lora-ITERATION.bin \
+  --train-data "data/hot-lora.txt" \
+  --save-every 1 \
+  --threads 1 \
+  --adam-iter 1 \
+  --batch 1 \
+  --ctx 16 \
+  --use-checkpointing
+  ```
+
+
+
+## 1. Run main with adapter
+
+- Run main with the base model and LoRA adapter to hot-swap:
+  ```bash
+  ./main -m ./models/open-llama/ggml-model-q4.gguf \
+  --hot-lora models/open-llama/lora-ggml-model-q4-hot-lora-LATEST.bin \
+  -ngl 99 \
+  -n 128
+  ```
+
+- Without the `--hot-lora` flag, the adapter is ignored:
+  ```bash
+  ./main -m ./models/open-llama/ggml-model-q4.gguf \
+  -ngl 99 \
+  -n 128
+  ```
+
+Debug build: `make clean && make -j 8 LLAMA_DEBUG=1`
\ No newline at end of file
diff --git a/llama.cpp b/llama.cpp
index eeca784b9..df098b652 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9731,8 +9731,11 @@ struct llm_build_context {
                 ggml_tensor * loraB = it->second.loraB;
 
                 ggml_tensor * t_lora = ggml_mul_mat(ctx0,
-                        ggml_mul_mat(ctx0, loraA, loraB),
-                        cur
+                        loraA,
+                        ggml_mul_mat(ctx0,
+                            ggml_transpose(ctx0, loraB),
+                            cur
+                        )
                 );
 
                 if (lctx.lora_scale != 1.0f) {
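
Note on the `llama.cpp` hunk above: the change appears to stop materializing the product of the two adapter matrices (`ggml_mul_mat(ctx0, loraA, loraB)`) and instead applies the low-rank factors to the activations one at a time, as `loraA * (loraB^T * cur)`. Below is a minimal standalone sketch of why that ordering is cheaper when the LoRA rank is small; it is plain C++, not llama.cpp code, and the sizes `d`, `r`, and `n` are hypothetical placeholders, not values from the patch.

```cpp
// Standalone sketch, not llama.cpp code: compares the cost of applying a
// LoRA delta by first materializing loraA * loraB^T (a d x d matrix)
// versus applying the two low-rank factors to the activations directly.
// d, r, and n are hypothetical placeholder sizes.
#include <cstdio>

int main() {
    const long long d = 4096; // hidden size (assumed)
    const long long r = 8;    // LoRA rank (assumed)
    const long long n = 32;   // tokens in the batch (assumed)

    // Materialize first, as in mul_mat(mul_mat(loraA, loraB), cur):
    // d*d*r MACs to build the d x d delta, then d*d*n MACs to apply it.
    const long long macs_materialized = d * d * r + d * d * n;

    // Factor first, as in mul_mat(loraA, mul_mat(transpose(loraB), cur)):
    // r*d*n MACs for loraB^T * cur, then d*r*n MACs to multiply by loraA,
    // and no d x d temporary is ever allocated.
    const long long macs_factored = r * d * n + d * r * n;

    printf("materialized: %lld MACs\n", macs_materialized);
    printf("factored:     %lld MACs\n", macs_factored);
    return 0;
}
```

For a small rank `r`, the factored order needs orders of magnitude fewer multiply-accumulates and never allocates a `d x d` temporary in the compute graph; the `ggml_transpose` presumably puts `loraB` into the layout `ggml_mul_mat` expects for the first product.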