update branch notes

parent 284e665a4b
commit 8f0272c9d7

2 changed files with 54 additions and 8 deletions
````diff
@@ -32,13 +32,14 @@ Run main with base model and lora adapter to hot-swap
 ```bash
 ./main -m ./models/open-llama/ggml-model-f16.gguf \
     --hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
-    -ngl 0 \
+    -ngl 99 \
+    -n 128
+```
+```bash
+./main -m ./models/open-llama/ggml-model-f16.gguf \
+    -ngl 99 \
     -n 128
 ```
 
-Working but `ggml_metal_get_buffer: error: tensor 'blk.16.attn_v.weight.loraB' buffer is nil`
-
-With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (as in `lora_mul_mat`), but the lora tensors are not moved to the gpu buffer of the base tensors.
-
 
 # Logic
````
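The note removed above blames the Metal failure on the adapter tensors living in a different backend than the offloaded base weights: with `-ngl > 0`, `lora_mul_mat` ends up mixing a GPU-resident tensor (`cur`) with host-resident `loraA`/`loraB`. Below is a minimal sketch of the direction such a fix would take, assuming the stock ggml-backend allocation API from `ggml-alloc.h`; the function and variable names are illustrative and are not the branch's actual loader code.

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch: allocate every adapter tensor from the same buffer type as the base
// weights it will be multiplied with, so that with -ngl > 0 the loraA/loraB
// tensors land in the Metal buffer instead of host memory.
//   ctx_lora  - ggml context (created with no_alloc = true) that already holds
//               the loraA/loraB tensor metadata (hypothetical name)
//   base_buft - buffer type the offloaded base-model tensors were allocated from
static ggml_backend_buffer_t alloc_lora_like_base(struct ggml_context * ctx_lora,
                                                  ggml_backend_buffer_type_t base_buft) {
    return ggml_backend_alloc_ctx_tensors_from_buft(ctx_lora, base_buft);
}
```

Where the branch actually does this (in the `--hot-lora` loader or elsewhere) is not visible in this diff.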
````diff
@@ -299,4 +300,46 @@ int main() {
     //
 
 }
 ```
+
+
+
+```bash
+# Convert base model to gguf
+python3 convert-hf-to-gguf.py models/open-llama/ && \
+# Quantize base model
+./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q4.gguf Q4_K && \
+# Obtain Lora adapter
+./finetune --model-base models/open-llama/ggml-model-q4.gguf \
+    --checkpoint-in models/open-llama/chk-lora-ggml-model-q4-hot-lora-LATEST.gguf \
+    --checkpoint-out models/open-llama/chk-lora-ggml-model-q4-hot-lora-ITERATION.gguf \
+    --lora-out models/open-llama/lora-ggml-model-q4-hot-lora-ITERATION.bin \
+    --train-data "data/hot-lora.txt" \
+    --save-every 1 \
+    --threads 1 \
+    --adam-iter 1 \
+    --batch 1 \
+    --ctx 16 \
+    --use-checkpointing
+```
+
+</details>
+
+## 1. Run main with adapter
+
+- Run main with base model and lora adapter to hot-swap
+```bash
+./main -m ./models/open-llama/ggml-model-q4.gguf \
+    --hot-lora models/open-llama/lora-ggml-model-q4-hot-lora-LATEST.bin \
+    -ngl 99 \
+    -n 128
+```
+
+- Do not pass the flag `--hot-lora` and the adapter is ignored:
+```bash
+./main -m ./models/open-llama/ggml-model-q4.gguf \
+    -ngl 99 \
+    -n 128
+```
+
+make clean && make -j 8 LLAMA_DEBUG=1
````
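The added notes state that without `--hot-lora` the adapter is simply ignored. A quick way to check that claim (my addition, not part of the notes; `-p`, `-s` and `-n` are ordinary `main` options, and the file names are the ones used above):

```bash
# Compare generations with and without the adapter, same prompt and seed.
# If the two outputs are identical, the adapter most likely had no effect.
PROMPT="Once upon a time"
./main -m ./models/open-llama/ggml-model-q4.gguf \
    --hot-lora models/open-llama/lora-ggml-model-q4-hot-lora-LATEST.bin \
    -ngl 99 -s 42 -n 64 -p "$PROMPT" > out-with-lora.txt
./main -m ./models/open-llama/ggml-model-q4.gguf \
    -ngl 99 -s 42 -n 64 -p "$PROMPT" > out-without-lora.txt
diff out-with-lora.txt out-without-lora.txt && echo "identical" || echo "outputs differ"
```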
````diff
@@ -9731,8 +9731,11 @@ struct llm_build_context {
         ggml_tensor * loraB = it->second.loraB;
 
         ggml_tensor * t_lora = ggml_mul_mat(ctx0,
-            ggml_mul_mat(ctx0, loraA, loraB),
-            cur
+            loraA,
+            ggml_mul_mat(ctx0,
+                ggml_transpose(ctx0, loraB),
+                cur
+            )
         );
 
         if (lctx.lora_scale != 1.0f) {
````
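As a side note on the reordering above (not part of the diff): a LoRA adapter stores a low-rank delta, and the two evaluation orders are mathematically the same but differ a lot in cost. How `loraA`/`loraB` and `ggml_mul_mat`'s transposition convention map onto $A$ and $B$ is defined by the branch's loader, so the identity below is only the general argument for pushing the activations through the two thin factors instead of materializing the full delta:

$$
\Delta W = B A, \qquad B \in \mathbb{R}^{d_{\text{out}} \times r},\quad A \in \mathbb{R}^{r \times d_{\text{in}}},\quad r \ll d
$$

$$
(\Delta W)\,x \;=\; (B A)\,x \;=\; B\,(A\,x)
$$

Forming $BA$ costs $O(d_{\text{out}} d_{\text{in}} r)$ and yields a dense $d_{\text{out}} \times d_{\text{in}}$ matrix, while the right-hand form needs only two thin matmuls, roughly $O(r\,d_{\text{in}} + r\,d_{\text{out}})$ per token.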