From 8f0272c9d716a8938b214c6ac6d68533ab8066af Mon Sep 17 00:00:00 2001
From: Lorenzo Toniazzi
Date: Sat, 6 Jul 2024 21:19:52 +0100
Subject: [PATCH] update branch notes

---
 BRANCH_SETUP.md => _BRANCH_SETUP.md | 55 +++++++++++++++++++++++++----
 llama.cpp                           |  7 ++--
 2 files changed, 54 insertions(+), 8 deletions(-)
 rename BRANCH_SETUP.md => _BRANCH_SETUP.md (89%)

diff --git a/BRANCH_SETUP.md b/_BRANCH_SETUP.md
similarity index 89%
rename from BRANCH_SETUP.md
rename to _BRANCH_SETUP.md
index 0b6cdac74..b2d5ab6af 100644
--- a/BRANCH_SETUP.md
+++ b/_BRANCH_SETUP.md
@@ -32,13 +32,14 @@ Run main with base model and lora adapter to hot-swap
 ```bash
 ./main -m ./models/open-llama/ggml-model-f16.gguf \
 --hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
--ngl 0 \
+-ngl 99 \
+-n 128
+```
+```bash
+./main -m ./models/open-llama/ggml-model-f16.gguf \
+-ngl 99 \
 -n 128
 ```
-
-Working but `ggml_metal_get_buffer: error: tensor 'blk.16.attn_v.weight.loraB' buffer is nil`
-
-With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (as in `lora_mul_mat`), but the lora tensors are not moved to the gpu buffer of the base tensors.
 
 # Logic
 
@@ -299,4 +300,46 @@ int main() {
 
 // }
 
-  ```
\ No newline at end of file
+  ```
+
+
+
+  ```bash
+  # Convert base model to gguf
+  python3 convert-hf-to-gguf.py models/open-llama/ && \
+  # Quantize base model
+  ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q4.gguf Q4_K && \
+  # Obtain LoRA adapter
+  ./finetune --model-base models/open-llama/ggml-model-q4.gguf \
+  --checkpoint-in models/open-llama/chk-lora-ggml-model-q4-hot-lora-LATEST.gguf \
+  --checkpoint-out models/open-llama/chk-lora-ggml-model-q4-hot-lora-ITERATION.gguf \
+  --lora-out models/open-llama/lora-ggml-model-q4-hot-lora-ITERATION.bin \
+  --train-data "data/hot-lora.txt" \
+  --save-every 1 \
+  --threads 1 \
+  --adam-iter 1 \
+  --batch 1 \
+  --ctx 16 \
+  --use-checkpointing
+  ```
+
+
+
+## 1. Run main with adapter
+
+- Run main with the base model and LoRA adapter to hot-swap:
+  ```bash
+  ./main -m ./models/open-llama/ggml-model-q4.gguf \
+  --hot-lora models/open-llama/lora-ggml-model-q4-hot-lora-LATEST.bin \
+  -ngl 99 \
+  -n 128
+  ```
+
+- Without the `--hot-lora` flag, the adapter is ignored:
+  ```bash
+  ./main -m ./models/open-llama/ggml-model-q4.gguf \
+  -ngl 99 \
+  -n 128
+  ```
+
+Debug build: `make clean && make -j 8 LLAMA_DEBUG=1`
\ No newline at end of file
diff --git a/llama.cpp b/llama.cpp
index eeca784b9..df098b652 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9731,8 +9731,11 @@ struct llm_build_context {
                 ggml_tensor * loraB = it->second.loraB;
 
                 ggml_tensor * t_lora = ggml_mul_mat(ctx0,
-                        ggml_mul_mat(ctx0, loraA, loraB),
-                        cur
+                        loraA,
+                        ggml_mul_mat(ctx0,
+                            ggml_transpose(ctx0, loraB),
+                            cur
+                        )
                 );
 
                 if (lctx.lora_scale != 1.0f) {
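
Note on the `llama.cpp` hunk above: the change appears to stop materializing the product of the two adapter matrices (`ggml_mul_mat(ctx0, loraA, loraB)`) and instead applies the low-rank factors to the activations one at a time, as `loraA * (loraB^T * cur)`. Below is a minimal standalone sketch of why that ordering is cheaper when the LoRA rank is small; it is plain C++, not llama.cpp code, and the sizes `d`, `r`, and `n` are hypothetical placeholders, not values from the patch.

```cpp
// Standalone sketch, not llama.cpp code: compares the cost of applying a
// LoRA delta by first materializing loraA * loraB^T (a d x d matrix)
// versus applying the two low-rank factors to the activations directly.
// d, r, and n are hypothetical placeholder sizes.
#include <cstdio>

int main() {
    const long long d = 4096; // hidden size (assumed)
    const long long r = 8;    // LoRA rank (assumed)
    const long long n = 32;   // tokens in the batch (assumed)

    // Materialize first, as in mul_mat(mul_mat(loraA, loraB), cur):
    // d*d*r MACs to build the d x d delta, then d*d*n MACs to apply it.
    const long long macs_materialized = d * d * r + d * d * n;

    // Factor first, as in mul_mat(loraA, mul_mat(transpose(loraB), cur)):
    // r*d*n MACs for loraB^T * cur, then d*r*n MACs to multiply by loraA,
    // and no d x d temporary is ever allocated.
    const long long macs_factored = r * d * n + d * r * n;

    printf("materialized: %lld MACs\n", macs_materialized);
    printf("factored:     %lld MACs\n", macs_factored);
    return 0;
}
```

For a small rank `r`, the factored order needs orders of magnitude fewer multiply-accumulates and never allocates a `d x d` temporary in the compute graph; the `ggml_transpose` presumably puts `loraB` into the layout `ggml_mul_mat` expects for the first product.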