From 26df64ad04e377dc24427da0e178cdc67cd86e49 Mon Sep 17 00:00:00 2001
From: ltoniazzi
Date: Fri, 21 Jun 2024 17:28:14 +0100
Subject: [PATCH] Fix passing param

---
 BRANCH_SETUP.md   | 10 +++++----
 common/common.cpp |  4 ++++
 llama.cpp         | 55 +++++++++++++++++++++++++----------------------
 llama.h           |  1 +
 4 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/BRANCH_SETUP.md b/BRANCH_SETUP.md
index dac58d0d2..d9f7405b5 100644
--- a/BRANCH_SETUP.md
+++ b/BRANCH_SETUP.md
@@ -30,19 +30,21 @@
 Run main with base model and lora adapter to hot-swap
 
 ```bash
-./main ./models/open-llama/ggml-model-f16.gguf \
---hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \
+./main -m ./models/open-llama/ggml-model-f16.gguf \
+--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
 -ngl 0 \
 -n 128
 ```
 
-With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors.
+With `-ngl > 0` the code breaks, probably because the LoRA tensors try to interact with the base tensors (as in `lora_mul_mat`) but are not moved to the GPU buffer that holds the base tensors.
 
 # Logic
+
 
 # Current status
 
-- Only ony Lora adapter can be passed.
+- Only one LoRA adapter can be passed.
+- The adapter is applied only to the Q, K, V matrices to keep the code contained (the finetuning trained LoRA tensors for all linear layers).
 - GPU not supported
\ No newline at end of file
diff --git a/common/common.cpp b/common/common.cpp
index 494258db0..21003343e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2443,6 +2443,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_ubatch          = params.n_ubatch;
     cparams.n_threads         = params.n_threads;
     cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    const char* c_string = params.hot_lora.c_str();
+    strncpy(cparams.hot_lora, c_string, sizeof(cparams.hot_lora) - 1);
+    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0'; // Ensure null-termination
+
     cparams.seed              = params.seed;
     cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
diff --git a/llama.cpp b/llama.cpp
index 58b6ff864..467ab0f29 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -145,7 +145,7 @@ struct lora_info {
     std::string filename;
     float scale;
 };
-// TODO lora_data should maybe sub lora_weights in llama.cpp
+// TODO lora_data should maybe sub lora_weights
 struct lora_data {
     struct lora_info info;
     std::vector<uint8_t> data;
@@ -2502,7 +2502,7 @@ struct llama_context {
     llama_cparams cparams;
 
     bool lora_loaded = false;
-    std::map<std::string, lora_weights> lora_weights_map;
+    std::map<std::string, lora_weights> lora_weights_map; // only one LoRA adapter at the moment
     lora_data llora_data;
     float lora_scale = 1.0f;
 
@@ -16109,6 +16109,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_seq_max                  =*/ 1,
         /*.n_threads                  =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch            =*/ GGML_DEFAULT_N_THREADS,
+        /*.hot_lora                   =*/ "",
         /*.rope_scaling_type          =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type               =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.rope_freq_base             =*/ 0.0f,
@@ -16321,33 +16322,35 @@ struct llama_context * llama_new_context_with_model(
     /// LORA
     struct export_lora_params * lora_params = new struct export_lora_params;
     struct lora_info lora;
-    lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin";
-    lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras
-    lora_params->lora.push_back(lora);
-    // load all loras
-    std::vector<struct lora_data *> loras;
-    for (size_t i = 0; i < lora_params->lora.size(); ++i) {
-        struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
-        if (llora_data != NULL) {
-            loras.push_back(llora_data);
+    // lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin";
+    lora.filename = params.hot_lora;
+    if (strlen(params.hot_lora) > 0) {
+
+        lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras?
+        lora_params->lora.push_back(lora);
+        // load all loras
+        std::vector<struct lora_data *> loras;
+        for (size_t i = 0; i < lora_params->lora.size(); ++i) {
+            struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
+            if (llora_data != NULL) {
+                loras.push_back(llora_data);
+            }
+        }
+        if (loras.size() == 0) {
+            fprintf(stderr, "warning: no lora adapters will be applied.\n");
+        }
+        // Assign data
+        ctx->llora_data = *loras[0];
+
+        // build the map?
+        ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
+        std::vector<std::string> keys;
+        for (const auto& pair : ctx->lora_weights_map) {
+            keys.push_back(pair.first);
         }
     }
-    if (loras.size() == 0) {
-        fprintf(stderr, "warning: no lora adapters will be applied.\n");
-    }
-    // Assign data
-    ctx->llora_data = *loras[0];
-    // build the map?
-    ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
-    std::vector<std::string> keys;
-    for (const auto& pair : ctx->lora_weights_map) {
-        keys.push_back(pair.first);
-    }
-
-
-
-    /// END LORA
+    /// LORA
 
 
     const auto & hparams = model->hparams;
     auto       & cparams = ctx->cparams;
diff --git a/llama.h b/llama.h
index 85a53f1e6..d593eb45c 100644
--- a/llama.h
+++ b/llama.h
@@ -292,6 +292,7 @@ extern "C" {
         uint32_t n_seq_max;       // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        char hot_lora[256];       // path to the hot lora file
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
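
For reference, a minimal usage sketch (not part of the patch) of how a caller might exercise the new `hot_lora` field directly through the public API, mirroring what `llama_context_params_from_gpt_params()` does in the hunk above. The model and adapter paths are placeholders, and GPU offload is left disabled since this branch does not support it yet:

```cpp
// Sketch only: assumes the hot-lora branch above, placeholder file paths,
// and the standard llama.cpp C API entry points of this revision.
#include "llama.h"

#include <cstring>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0; // GPU offload is not supported by this patch

    llama_model * model = llama_load_model_from_file(
        "./models/open-llama/ggml-model-f16.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();

    // Copy the adapter path into the fixed-size hot_lora buffer and
    // null-terminate it, as llama_context_params_from_gpt_params() now does.
    const char * adapter = "models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin";
    strncpy(cparams.hot_lora, adapter, sizeof(cparams.hot_lora) - 1);
    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0';

    // llama_new_context_with_model() loads the adapter when hot_lora is
    // non-empty and builds the tensor-name -> lora_weights map.
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... run generation as usual; the adapter is applied to the Q, K, V mat-muls ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```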