Fix passing param

ltoniazzi 2024-06-21 17:28:14 +01:00
parent 12112bfa48
commit 26df64ad04
4 changed files with 40 additions and 30 deletions

@@ -30,19 +30,21 @@
Run `main` with the base model and a LoRA adapter to hot-swap:
```bash
./main ./models/open-llama/ggml-model-f16.gguf \
--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \
./main -m ./models/open-llama/ggml-model-f16.gguf \
--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
-ngl 0 \
-n 128
```
With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors.
With `ngl > 0` the code breaks, probably because the LoRA tensors try to interact with the base tensors (as in `lora_mul_mat`) but are not moved to the GPU buffer that holds the base tensors.
# Logic
# Current status
- Only ony Lora adapter can be passed.
- Only one Lora adapter can be passed.
- Applying the adapter only to the Q, K, V matrices, to keep the code contained (the fine-tuning trained LoRA tensors for all linear layers); see the sketch below
- GPU not supported
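
For context, here is a minimal sketch of the idea behind `lora_mul_mat`; its actual implementation is not part of this diff, so the name, signature, and shapes below are assumptions. The adapter contributes a low-rank update on top of the base projection, `y = W·x + scale·B·(A·x)`, which is why the adapter tensors must end up in the same backend buffer as the base weight for `-ngl > 0` to work.
```cpp
// Hypothetical sketch of what lora_mul_mat computes for a Q/K/V projection;
// the function name, signature, and tensor shapes are assumptions, not code
// from this commit.
#include "ggml.h"

static struct ggml_tensor * lora_mul_mat_sketch(
        struct ggml_context * ctx,
        struct ggml_tensor  * w,       // base weight (e.g. wq/wk/wv)
        struct ggml_tensor  * lora_a,  // LoRA A factor (down-projection)
        struct ggml_tensor  * lora_b,  // LoRA B factor (up-projection)
        struct ggml_tensor  * x,       // layer input
        float                 scale) { // adapter scaling (lora_scale)
    struct ggml_tensor * wx  = ggml_mul_mat(ctx, w,      x);   // W*x
    struct ggml_tensor * ax  = ggml_mul_mat(ctx, lora_a, x);   // A*x (rank r)
    struct ggml_tensor * bax = ggml_mul_mat(ctx, lora_b, ax);  // B*(A*x)
    // y = W*x + scale*B*(A*x); this is where the adapter tensors interact with
    // the base tensors, hence the -ngl > 0 break when they live in different buffers.
    return ggml_add(ctx, wx, ggml_scale(ctx, bax, scale));
}
```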

@@ -2443,6 +2443,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
cparams.n_ubatch = params.n_ubatch;
cparams.n_threads = params.n_threads;
cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
const char* c_string = params.hot_lora.c_str();
strncpy(cparams.hot_lora, c_string, sizeof(cparams.hot_lora) - 1);
cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0'; // Ensure null-termination
cparams.seed = params.seed;
cparams.logits_all = params.logits_all;
cparams.embeddings = params.embedding;

@@ -145,7 +145,7 @@ struct lora_info {
std::string filename;
float scale;
};
// TODO lora_data should maybe sub lora_weights in llama.cpp
// TODO lora_data should maybe sub lora_weights
struct lora_data {
struct lora_info info;
std::vector<uint8_t> data;
@@ -2502,7 +2502,7 @@ struct llama_context {
llama_cparams cparams;
bool lora_loaded = false;
std::map<std::string, lora_weights> lora_weights_map;
std::map<std::string, lora_weights> lora_weights_map; // only one LoRA adapter at the moment
lora_data llora_data;
float lora_scale = 1.0f;
@@ -16109,6 +16109,7 @@ struct llama_context_params llama_context_default_params() {
/*.n_seq_max =*/ 1,
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
/*.hot_lora =*/ "",
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
/*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
/*.rope_freq_base =*/ 0.0f,
@@ -16321,33 +16322,35 @@ struct llama_context * llama_new_context_with_model(
/// LORA
struct export_lora_params * lora_params = new struct export_lora_params;
struct lora_info lora;
lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin";
lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras
lora_params->lora.push_back(lora);
// load all loras
std::vector<struct lora_data *> loras;
for (size_t i = 0; i < lora_params->lora.size(); ++i) {
struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
if (llora_data != NULL) {
loras.push_back(llora_data);
// lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin";
lora.filename = params.hot_lora;
if (strlen(params.hot_lora) > 0) {
lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras?
lora_params->lora.push_back(lora);
// load all loras
std::vector<struct lora_data *> loras;
for (size_t i = 0; i < lora_params->lora.size(); ++i) {
struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
if (llora_data != NULL) {
loras.push_back(llora_data);
}
}
if (loras.size() == 0) {
fprintf(stderr, "warning: no lora adapters will be applied.\n");
}
// Assign data
ctx->llora_data = *loras[0];
// build the map?
ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
std::vector<std::string> keys;
for (const auto& pair : ctx->lora_weights_map) {
keys.push_back(pair.first);
}
}
if (loras.size() == 0) {
fprintf(stderr, "warning: no lora adapters will be applied.\n");
}
// Assign data
ctx->llora_data = *loras[0];
// build the map?
ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
std::vector<std::string> keys;
for (const auto& pair : ctx->lora_weights_map) {
keys.push_back(pair.first);
}
/// END LORA
/// LORA
const auto & hparams = model->hparams;
auto & cparams = ctx->cparams;
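
For orientation, below is a hedged sketch of what the name-keyed `lora_weights_map` built here is presumably used for at graph-build time; only the map type is visible in this hunk, so the entry layout and the helper are assumptions.
```cpp
// Hypothetical sketch only: the real lora_weights definition and the lookup
// site are not shown in this diff. Assumes the map is keyed by the base tensor
// name (e.g. "blk.0.attn_q.weight") and stores the two low-rank factors.
#include <map>
#include <string>
#include "ggml.h"

struct lora_weights_sketch {
    struct ggml_tensor * loraA; // low-rank A factor
    struct ggml_tensor * loraB; // low-rank B factor
};

// Resolve the adapter pair for a base weight by name; returns NULL when the
// tensor has no hot-LoRA entry (only Q/K/V are covered in the current status).
static const lora_weights_sketch * find_hot_lora(
        const std::map<std::string, lora_weights_sketch> & lora_map,
        const struct ggml_tensor * w) {
    const auto it = lora_map.find(ggml_get_name(w));
    return it == lora_map.end() ? NULL : &it->second;
}
```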

@@ -292,6 +292,7 @@ extern "C" {
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
char hot_lora[256]; // path to the hot lora file
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
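
Finally, a hedged caller-side sketch of the new `hot_lora` field, assuming the standard llama.cpp C API of this period; the model and adapter paths are placeholders. Because `hot_lora` is a fixed `char[256]`, the path is copied in with bounds checking, mirroring the `strncpy` added to `llama_context_params_from_gpt_params` above.
```cpp
// Hypothetical usage sketch; paths are placeholders, not files from this repo.
#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("ggml-model-f16.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    // hot_lora is a fixed-size buffer, so copy the adapter path with bounds checking
    snprintf(cparams.hot_lora, sizeof(cparams.hot_lora), "%s",
             "lora-ggml-model-q8_0-hot-lora-LATEST.bin");

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        llama_free_model(model);
        return 1;
    }

    // ... tokenize, decode, and sample as usual; Q/K/V projections now go
    // through the hot adapter loaded from cparams.hot_lora ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```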