Fix passing param

parent 12112bfa48
commit 26df64ad04

4 changed files with 40 additions and 30 deletions

@@ -30,19 +30,21 @@
 Run main with base model and lora adapter to hot-swap
 ```bash
-./main ./models/open-llama/ggml-model-f16.gguf \
---hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \
+./main -m ./models/open-llama/ggml-model-f16.gguf \
+--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
 -ngl 0 \
 -n 128
 ```

-With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors.
+With `ngl > 0` the code breaks, probably because the LoRA tensors try to interact with the base tensors (as in `lora_mul_mat`) but are not moved to the GPU buffer of the base tensors.

 # Logic




 # Current status

-- Only ony Lora adapter can be passed.
+- Only one LoRA adapter can be passed.
+- The adapter is applied only to the Q, K, V matrices to keep the code contained (finetuning produced LoRA tensors for all linear layers).
 - GPU not supported
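On the `ngl > 0` breakage described above: for the LoRA matrices to participate in `lora_mul_mat` on the GPU, they presumably need to live in the same backend buffer type as the base weights. Below is a minimal sketch of that idea using the public ggml allocation helpers; the helper name, the rank argument, and the tensor shapes are assumptions for illustration, not code from this branch.

```cpp
// Sketch only: create the LoRA tensors in a metadata-only (no_alloc) ggml context,
// then allocate them in the same buffer type that already holds the base weight,
// so a GPU mul_mat does not mix host and device tensors.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static ggml_backend_buffer_t place_lora_with_base(const struct ggml_tensor * base_w,
                                                  int64_t n_rank,
                                                  struct ggml_context ** out_ctx) {
    // Metadata-only context: tensor headers live here, data is allocated below.
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 2 * ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(ip);

    // Low-rank factors; exact shapes/orientation depend on how lora_mul_mat combines them.
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, base_w->ne[0], n_rank);
    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_rank, base_w->ne[1]);

    // Allocate every tensor of ctx in the buffer type of the base weight (CPU, CUDA,
    // Metal, ...); data can then be uploaded with ggml_backend_tensor_set.
    ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(base_w->buffer);
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);

    *out_ctx = ctx;
    return buf;
}
```
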
@@ -2443,6 +2443,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_ubatch = params.n_ubatch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    const char* c_string = params.hot_lora.c_str();
+    strncpy(cparams.hot_lora, c_string, sizeof(cparams.hot_lora) - 1);
+    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0'; // Ensure null-termination
+
     cparams.seed = params.seed;
     cparams.logits_all = params.logits_all;
     cparams.embeddings = params.embedding;

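The three added lines above copy the `std::string` from `gpt_params` into the fixed-size `char hot_lora[256]` field of `llama_context_params`. A standalone sketch of the same pattern, with a stand-in struct instead of the real one, since `strncpy` does not null-terminate when it truncates:

```cpp
#include <cstring>
#include <string>

// Stand-in for the real llama_context_params; only the field used here.
struct params_stub {
    char hot_lora[256];
};

// Copy `path` into the fixed-size field, truncating if necessary and always
// leaving a terminating '\0' (strncpy alone does not guarantee one).
static void set_hot_lora(params_stub & p, const std::string & path) {
    std::strncpy(p.hot_lora, path.c_str(), sizeof(p.hot_lora) - 1);
    p.hot_lora[sizeof(p.hot_lora) - 1] = '\0';
}
```
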
llama.cpp (17)

@@ -145,7 +145,7 @@ struct lora_info {
     std::string filename;
     float scale;
 };
-// TODO lora_data should maybe sub lora_weights in llama.cpp
+// TODO lora_data should maybe sub lora_weights
 struct lora_data {
     struct lora_info info;
     std::vector<uint8_t> data;

@@ -2502,7 +2502,7 @@ struct llama_context {

     llama_cparams cparams;
     bool lora_loaded = false;
-    std::map<std::string, lora_weights> lora_weights_map;
+    std::map<std::string, lora_weights> lora_weights_map; // only one LoRA adapter at the moment
     lora_data llora_data;
     float lora_scale = 1.0f;

@@ -16109,6 +16109,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_seq_max =*/ 1,
         /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
+        /*.hot_lora =*/ "",
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,

@@ -16321,8 +16322,11 @@ struct llama_context * llama_new_context_with_model(
     /// LORA
     struct export_lora_params * lora_params = new struct export_lora_params;
     struct lora_info lora;
-    lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin";
-    lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras
+    // lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin";
+    lora.filename = params.hot_lora;
+    if (strlen(params.hot_lora) > 0) {
+
+    lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras?
     lora_params->lora.push_back(lora);
     // load all loras
     std::vector<struct lora_data *> loras;

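The trailing question in the `lora.scale` comment and the "only one LoRA adapter" limitation suggest the single `lora_info` is a placeholder for a list. One possible direction, sketched with a hypothetical comma-separated `--hot-lora` value and a stand-in struct (neither is part of this commit); each parsed entry would then be pushed into `lora_params->lora`:

```cpp
#include <sstream>
#include <string>
#include <vector>

// Stand-in mirroring the fields of lora_info shown in the diff above.
struct lora_info_stub {
    std::string filename;
    float scale;
};

// Split a hypothetical "adapter1.bin,adapter2.bin" list into one entry per adapter,
// defaulting every scale to 1.0f; a per-adapter scale syntax could be parsed here too.
static std::vector<lora_info_stub> parse_hot_loras(const std::string & arg) {
    std::vector<lora_info_stub> out;
    std::stringstream ss(arg);
    std::string path;
    while (std::getline(ss, path, ',')) {
        if (!path.empty()) {
            out.push_back({path, 1.0f});
        }
    }
    return out;
}
```
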
@@ -16344,10 +16348,9 @@ struct llama_context * llama_new_context_with_model(
     for (const auto& pair : ctx->lora_weights_map) {
         keys.push_back(pair.first);
     }
+    }

+    /// LORA
-
-    /// END LORA
-
     const auto & hparams = model->hparams;
     auto & cparams = ctx->cparams;

llama.h (1)

@ -292,6 +292,7 @@ extern "C" {
|
||||||
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
|
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
|
||||||
uint32_t n_threads; // number of threads to use for generation
|
uint32_t n_threads; // number of threads to use for generation
|
||||||
uint32_t n_threads_batch; // number of threads to use for batch processing
|
uint32_t n_threads_batch; // number of threads to use for batch processing
|
||||||
|
char hot_lora[256]; // path to the hot lora file
|
||||||
|
|
||||||
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
|
||||||
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
|
||||||
|
|
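For reference, a minimal sketch of how a caller might use the new `hot_lora` field through the C API, assuming the function names available on this branch (`llama_backend_init`, `llama_load_model_from_file`, `llama_new_context_with_model`); the paths are the ones from the example at the top, and error handling is omitted:

```cpp
#include <cstring>
#include "llama.h"

int main() {
    llama_backend_init();

    // Load the base model as usual.
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file(
        "./models/open-llama/ggml-model-f16.gguf", mparams);

    // Request a hot-swappable LoRA by filling the new fixed-size field.
    llama_context_params cparams = llama_context_default_params(); // hot_lora defaults to ""
    std::strncpy(cparams.hot_lora,
                 "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin",
                 sizeof(cparams.hot_lora) - 1);
    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0';

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... run generation as usual ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```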