From 26df64ad04e377dc24427da0e178cdc67cd86e49 Mon Sep 17 00:00:00 2001
From: ltoniazzi
Date: Fri, 21 Jun 2024 17:28:14 +0100
Subject: [PATCH] Fix passing param

---
 BRANCH_SETUP.md   | 10 +++++----
 common/common.cpp |  4 ++++
 llama.cpp         | 55 +++++++++++++++++++++++++----------------------
 llama.h           |  1 +
 4 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/BRANCH_SETUP.md b/BRANCH_SETUP.md
index dac58d0d2..d9f7405b5 100644
--- a/BRANCH_SETUP.md
+++ b/BRANCH_SETUP.md
@@ -30,19 +30,21 @@
 Run main with base model and lora adapter to hot-swap
 
 ```bash
-./main ./models/open-llama/ggml-model-f16.gguf \
---hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \
+./main -m ./models/open-llama/ggml-model-f16.gguf \
+--hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
 -ngl 0 \
 -n 128
 ```
 
-With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (`lora_mul_mat`), but they are not moved to the buffer of the base tensors.
+With `-ngl > 0` the code breaks, probably because the LoRA tensors try to interact with the base tensors (as in `lora_mul_mat`) but are not moved to the GPU buffer that holds the base tensors.
 
 # Logic
+
 
 # Current status
 
-- Only ony Lora adapter can be passed.
+- Only one LoRA adapter can be passed.
+- The adapter is applied only to the Q, K, V matrices to keep the code contained (the finetuning trained LoRA tensors for all linear layers).
 - GPU not supported
\ No newline at end of file
diff --git a/common/common.cpp b/common/common.cpp
index 494258db0..21003343e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2443,6 +2443,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.n_ubatch          = params.n_ubatch;
     cparams.n_threads         = params.n_threads;
     cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    const char* c_string = params.hot_lora.c_str();
+    strncpy(cparams.hot_lora, c_string, sizeof(cparams.hot_lora) - 1);
+    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0'; // Ensure null-termination
+
     cparams.seed              = params.seed;
     cparams.logits_all        = params.logits_all;
     cparams.embeddings        = params.embedding;
diff --git a/llama.cpp b/llama.cpp
index 58b6ff864..467ab0f29 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -145,7 +145,7 @@ struct lora_info {
     std::string filename;
     float scale;
 };
-// TODO lora_data should maybe sub lora_weights in llama.cpp
+// TODO lora_data should maybe sub lora_weights
 struct lora_data {
     struct lora_info info;
     std::vector<uint8_t> data;
@@ -2502,7 +2502,7 @@ struct llama_context {
     llama_cparams cparams;
 
     bool lora_loaded = false;
-    std::map<std::string, lora_weights> lora_weights_map;
+    std::map<std::string, lora_weights> lora_weights_map; // only one LoRA adapter at the moment
     lora_data llora_data;
     float lora_scale = 1.0f;
 
@@ -16109,6 +16109,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_seq_max                  =*/ 1,
         /*.n_threads                  =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch            =*/ GGML_DEFAULT_N_THREADS,
+        /*.hot_lora                   =*/ "",
         /*.rope_scaling_type          =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.pooling_type               =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.rope_freq_base             =*/ 0.0f,
@@ -16321,33 +16322,35 @@ struct llama_context * llama_new_context_with_model(
     /// LORA
     struct export_lora_params * lora_params = new struct export_lora_params;
     struct lora_info lora;
-    lora.filename = "./models/open-llama/lora-ggml-model-q8_0-shakespeare-LATEST.bin";
-    lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras
-    lora_params->lora.push_back(lora);
-    // load all loras
-    std::vector<struct lora_data *> loras;
-    for (size_t i = 0; i < lora_params->lora.size(); ++i) {
-        struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
-        if (llora_data != NULL) {
-            loras.push_back(llora_data);
+    // lora.filename = "./models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin";
+    lora.filename = params.hot_lora;
+    if (strlen(params.hot_lora) > 0) {
+
+        lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras?
+        lora_params->lora.push_back(lora);
+        // load all loras
+        std::vector<struct lora_data *> loras;
+        for (size_t i = 0; i < lora_params->lora.size(); ++i) {
+            struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
+            if (llora_data != NULL) {
+                loras.push_back(llora_data);
+            }
+        }
+        if (loras.size() == 0) {
+            fprintf(stderr, "warning: no lora adapters will be applied.\n");
+        }
+        // Assign data
+        ctx->llora_data = *loras[0];
+
+        // build the map?
+        ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
+        std::vector<std::string> keys;
+        for (const auto& pair : ctx->lora_weights_map) {
+            keys.push_back(pair.first);
         }
     }
-    if (loras.size() == 0) {
-        fprintf(stderr, "warning: no lora adapters will be applied.\n");
-    }
-    // Assign data
-    ctx->llora_data = *loras[0];
-    // build the map?
-    ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
-    std::vector<std::string> keys;
-    for (const auto& pair : ctx->lora_weights_map) {
-        keys.push_back(pair.first);
-    }
-
-
-
-    /// END LORA
+    /// LORA
 
 
     const auto & hparams = model->hparams;
     auto       & cparams = ctx->cparams;
diff --git a/llama.h b/llama.h
index 85a53f1e6..d593eb45c 100644
--- a/llama.h
+++ b/llama.h
@@ -292,6 +292,7 @@ extern "C" {
         uint32_t n_seq_max;       // max number of sequences (i.e. distinct states for recurrent models)
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
+        char hot_lora[256];       // path to the hot lora file
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
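
For reference, a minimal usage sketch (not part of the patch) of how a caller might exercise the new `hot_lora` field directly through the public API, mirroring what `llama_context_params_from_gpt_params()` does in the hunk above. The model and adapter paths are placeholders, and GPU offload is left disabled since this branch does not support it yet:

```cpp
// Sketch only: assumes the hot-lora branch above, placeholder file paths,
// and the standard llama.cpp C API entry points of this revision.
#include "llama.h"

#include <cstring>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 0; // GPU offload is not supported by this patch

    llama_model * model = llama_load_model_from_file(
        "./models/open-llama/ggml-model-f16.gguf", mparams);

    llama_context_params cparams = llama_context_default_params();

    // Copy the adapter path into the fixed-size hot_lora buffer and
    // null-terminate it, as llama_context_params_from_gpt_params() now does.
    const char * adapter = "models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin";
    strncpy(cparams.hot_lora, adapter, sizeof(cparams.hot_lora) - 1);
    cparams.hot_lora[sizeof(cparams.hot_lora) - 1] = '\0';

    // llama_new_context_with_model() loads the adapter when hot_lora is
    // non-empty and builds the tensor-name -> lora_weights map.
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... run generation as usual; the adapter is applied to the Q, K, V mat-muls ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```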