From d61ed6b4316029b56d1c152ce04ab3bff96951d8 Mon Sep 17 00:00:00 2001
From: xaedes
Date: Sun, 20 Aug 2023 18:36:20 +0200
Subject: [PATCH] mixing multiple LoRA adapters is now possible

Pass more than one '--lora FNAME' argument to apply more than one LoRA
adapter. Use '--lora-scaled FNAME S' when you want to specify a
user-defined scale for an adapter.
---
 examples/common.cpp        | 26 ++++++++++++++++++++++----
 examples/common.h          |  4 ++--
 examples/server/server.cpp | 18 +++++++++++++++++-
 llama.cpp                  | 12 ++++++------
 llama.h                    |  2 ++
 5 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 21f4a0357..73fd16f36 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -310,7 +310,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter = argv[i];
+            params.lora_adapter.push_back({argv[i], 1.0f});
+            params.use_mmap = false;
+        } else if (arg == "--lora-scaled") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            const char * lora_adapter = argv[i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
             params.use_mmap = false;
         } else if (arg == "--lora-base") {
             if (++i >= argc) {
@@ -601,6 +613,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
     fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
     fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stdout, "  -m FNAME, --model FNAME\n");
     fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
@@ -677,10 +690,15 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         return std::make_tuple(nullptr, nullptr);
     }

-    if (!params.lora_adapter.empty()) {
+    for (int i = 0; i < params.lora_adapter.size(); ++i) {
+        const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
+        float lora_scale = std::get<1>(params.lora_adapter[i]);
         int err = llama_model_apply_lora_from_file(model,
-                                             params.lora_adapter.c_str(),
-                                             params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+                                             lora_adapter.c_str(),
+                                             lora_scale,
+                                             ((i > 0) || params.lora_base.empty())
+                                                ? NULL
+                                                : params.lora_base.c_str(),
                                              params.n_threads);
         if (err != 0) {
             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
diff --git a/examples/common.h b/examples/common.h
index 375bc0a3d..6fa906c3f 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -62,8 +62,8 @@ struct gpt_params {
     std::string grammar = "";  // optional BNF-like grammar to constrain sampling
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

-    std::string lora_adapter = "";  // lora adapter path
-    std::string lora_base    = "";  // base model path for the lora adapter
+    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    std::string lora_base = "";                               // base model path for the lora adapter

     bool hellaswag = false;       // compute HellaSwag score over random tasks from datafile supplied in prompt
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
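With the parsing above, '--lora FNAME' pushes {FNAME, 1.0f} and '--lora-scaled FNAME S' pushes {FNAME, S}, so both forms can be mixed freely on one command line. A minimal sketch of the resulting state (the invocation and file names below are placeholders, not real adapters):

    // Hypothetical invocation:
    //   ./main -m ggml-model.bin --lora style.bin --lora-scaled tone.bin 0.5
    // leaves gpt_params::lora_adapter holding:
    std::vector<std::tuple<std::string, float>> lora_adapter = {
        {"style.bin", 1.0f},  // --lora FNAME          -> implicit scale 1.0
        {"tone.bin",  0.5f},  // --lora-scaled FNAME S -> user-defined scale 0.5
    };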
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 6f7a66da1..5cc5b67b1 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -869,7 +869,23 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter = argv[i];
+            params.lora_adapter.push_back({argv[i], 1.0f});
+            params.use_mmap = false;
+        }
+        else if (arg == "--lora-scaled")
+        {
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            const char * lora_adapter = argv[i];
+            if (++i >= argc)
+            {
+                invalid_param = true;
+                break;
+            }
+            params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
             params.use_mmap = false;
         }
         else if (arg == "--lora-base")
diff --git a/llama.cpp b/llama.cpp
index 6af1e003c..33b7836bc 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -3401,7 +3401,7 @@ int llama_model_quantize(
     }
 }

-int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);

     const int64_t t_start_lora_us = ggml_time_us();
@@ -3433,7 +3433,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     int32_t lora_alpha;
     fin.read((char *) &lora_r, sizeof(lora_r));
     fin.read((char *) &lora_alpha, sizeof(lora_alpha));
-    float scaling = (float)lora_alpha / (float)lora_r;
+    float scaling = scale * (float)lora_alpha / (float)lora_r;

     fprintf(stderr, "%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);

@@ -3682,18 +3682,18 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     return 0;
 }

-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }

-int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
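Note on the scaling change in llama_apply_lora_from_file_internal: the adapter's own alpha/r ratio is now multiplied by the user-supplied scale, so plain '--lora' (scale 1.0) reproduces the previous behavior exactly. A quick sanity check of the arithmetic, with made-up header values (lora_r and lora_alpha are actually read from the adapter file):

    float   scale      = 0.5f;  // user-supplied S from --lora-scaled
    int32_t lora_r     = 8;     // illustrative value
    int32_t lora_alpha = 16;    // illustrative value
    float scaling = scale * (float)lora_alpha / (float)lora_r;  // 0.5 * 16 / 8 = 1.0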
diff --git a/llama.h b/llama.h
index bb6c3c107..70df37c8d 100644
--- a/llama.h
+++ b/llama.h
@@ -249,6 +249,7 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
                      struct llama_context * ctx,
                                const char * path_lora,
+                                      float scale,
                                const char * path_base_model,
                                         int n_threads),
             "please use llama_model_apply_lora_from_file instead");
@@ -256,6 +257,7 @@ extern "C" {

     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
                           const char * path_lora,
+                                 float scale,
                           const char * path_base_model,
                                    int n_threads);
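For callers using the library API directly instead of the common.cpp helper, a sketch of applying two adapters through the extended signature (paths, scales, and thread count are placeholders). It mirrors the loop in llama_init_from_gpt_params: lora_base is passed only for the first adapter, presumably so that later adapters stack on the already-patched tensors instead of starting over from the base weights.

    #include <cstdio>
    #include "llama.h"

    static int apply_two_adapters(struct llama_model * model) {
        const char * adapters[] = { "style.bin", "tone.bin" };
        const float  scales[]   = { 1.0f, 0.5f };
        for (int i = 0; i < 2; ++i) {
            const int err = llama_model_apply_lora_from_file(
                model,
                adapters[i],
                scales[i],
                (i > 0) ? NULL : "ggml-base.bin",  // base weights only for the first adapter
                /*n_threads=*/4);
            if (err != 0) {
                fprintf(stderr, "failed to apply lora adapter '%s'\n", adapters[i]);
                return 1;
            }
        }
        return 0;
    }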