diff --git a/common/common.cpp b/common/common.cpp
index 521f849e2..08ecae357 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -684,14 +684,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+            nullptr,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char* lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+            nullptr,
+        });
         return true;
     }
     if (arg == "--control-vector") {
@@ -1654,6 +1662,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server",     "-sps, --slot-prompt-similarity SIMILARITY",
                                       "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server",     "--lora-no-apply",                "only load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_no_apply ? "enabled" : "disabled"});
 
 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -2091,17 +2100,18 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         }
     }
 
-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
             return std::make_tuple(nullptr, nullptr);
         }
-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+    }
+    if (!params.lora_no_apply) {
+        llama_lora_adapters_apply(lctx, params.lora_adapters);
     }
 
     if (params.ignore_eos) {
@@ -2138,6 +2148,15 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     return std::make_tuple(model, lctx);
 }
 
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
+}
+
 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
     auto mparams = llama_model_default_params();
 
@@ -3160,18 +3179,16 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }
 
     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, "  - %s\n", la.path.c_str());
         }
-        fprintf(stream, "  - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, "  - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, "  - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
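
Taken together, the common.cpp changes split adapter lifetime into two phases: llama_init_from_gpt_params() loads every adapter once via llama_lora_adapter_init(), while llama_lora_adapters_apply() decides which loaded adapters are actually attached to the context (a scale of 0.0f means loaded but inactive). A minimal hot-swap sketch against the new API; the two-adapter setup and the assumption that initialization has already filled in the .adapter handles are hypothetical:

```cpp
#include "common.h"
#include "llama.h"

// Sketch only: assumes params.lora_adapters was populated via --lora/--lora-scaled
// and llama_init_from_gpt_params() has already set each .adapter handle.
static void swap_to_second_adapter(struct llama_context * lctx, gpt_params & params) {
    params.lora_adapters[0].scale = 0.0f; // stays loaded in memory, but inactive
    params.lora_adapters[1].scale = 0.5f; // active at half strength
    // clears all adapters from the context, then re-applies every entry with a
    // non-zero scale -- no GGUF reload involved
    llama_lora_adapters_apply(lctx, params.lora_adapters);
}
```
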
diff --git a/common/common.h b/common/common.h
index 8240ff99b..8a29fb814 100644
--- a/common/common.h
+++ b/common/common.h
@@ -33,6 +33,12 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct llama_lora_adapter_container {
+    std::string path;
+    float scale;
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -126,8 +132,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    bool lora_no_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapters_apply)
+    std::vector<llama_lora_adapter_container> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -317,6 +323,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 
+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);
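
For reference, a sketch (file names hypothetical) of the gpt_params state produced by `--lora a.gguf --lora-scaled b.gguf 0.5 --lora-no-apply`: both adapters are loaded at startup, but neither is attached to the context until a later llama_lora_adapters_apply() call, e.g. one triggered by POST /lora-adapters below:

```cpp
#include "common.h"

// Mirrors what gpt_params_find_arg() does for the flags above; the .adapter
// handles start as nullptr and are filled in by llama_init_from_gpt_params().
static void configure_lora(gpt_params & params) {
    params.lora_adapters.push_back({ "a.gguf", 1.0f, nullptr }); // --lora
    params.lora_adapters.push_back({ "b.gguf", 0.5f, nullptr }); // --lora-scaled
    params.lora_no_apply = true; // load only; apply later
}
```
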
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 7813a2957..fa30d83cd 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -78,6 +78,7 @@ enum server_task_type {
     SERVER_TASK_TYPE_SLOT_SAVE,
     SERVER_TASK_TYPE_SLOT_RESTORE,
     SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_SET_LORA,
 };
 
 struct server_task {
@@ -1847,6 +1848,14 @@ struct server_context {
                     };
                     queue_results.send(result);
                 } break;
+            case SERVER_TASK_TYPE_SET_LORA:
+                {
+                    llama_lora_adapters_apply(ctx, params.lora_adapters);
+                    server_task_result result;
+                    result.id = task.id;
+                    result.data = json{{ "success", true }};
+                    queue_results.send(result);
+                } break;
         }
     }
 
@@ -3325,6 +3334,55 @@ int main(int argc, char ** argv) {
         return res.set_content(root.dump(), "application/json; charset=utf-8");
     };
 
+    const auto handle_lora_adapters_list = [&](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        json result = json::array();
+        for (size_t i = 0; i < ctx_server.params.lora_adapters.size(); ++i) {
+            auto & la = ctx_server.params.lora_adapters[i];
+            result.push_back({
+                {"id", i},
+                {"path", la.path},
+                {"scale", la.scale},
+            });
+        }
+        res.set_content(result.dump(), "application/json");
+        res.status = 200; // HTTP OK
+    };
+
+    const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+        const std::vector<json> body = json::parse(req.body);
+        int max_idx = ctx_server.params.lora_adapters.size();
+
+        // clear existing value
+        for (auto & la : ctx_server.params.lora_adapters) {
+            la.scale = 0.0f;
+        }
+
+        // set value
+        for (auto entry : body) {
+            int id = entry.at("id");
+            float scale = entry.at("scale");
+            if (0 <= id && id < max_idx) {
+                ctx_server.params.lora_adapters[id].scale = scale;
+            } else {
+                throw std::runtime_error("invalid adapter id");
+            }
+        }
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SET_LORA;
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        res.set_content(result.data.dump(), "application/json");
+        res.status = 200; // HTTP OK
+    };
+
     auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
         return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
             res.set_content(reinterpret_cast<const char *>(content), len, mime_type);
@@ -3363,7 +3421,6 @@ int main(int argc, char ** argv) {
 
     // register API routes
     svr->Get ("/health",              handle_health);
-    svr->Get ("/slots",               handle_slots);
     svr->Get ("/metrics",             handle_metrics);
     svr->Get ("/props",               handle_props);
     svr->Get ("/v1/models",           handle_models);
@@ -3378,6 +3435,11 @@ int main(int argc, char ** argv) {
     svr->Post("/v1/embeddings",       handle_embeddings);
     svr->Post("/tokenize",            handle_tokenize);
     svr->Post("/detokenize",          handle_detokenize);
+    // LoRA adapters hotswap
+    svr->Get ("/lora-adapters",       handle_lora_adapters_list);
+    svr->Post("/lora-adapters",       handle_lora_adapters_apply);
+    // Save & load slots
+    svr->Get ("/slots",               handle_slots);
     if (!params.slot_save_path.empty()) {
         // only enable slot endpoints if slot_save_path is set
         svr->Post("/slots/:id_slot",  handle_slots_action);
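
A client-side sketch of the two new endpoints, written against the cpp-httplib client since the server example already bundles that header; the host/port (the server's usual default of 8080) and the two-adapter setup are assumptions. Note that the POST handler zeroes every scale before applying the request body, so any adapter omitted from the list ends up disabled:

```cpp
#include "httplib.h"
#include <iostream>
#include <string>

int main() {
    httplib::Client cli("localhost", 8080); // assumed server address

    // GET /lora-adapters -> [{"id": 0, "path": "...", "scale": ...}, ...]
    if (auto res = cli.Get("/lora-adapters")) {
        std::cout << res->body << "\n";
    }

    // POST /lora-adapters: disable adapter 0, enable adapter 1 at full strength;
    // the apply itself runs on the server's task queue, not the HTTP thread
    const std::string body = R"([{"id": 0, "scale": 0.0}, {"id": 1, "scale": 1.0}])";
    if (auto res = cli.Post("/lora-adapters", body, "application/json")) {
        std::cout << res->body << "\n"; // {"success":true} on success
    }
    return 0;
}
```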