From 0274e6b36466ebcd10fc9a58576fd5b7c006e99b Mon Sep 17 00:00:00 2001 From: trollkotze Date: Sun, 24 Mar 2024 22:13:55 +0100 Subject: [PATCH 01/17] Control vectors in server --- examples/server/server.cpp | 107 ++++++++++++++++++++++++++++++++++++- examples/server/utils.hpp | 5 ++ 2 files changed, 111 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b02c2546e..1ab80412b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -624,7 +624,6 @@ struct server_response { } } }; - struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; @@ -2700,6 +2699,35 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, break; } params.kv_overrides.push_back(kvo); + } else if (arg == "--control-vector") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vectors.push_back({ 1.0f, argv[i], }); + } else if (arg == "--control-vector-scaled") { + if (++i >= argc) { + invalid_param = true; + break; + } + const char* fname = argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vectors.push_back({ std::stof(argv[i]), fname, }); + } else if (arg == "--control-vector-layer-range") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vector_layer_start = std::stoi(argv[i]); + if (++i >= argc) { + invalid_param = true; + break; + } + params.control_vector_layer_end = std::stoi(argv[i]); + break; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); @@ -3148,6 +3176,81 @@ int main(int argc, char ** argv) { res.status = 200; // HTTP OK }; + const auto handle_get_control_vectors = [&ctx_server, ¶ms](const httplib::Request & req, httplib::Response & res) { + json vectors = json::array(); + + for (const auto & vec : params.control_vectors) { + vectors.push_back(json { + { "fname", vec.fname }, + { 
"strength", vec.strength } + }); + } + json data = { + { "vectors", vectors }, + { "layer_start", params.control_vector_layer_start }, + { "layer_end", params.control_vector_layer_end } + }; + res.set_content(data.dump(), "application/json; charset=utf-8"); + }; + + const auto handle_set_control_vectors = [&ctx_server, &res_error, ¶ms, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + + json data = json::parse(req.body); + std::vector vec_params; + + if (data.contains("vectors") && data["vectors"].is_array()) { + for (const auto &item : data["vectors"]) { + auto v = item.get(); + // std::cout << "Add vector: " << v.fname << " " << v.strength << "\n"; + vec_params.push_back(v); + } + } else { + std::cerr << "No vectors passed\n"; + res_error(res, format_error_response("No vectors passed", ERROR_TYPE_SERVER)); + return; + } + for (auto v : params.control_vectors) { + // std::cout << "Subtract vector:" << v.fname << " " << v.strength << "\n"; + vec_params.push_back({ -v.strength, v.fname }); + } + const auto cvec = llama_control_vector_load(vec_params); + if (cvec.n_embd == -1) { + // std::cerr << "Could not load control vector\n"; + res_error(res, format_error_response("Could not load control vector", ERROR_TYPE_SERVER)); + return; + } + + if (params.control_vector_layer_start <= 0) { + params.control_vector_layer_start = 1; + } + if (params.control_vector_layer_end <= 0){ + params.control_vector_layer_end = llama_n_layer(ctx_server.model); + } + int err = llama_control_vector_apply(ctx_server.ctx, + cvec.data.data(), + cvec.data.size(), + cvec.n_embd, + params.control_vector_layer_start, + params.control_vector_layer_end); + if (err) { + std::cerr << "Could not apply control vector\n"; + res_error(res, format_error_response("Could not apply control vector", ERROR_TYPE_SERVER)); + return; + } + auto s = params.control_vectors.size(); + auto s2 = 
vec_params.size(); + params.control_vectors.clear(); + unsigned i = 0; + for (auto v : vec_params) { + if (i++ < s2 - s) { + //std::cout << "set vector param: " << v.fname << " " << v.strength << "\n"; + params.control_vectors.push_back(v); + } + } + handle_get_control_vectors(req, res); + }; + const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { @@ -3497,8 +3600,10 @@ int main(int argc, char ** argv) { svr->Get ("/health", handle_health); svr->Get ("/slots", handle_slots); svr->Get ("/metrics", handle_metrics); + svr->Get ("/control-vectors", handle_get_control_vectors); svr->Get ("/props", handle_props); svr->Get ("/v1/models", handle_models); + svr->Post("/control-vectors", handle_set_control_vectors); svr->Post("/completion", handle_completions); // legacy svr->Post("/completions", handle_completions); svr->Post("/v1/completions", handle_completions); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 8f20ff614..f73ec441f 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -615,3 +615,8 @@ static json format_error_response(const std::string & message, const enum error_ {"type", type_str}, }; } + +void from_json(const json& j, llama_control_vector_load_info& l) { + j.at("strength").get_to(l.strength); + j.at("fname").get_to(l.fname); +} From 7dbed974dcd3fb73be578a4e6cf327c6b8ab09cf Mon Sep 17 00:00:00 2001 From: trollkotze Date: Mon, 25 Mar 2024 02:07:54 +0100 Subject: [PATCH 02/17] hmm... 
--- examples/server/server.cpp | 53 ++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1ab80412b..6ce64c94d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3176,10 +3176,10 @@ int main(int argc, char ** argv) { res.status = 200; // HTTP OK }; - const auto handle_get_control_vectors = [&ctx_server, ¶ms](const httplib::Request & req, httplib::Response & res) { + const auto handle_get_control_vectors = [&ctx_server](const httplib::Request & req, httplib::Response & res) { json vectors = json::array(); - for (const auto & vec : params.control_vectors) { + for (const auto & vec : ctx_server.params.control_vectors) { vectors.push_back(json { { "fname", vec.fname }, { "strength", vec.strength } @@ -3187,13 +3187,13 @@ int main(int argc, char ** argv) { } json data = { { "vectors", vectors }, - { "layer_start", params.control_vector_layer_start }, - { "layer_end", params.control_vector_layer_end } + { "layer_start", ctx_server.params.control_vector_layer_start }, + { "layer_end", ctx_server.params.control_vector_layer_end } }; res.set_content(data.dump(), "application/json; charset=utf-8"); }; - const auto handle_set_control_vectors = [&ctx_server, &res_error, ¶ms, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) { + const auto handle_set_control_vectors = [&ctx_server, &res_error, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = json::parse(req.body); @@ -3202,7 +3202,7 @@ int main(int argc, char ** argv) { if (data.contains("vectors") && data["vectors"].is_array()) { for (const auto &item : data["vectors"]) { auto v = item.get(); - // std::cout << "Add vector: " << v.fname << " " << v.strength << "\n"; + std::cout << "Add vector: " << v.fname << " " << v.strength << "\n"; 
vec_params.push_back(v); } } else { @@ -3210,44 +3210,47 @@ int main(int argc, char ** argv) { res_error(res, format_error_response("No vectors passed", ERROR_TYPE_SERVER)); return; } - for (auto v : params.control_vectors) { - // std::cout << "Subtract vector:" << v.fname << " " << v.strength << "\n"; - vec_params.push_back({ -v.strength, v.fname }); - } const auto cvec = llama_control_vector_load(vec_params); if (cvec.n_embd == -1) { - // std::cerr << "Could not load control vector\n"; + std::cerr << "Could not load control vector\n"; res_error(res, format_error_response("Could not load control vector", ERROR_TYPE_SERVER)); return; } - if (params.control_vector_layer_start <= 0) { - params.control_vector_layer_start = 1; + if (ctx_server.params.control_vector_layer_start <= 0) { + ctx_server.params.control_vector_layer_start = 1; } - if (params.control_vector_layer_end <= 0){ - params.control_vector_layer_end = llama_n_layer(ctx_server.model); + if (ctx_server.params.control_vector_layer_end <= 0){ + ctx_server.params.control_vector_layer_end = llama_n_layer(ctx_server.model); } int err = llama_control_vector_apply(ctx_server.ctx, cvec.data.data(), cvec.data.size(), cvec.n_embd, - params.control_vector_layer_start, - params.control_vector_layer_end); + ctx_server.params.control_vector_layer_start, + ctx_server.params.control_vector_layer_end); if (err) { std::cerr << "Could not apply control vector\n"; res_error(res, format_error_response("Could not apply control vector", ERROR_TYPE_SERVER)); return; } - auto s = params.control_vectors.size(); - auto s2 = vec_params.size(); - params.control_vectors.clear(); - unsigned i = 0; + ctx_server.params.control_vectors.clear(); for (auto v : vec_params) { - if (i++ < s2 - s) { - //std::cout << "set vector param: " << v.fname << " " << v.strength << "\n"; - params.control_vectors.push_back(v); - } + //std::cout << "set vector param: " << v.fname << " " << v.strength << "\n"; + 
ctx_server.params.control_vectors.push_back(v); } + + /*std::cerr << "Maybe we need to do this initiation ritual before it werks?\n"; // No, it's still all garbled bullshit. + + std::vector tmp = { llama_token_bos(ctx_server.model), llama_token_eos(ctx_server.model), }; + std::cerr << "decode, bro\n"; + llama_decode(ctx_server.ctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) ctx_server.params.n_batch), 0, 0)); + std::cerr << "clear that fucking cache\n"; + llama_kv_cache_clear(ctx_server.ctx); + std::cerr << "symcr0nice or what\n"; + llama_synchronize(ctx_server.ctx); + std::cerr << "time will tell\n"; + llama_reset_timings(ctx_server.ctx);*/ handle_get_control_vectors(req, res); }; From 92070cab2a1e95e779ae75c011af301cec205828 Mon Sep 17 00:00:00 2001 From: trollkotze Date: Mon, 25 Mar 2024 04:33:44 +0100 Subject: [PATCH 03/17] Maybe adding a memory leak? But it werks now. --- llama.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp index 61587cb7a..8ba930d7b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1950,6 +1950,7 @@ struct llama_control_vector { } ~llama_control_vector() { + LLAMA_LOG_ERROR("Kill the control vector\n"); for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); } @@ -13994,9 +13995,9 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const } static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) { - GGML_ASSERT(cvec.tensors.empty()); - GGML_ASSERT(cvec.ctxs.empty()); - GGML_ASSERT(cvec.bufs.empty()); + cvec.tensors.clear(); + cvec.ctxs.clear(); + cvec.bufs.clear(); // count layer buffer types std::map buft_layer_count; @@ -14062,10 +14063,9 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da return 1; } - if (cvec.tensors.empty()) { - if (!llama_control_vector_init(cvec, model)) { - return 1; - } + if (!llama_control_vector_init(cvec, model)) { + LLAMA_LOG_ERROR("%s: 
FUCKING BITCH\n", __func__); + return 1; } cvec.layer_start = il_start; From f0722b1352b604b45a1c24346a9c4fba6c9acadb Mon Sep 17 00:00:00 2001 From: trollkotze Date: Mon, 25 Mar 2024 17:04:31 +0100 Subject: [PATCH 04/17] clean up failed attempt at implementing control-vector hot-swapping --- examples/server/server.cpp | 81 +------------------------------------- examples/server/utils.hpp | 5 --- llama.cpp | 14 +++---- 3 files changed, 8 insertions(+), 92 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6ce64c94d..49e47fd30 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3176,84 +3176,6 @@ int main(int argc, char ** argv) { res.status = 200; // HTTP OK }; - const auto handle_get_control_vectors = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - json vectors = json::array(); - - for (const auto & vec : ctx_server.params.control_vectors) { - vectors.push_back(json { - { "fname", vec.fname }, - { "strength", vec.strength } - }); - } - json data = { - { "vectors", vectors }, - { "layer_start", ctx_server.params.control_vector_layer_start }, - { "layer_end", ctx_server.params.control_vector_layer_end } - }; - res.set_content(data.dump(), "application/json; charset=utf-8"); - }; - - const auto handle_set_control_vectors = [&ctx_server, &res_error, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - - json data = json::parse(req.body); - std::vector vec_params; - - if (data.contains("vectors") && data["vectors"].is_array()) { - for (const auto &item : data["vectors"]) { - auto v = item.get(); - std::cout << "Add vector: " << v.fname << " " << v.strength << "\n"; - vec_params.push_back(v); - } - } else { - std::cerr << "No vectors passed\n"; - res_error(res, format_error_response("No vectors passed", ERROR_TYPE_SERVER)); - return; - } - const auto cvec = 
llama_control_vector_load(vec_params); - if (cvec.n_embd == -1) { - std::cerr << "Could not load control vector\n"; - res_error(res, format_error_response("Could not load control vector", ERROR_TYPE_SERVER)); - return; - } - - if (ctx_server.params.control_vector_layer_start <= 0) { - ctx_server.params.control_vector_layer_start = 1; - } - if (ctx_server.params.control_vector_layer_end <= 0){ - ctx_server.params.control_vector_layer_end = llama_n_layer(ctx_server.model); - } - int err = llama_control_vector_apply(ctx_server.ctx, - cvec.data.data(), - cvec.data.size(), - cvec.n_embd, - ctx_server.params.control_vector_layer_start, - ctx_server.params.control_vector_layer_end); - if (err) { - std::cerr << "Could not apply control vector\n"; - res_error(res, format_error_response("Could not apply control vector", ERROR_TYPE_SERVER)); - return; - } - ctx_server.params.control_vectors.clear(); - for (auto v : vec_params) { - //std::cout << "set vector param: " << v.fname << " " << v.strength << "\n"; - ctx_server.params.control_vectors.push_back(v); - } - - /*std::cerr << "Maybe we need to do this initiation ritual before it werks?\n"; // No, it's still all garbled bullshit. 
- - std::vector tmp = { llama_token_bos(ctx_server.model), llama_token_eos(ctx_server.model), }; - std::cerr << "decode, bro\n"; - llama_decode(ctx_server.ctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) ctx_server.params.n_batch), 0, 0)); - std::cerr << "clear that fucking cache\n"; - llama_kv_cache_clear(ctx_server.ctx); - std::cerr << "symcr0nice or what\n"; - llama_synchronize(ctx_server.ctx); - std::cerr << "time will tell\n"; - llama_reset_timings(ctx_server.ctx);*/ - handle_get_control_vectors(req, res); - }; - const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { @@ -3603,10 +3525,8 @@ int main(int argc, char ** argv) { svr->Get ("/health", handle_health); svr->Get ("/slots", handle_slots); svr->Get ("/metrics", handle_metrics); - svr->Get ("/control-vectors", handle_get_control_vectors); svr->Get ("/props", handle_props); svr->Get ("/v1/models", handle_models); - svr->Post("/control-vectors", handle_set_control_vectors); svr->Post("/completion", handle_completions); // legacy svr->Post("/completions", handle_completions); svr->Post("/v1/completions", handle_completions); @@ -3681,3 +3601,4 @@ int main(int argc, char ** argv) { return 0; } + diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index f73ec441f..8f20ff614 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -615,8 +615,3 @@ static json format_error_response(const std::string & message, const enum error_ {"type", type_str}, }; } - -void from_json(const json& j, llama_control_vector_load_info& l) { - j.at("strength").get_to(l.strength); - j.at("fname").get_to(l.fname); -} diff --git a/llama.cpp b/llama.cpp index 8ba930d7b..61587cb7a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1950,7 +1950,6 @@ struct llama_control_vector { } ~llama_control_vector() { - LLAMA_LOG_ERROR("Kill the control vector\n"); for (struct 
ggml_context * ctx : ctxs) { ggml_free(ctx); } @@ -13995,9 +13994,9 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const } static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) { - cvec.tensors.clear(); - cvec.ctxs.clear(); - cvec.bufs.clear(); + GGML_ASSERT(cvec.tensors.empty()); + GGML_ASSERT(cvec.ctxs.empty()); + GGML_ASSERT(cvec.bufs.empty()); // count layer buffer types std::map buft_layer_count; @@ -14063,9 +14062,10 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da return 1; } - if (!llama_control_vector_init(cvec, model)) { - LLAMA_LOG_ERROR("%s: FUCKING BITCH\n", __func__); - return 1; + if (cvec.tensors.empty()) { + if (!llama_control_vector_init(cvec, model)) { + return 1; + } } cvec.layer_start = il_start; From f5b8622b3a3990a6e37f35aecc8272c03027299b Mon Sep 17 00:00:00 2001 From: trollkotze Date: Mon, 25 Mar 2024 17:38:49 +0100 Subject: [PATCH 05/17] Copy usage output for control-vector params to server.cpp --- examples/server/server.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 49e47fd30..f8075a56b 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2216,6 +2216,12 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" set an alias for the model, will be added as `model` field in completion response\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + printf(" --control-vector FNAME\n"); + printf(" add a control vector\n"); + printf(" --control-vector-scaled FNAME S\n"); + printf(" add a control vector with user defined scaling S\n"); + printf(" --control-vector-layer-range START END\n"); + printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); 
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n"); From 485358aa624cd2acb9df4f21f2415db43b7cd0fd Mon Sep 17 00:00:00 2001 From: trollkotze Date: Mon, 25 Mar 2024 17:51:13 +0100 Subject: [PATCH 06/17] Whitespace --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f8075a56b..4947a6d00 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -624,6 +624,7 @@ struct server_response { } } }; + struct server_context { llama_model * model = nullptr; llama_context * ctx = nullptr; @@ -3607,4 +3608,3 @@ int main(int argc, char ** argv) { return 0; } - From 6e1fbf87b07b84f68fbddf8816a5bef8e1d63526 Mon Sep 17 00:00:00 2001 From: trollkotze Date: Tue, 26 Mar 2024 04:09:37 +0100 Subject: [PATCH 07/17] Indentation --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4947a6d00..d6e16875f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2218,9 +2218,9 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" --control-vector FNAME\n"); - printf(" add a control vector\n"); + printf(" add a control vector\n"); printf(" --control-vector-scaled FNAME S\n"); - printf(" add a control vector with user defined scaling S\n"); + printf(" add a control vector with user defined scaling S\n"); printf(" --control-vector-layer-range START END\n"); printf(" layer range to apply the control vector(s) to, start and end 
inclusive\n"); printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); From 181879f9424e81be4f56f5b34b9616d49caacb3b Mon Sep 17 00:00:00 2001 From: Anon Date: Tue, 26 Mar 2024 01:28:18 +0000 Subject: [PATCH 08/17] llama_control_vector_load: let gguf_init_from_file allocate the ggml_context --- common/common.cpp | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index fb80d4bf7..793e08a9c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2650,12 +2650,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer { - struct ggml_init_params meta_params = { - /* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(), - /* .mem_buffer = */ nullptr, - /* .no_alloc = */ true, - }; - ggml_context * meta_ctx = ggml_init(meta_params); + ggml_context * meta_ctx = nullptr; struct gguf_init_params meta_gguf_params = { /* .no_alloc = */ true, /* .ctx = */ &meta_ctx, @@ -2720,13 +2715,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr } // load and scale tensors into final control vector context - struct ggml_init_params ggml_params = { - /* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes, - /* .mem_buffer = */ nullptr, - /* .no_alloc = */ false, - }; - struct ggml_context * ctx = ggml_init(ggml_params); - + struct ggml_context * ctx = nullptr; struct gguf_init_params params = { /*.no_alloc = */ false, /*.ctx = */ &ctx, From 9914014e1756fe7c2e7b9896d9e67474f6380bb4 Mon Sep 17 00:00:00 2001 From: Anon Date: Tue, 26 Mar 2024 01:28:34 +0000 Subject: [PATCH 09/17] llama_control_vector_load: free contexts on successful exit --- common/common.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index 793e08a9c..ea2aa1d1c 100644 --- a/common/common.cpp +++ 
b/common/common.cpp @@ -2748,6 +2748,9 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr } } + gguf_free(ctx_gguf); + ggml_free(ctx); + return result; } From d0304f76566ab82637f4c8906eeec010f005ba9c Mon Sep 17 00:00:00 2001 From: Anon Date: Tue, 26 Mar 2024 01:28:55 +0000 Subject: [PATCH 10/17] llama_control_vector_load: free gguf_context before ggml_context --- common/common.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index ea2aa1d1c..d15e1810b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2673,8 +2673,8 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr uint32_t layer = std::stoi(name.substr(dotpos + 1)); if (layer == 0) { fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); gguf_free(meta_ctx_gguf); + ggml_free(meta_ctx); return result; } if (layer > max_direction_layer) { @@ -2682,8 +2682,8 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr } } catch (...) 
{ fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); gguf_free(meta_ctx_gguf); + ggml_free(meta_ctx); return result; } } @@ -2691,22 +2691,22 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str()); if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) { fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); gguf_free(meta_ctx_gguf); + ggml_free(meta_ctx); return result; } if (result.n_embd == -1) { result.n_embd = ggml_nelements(tensor_meta); } else if (ggml_nelements(tensor_meta) != result.n_embd) { fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str()); - ggml_free(meta_ctx); gguf_free(meta_ctx_gguf); + ggml_free(meta_ctx); return result; } n_bytes += ggml_nbytes(tensor_meta); } - ggml_free(meta_ctx); gguf_free(meta_ctx_gguf); + ggml_free(meta_ctx); } if (n_tensors == 0) { From bd9f6b9dcffa008898d9b2a8663f00716faee926 Mon Sep 17 00:00:00 2001 From: trollkotze Date: Tue, 26 Mar 2024 18:52:31 +0100 Subject: [PATCH 11/17] log time measurements --- common/common.cpp | 13 ++++++++++--- llama.cpp | 17 +++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d15e1810b..8c0993c97 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2640,6 +2640,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n) // static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) { + auto start = ggml_time_ms(); + printf("control vector load_one...\n"); int32_t n_tensors; size_t n_bytes = 0; @@ -2684,7 +2686,6 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr fprintf(stderr, "%s: direction tensor invalid in %s\n", 
__func__, load_info.fname.c_str()); gguf_free(meta_ctx_gguf); ggml_free(meta_ctx); - return result; } } @@ -2751,10 +2752,14 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr gguf_free(ctx_gguf); ggml_free(ctx); + auto end = ggml_time_ms(); + printf("control vector load_one took %ums\n", end - start); return result; } llama_control_vector_data llama_control_vector_load(const std::vector & load_infos) { + auto start = ggml_time_ms(); + printf("control vector load...\n"); llama_control_vector_data result = { -1, {} }; for (const auto & info : load_infos) { @@ -2764,7 +2769,7 @@ llama_control_vector_data llama_control_vector_load(const std::vectormodel; llama_control_vector & cvec = lctx->cvec; @@ -14054,6 +14066,8 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da // disable the current control vector (but leave allocated for later) cvec.layer_start = -1; cvec.layer_end = -1; + auto end = ggml_time_ms(); + printf("control vector apply took %ums\n", end - start); return 0; } @@ -14064,6 +14078,7 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da if (cvec.tensors.empty()) { if (!llama_control_vector_init(cvec, model)) { + LLAMA_LOG_ERROR("%s: control vector init failed\n", __func__); return 1; } } @@ -14080,6 +14095,8 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da } } + auto end = ggml_time_ms(); + printf("control vector apply took %ums\n", end - start); return 0; } From 7b9e8726d1871cfebd26296f92bd43cad2b8fb7a Mon Sep 17 00:00:00 2001 From: trollkotze Date: Tue, 26 Mar 2024 18:57:06 +0100 Subject: [PATCH 12/17] Routes for hot-reloading and reading current vector composition --- examples/server/server.cpp | 70 ++++++++++++++++++++++++++++++++++++++ examples/server/utils.hpp | 5 +++ 2 files changed, 75 insertions(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d6e16875f..4b37965c3 100644 --- 
a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3183,6 +3183,74 @@ int main(int argc, char ** argv) { res.status = 200; // HTTP OK }; + const auto handle_get_control_vectors = [&ctx_server](const httplib::Request & req, httplib::Response & res) { + json vectors = json::array(); + + for (const auto & vec : ctx_server.params.control_vectors) { + vectors.push_back(json { + { "fname", vec.fname }, + { "strength", vec.strength } + }); + } + json data = { + { "vectors", vectors }, + { "layer_start", ctx_server.params.control_vector_layer_start }, + { "layer_end", ctx_server.params.control_vector_layer_end } + }; + res.set_content(data.dump(), "application/json; charset=utf-8"); + }; + + const auto handle_set_control_vectors = [&ctx_server, &res_error, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + + json data = json::parse(req.body); + std::vector vec_params; + + if (data.contains("vectors") && data["vectors"].is_array()) { + for (const auto &item : data["vectors"]) { + auto v = item.get(); + std::cout << "Add vector: " << v.fname << " " << v.strength << "\n"; + vec_params.push_back(v); + } + } else { + std::cerr << "No vectors passed\n"; + res_error(res, format_error_response("No vectors passed", ERROR_TYPE_SERVER)); + return; + } + const auto cvec = llama_control_vector_load(vec_params); + if (cvec.n_embd == -1) { + std::cerr << "Could not load control vector\n"; + res_error(res, format_error_response("Could not load control vector", ERROR_TYPE_SERVER)); + return; + } + + if (ctx_server.params.control_vector_layer_start <= 0) { + ctx_server.params.control_vector_layer_start = 1; + } + if (ctx_server.params.control_vector_layer_end <= 0){ + ctx_server.params.control_vector_layer_end = llama_n_layer(ctx_server.model); + } + int err = llama_control_vector_apply(ctx_server.ctx, + cvec.data.data(), + cvec.data.size(), + cvec.n_embd, + 
ctx_server.params.control_vector_layer_start, + ctx_server.params.control_vector_layer_end); + if (err) { + std::cerr << "Could not apply control vector\n"; + res_error(res, format_error_response("Could not apply control vector", ERROR_TYPE_SERVER)); + return; + } + ctx_server.params.control_vectors.clear(); + for (auto v : vec_params) { + std::cout << "set vector param: " << v.fname << " " << v.strength << "\n"; + ctx_server.params.control_vectors.push_back(v); + } + + handle_get_control_vectors(req, res); + }; + + const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { @@ -3534,6 +3602,8 @@ int main(int argc, char ** argv) { svr->Get ("/metrics", handle_metrics); svr->Get ("/props", handle_props); svr->Get ("/v1/models", handle_models); + svr->Get ("/control-vectors", handle_get_control_vectors); + svr->Post("/control-vectors", handle_set_control_vectors); svr->Post("/completion", handle_completions); // legacy svr->Post("/completions", handle_completions); svr->Post("/v1/completions", handle_completions); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 8f20ff614..f73ec441f 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -615,3 +615,8 @@ static json format_error_response(const std::string & message, const enum error_ {"type", type_str}, }; } + +void from_json(const json& j, llama_control_vector_load_info& l) { + j.at("strength").get_to(l.strength); + j.at("fname").get_to(l.fname); +} From 80508e1ef5f715913832a1822f594f12299d0416 Mon Sep 17 00:00:00 2001 From: trollkotze Date: Tue, 26 Mar 2024 20:34:48 +0100 Subject: [PATCH 13/17] Access-Control-Allow-Origin header for GET /control-vectors --- examples/server/server.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 4b37965c3..f89a7edc2 100644 --- 
a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3184,6 +3184,7 @@ int main(int argc, char ** argv) { }; const auto handle_get_control_vectors = [&ctx_server](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json vectors = json::array(); for (const auto & vec : ctx_server.params.control_vectors) { From 2506fed8e877d9a22b549fd9a034e8c40b04cf03 Mon Sep 17 00:00:00 2001 From: trollkotze Date: Tue, 26 Mar 2024 20:56:58 +0100 Subject: [PATCH 14/17] Don't double-apply CORS header in POST /control-vectors --- examples/server/server.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index f89a7edc2..33ceda319 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -3202,8 +3202,6 @@ int main(int argc, char ** argv) { }; const auto handle_set_control_vectors = [&ctx_server, &res_error, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - json data = json::parse(req.body); std::vector vec_params; @@ -3215,12 +3213,14 @@ int main(int argc, char ** argv) { } } else { std::cerr << "No vectors passed\n"; + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res_error(res, format_error_response("No vectors passed", ERROR_TYPE_SERVER)); return; } const auto cvec = llama_control_vector_load(vec_params); if (cvec.n_embd == -1) { std::cerr << "Could not load control vector\n"; + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res_error(res, format_error_response("Could not load control vector", ERROR_TYPE_SERVER)); return; } @@ -3239,6 +3239,7 @@ int main(int argc, char ** argv) { ctx_server.params.control_vector_layer_end); if (err) { std::cerr << "Could not apply control vector\n"; + 
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res_error(res, format_error_response("Could not apply control vector", ERROR_TYPE_SERVER)); return; } From 6eae8bf5c3f69958cf35e7d5aaed9f51096f2ea2 Mon Sep 17 00:00:00 2001 From: trollkotze Date: Tue, 26 Mar 2024 21:08:23 +0100 Subject: [PATCH 15/17] utils.hpp: make from_json utility for llama_control_vector_load_info static --- examples/server/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index f73ec441f..79928264b 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -616,7 +616,7 @@ static json format_error_response(const std::string & message, const enum error_ }; } -void from_json(const json& j, llama_control_vector_load_info& l) { +static void from_json(const json& j, llama_control_vector_load_info& l) { j.at("strength").get_to(l.strength); j.at("fname").get_to(l.fname); } From d4897432a100d9e4625d589db33aba562101336d Mon Sep 17 00:00:00 2001 From: trollkotze Date: Wed, 27 Mar 2024 00:15:52 +0100 Subject: [PATCH 16/17] Restrict control vectors to predefined options --- examples/server/server.cpp | 122 +++++++++++++++++++++++++++++-------- examples/server/utils.hpp | 10 ++- 2 files changed, 106 insertions(+), 26 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 33ceda319..894afbd54 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -120,6 +120,8 @@ struct server_params { std::vector api_keys; + std::vector control_vector_load_options; + #ifdef CPPHTTPLIB_OPENSSL_SUPPORT std::string ssl_key_file = ""; std::string ssl_cert_file = ""; @@ -2735,6 +2737,25 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, } params.control_vector_layer_end = std::stoi(argv[i]); break; + } else if (arg == "--control-vector-option") { + if (++i >= argc) { + invalid_param = true; + break; + } + char *name = 
argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + size_t slen = strlen(argv[i]); + bool is_dir = slen < 5 || strncmp(argv[i] + slen - 5, ".gguf", 5) != 0; + + // Append path separator for dirs + std::string fname = argv[i]; + if (is_dir && argv[i][slen - 1] != '/') + fname += '/'; + sparams.control_vector_load_options.push_back({ argv[i-1], fname, is_dir }); + break; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); @@ -3183,6 +3204,16 @@ int main(int argc, char ** argv) { res.status = 200; // HTTP OK }; + const auto handle_control_vector_options = [&sparams](const httplib::Request & req, httplib::Response & res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + json options = json::array(); + + for (const auto & opt : sparams.control_vector_load_options) { + options.push_back(opt.name); + } + res.set_content(options.dump(), "application/json; charset=utf-8"); + }; + const auto handle_get_control_vectors = [&ctx_server](const httplib::Request & req, httplib::Response & res) { res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json vectors = json::array(); @@ -3201,23 +3232,62 @@ int main(int argc, char ** argv) { res.set_content(data.dump(), "application/json; charset=utf-8"); }; - const auto handle_set_control_vectors = [&ctx_server, &res_error, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) { + const auto handle_set_control_vectors = [&ctx_server, &sparams, &res_error, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); + + // vector parameters passed by user std::vector vec_params; + // names translated to real file names + std::vector real_vec_params; if (data.contains("vectors") && data["vectors"].is_array()) { for (const auto &item : data["vectors"]) { - auto v = item.get(); - std::cout << 
"Add vector: " << v.fname << " " << v.strength << "\n"; + llama_control_vector_load_info v = item.get(); + std::string real_fname = ""; + std::cout << "Check vec " << v.fname << "\n"; + // check for path traversal attempt + if (v.fname.length() > 0 && v.fname[0] != '/' && v.fname[0] != '\\') { + if (v.fname.find("../") == -1 && v.fname.find("..\\") == -1 && + v.fname.find("/..") == -1 && v.fname.find("\\..") == -1) { + + // check if vector name matches allowed names + for (auto opt : sparams.control_vector_load_options) { + std::cout << "check option " << opt.name << " : " << opt.fname << " : " << opt.is_dir << "\n"; + if (!opt.is_dir && opt.name == v.fname) { + std::cout << "file exact match\n"; + real_fname = opt.fname; + break; + } + if (opt.is_dir && v.fname.rfind(opt.name, 0) == 0) { + std::cout << "file exact match\n"; + // opt.fname already includes '/' (or '\') while opt.name doesn't + real_fname = opt.fname + v.fname.substr(opt.name.length() + 1); + break; + } + } + } + } + + if (real_fname.length() == 0) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + res_error(res, format_error_response("Control vector not allowed", ERROR_TYPE_SERVER)); + return; + } + + std::cout << "Add vector: " << v.fname << " -> " << real_fname << " " << v.strength << "\n"; + llama_control_vector_load_info real_info = { v.strength, real_fname }; vec_params.push_back(v); + real_vec_params.push_back(real_info); } } else { - std::cerr << "No vectors passed\n"; + std::cerr << "No vectors array passed\n"; res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - res_error(res, format_error_response("No vectors passed", ERROR_TYPE_SERVER)); + res_error(res, format_error_response("No vectors array passed. 
If you want reset to 0, send an empty array.", ERROR_TYPE_SERVER)); return; } - const auto cvec = llama_control_vector_load(vec_params); + + const auto cvec = llama_control_vector_load(real_vec_params); + if (cvec.n_embd == -1) { std::cerr << "Could not load control vector\n"; res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); @@ -3231,6 +3301,7 @@ int main(int argc, char ** argv) { if (ctx_server.params.control_vector_layer_end <= 0){ ctx_server.params.control_vector_layer_end = llama_n_layer(ctx_server.model); } + int err = llama_control_vector_apply(ctx_server.ctx, cvec.data.data(), cvec.data.size(), @@ -3243,7 +3314,9 @@ int main(int argc, char ** argv) { res_error(res, format_error_response("Could not apply control vector", ERROR_TYPE_SERVER)); return; } + ctx_server.params.control_vectors.clear(); + for (auto v : vec_params) { std::cout << "set vector param: " << v.fname << " " << v.strength << "\n"; ctx_server.params.control_vectors.push_back(v); @@ -3599,24 +3672,25 @@ int main(int argc, char ** argv) { json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); // register API routes - svr->Get ("/health", handle_health); - svr->Get ("/slots", handle_slots); - svr->Get ("/metrics", handle_metrics); - svr->Get ("/props", handle_props); - svr->Get ("/v1/models", handle_models); - svr->Get ("/control-vectors", handle_get_control_vectors); - svr->Post("/control-vectors", handle_set_control_vectors); - svr->Post("/completion", handle_completions); // legacy - svr->Post("/completions", handle_completions); - svr->Post("/v1/completions", handle_completions); - svr->Post("/chat/completions", handle_chat_completions); - svr->Post("/v1/chat/completions", handle_chat_completions); - svr->Post("/infill", handle_infill); - svr->Post("/embedding", handle_embeddings); // legacy - svr->Post("/embeddings", handle_embeddings); - svr->Post("/v1/embeddings", handle_embeddings); - svr->Post("/tokenize", 
handle_tokenize); - svr->Post("/detokenize", handle_detokenize); + svr->Get ("/health", handle_health); + svr->Get ("/slots", handle_slots); + svr->Get ("/metrics", handle_metrics); + svr->Get ("/props", handle_props); + svr->Get ("/v1/models", handle_models); + svr->Get ("/control-vectors", handle_get_control_vectors); + svr->Get ("/control-vector-options", handle_control_vector_options); + svr->Post("/control-vectors", handle_set_control_vectors); + svr->Post("/completion", handle_completions); // legacy + svr->Post("/completions", handle_completions); + svr->Post("/v1/completions", handle_completions); + svr->Post("/chat/completions", handle_chat_completions); + svr->Post("/v1/chat/completions", handle_chat_completions); + svr->Post("/infill", handle_infill); + svr->Post("/embedding", handle_embeddings); // legacy + svr->Post("/embeddings", handle_embeddings); + svr->Post("/v1/embeddings", handle_embeddings); + svr->Post("/tokenize", handle_tokenize); + svr->Post("/detokenize", handle_detokenize); // // Start the server diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 79928264b..b6de21b82 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -617,6 +617,12 @@ static json format_error_response(const std::string & message, const enum error_ } static void from_json(const json& j, llama_control_vector_load_info& l) { - j.at("strength").get_to(l.strength); - j.at("fname").get_to(l.fname); + j.at("strength").get_to(l.strength); + j.at("fname").get_to(l.fname); } + +struct llama_control_vector_load_option { + std::string name; + std::string fname; + bool is_dir; +}; \ No newline at end of file From 3c49d9387a52d84a758a3e1d785dbcdd877b94fb Mon Sep 17 00:00:00 2001 From: trollkotze Date: Wed, 27 Mar 2024 01:37:26 +0100 Subject: [PATCH 17/17] Add slash to dir options and replace slashes with backslash on windows when loading file --- examples/server/server.cpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 
deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 894afbd54..daddd159f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2742,19 +2742,23 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, invalid_param = true; break; } - char *name = argv[i]; + std::string name = argv[i]; + if (++i >= argc) { invalid_param = true; break; } - size_t slen = strlen(argv[i]); + std::string fname = argv[i]; + + size_t slen = fname.length(); bool is_dir = slen < 5 || strncmp(argv[i] + slen - 5, ".gguf", 5) != 0; - // Append path separator for dirs - std::string fname = argv[i]; + // Append path separator for dir names; name and fname differ in length, so each is tested on its own last character if (is_dir && argv[i][slen - 1] != '/') fname += '/'; - sparams.control_vector_load_options.push_back({ argv[i-1], fname, is_dir }); + if (is_dir && (name.empty() || name.back() != '/')) + name += '/'; + sparams.control_vector_load_options.push_back({ name, fname, is_dir }); break; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); @@ -3260,8 +3264,13 @@ int main(int argc, char ** argv) { } if (opt.is_dir && v.fname.rfind(opt.name, 0) == 0) { std::cout << "file exact match\n"; - // opt.fname already includes '/' (or '\') while opt.name doesn't - real_fname = opt.fname + v.fname.substr(opt.name.length() + 1); + real_fname = opt.fname + v.fname.substr(opt.name.length()); +#if defined(_WIN32) + std::replace(real_fname.begin(), real_fname.end(), '/', '\\'); +#endif + size_t len = real_fname.length(); + if (len < 5 || real_fname.compare(len - 5, 5, ".gguf") != 0) + real_fname += ".gguf"; break; } }