diff --git a/common/common.cpp b/common/common.cpp
index 3e2df6e34..7cc7d1b4e 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2635,6 +2635,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
 //
 
 static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
+    auto start = ggml_time_ms();
+    printf("control vector load_one...\n");
     int32_t n_tensors;
 
     size_t n_bytes = 0;
@@ -2645,12 +2647,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
 
     // calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
     {
-        struct ggml_init_params meta_params = {
-            /* .mem_size   = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
-            /* .mem_buffer = */ nullptr,
-            /* .no_alloc   = */ true,
-        };
-        ggml_context * meta_ctx = ggml_init(meta_params);
+        ggml_context * meta_ctx = nullptr;
         struct gguf_init_params meta_gguf_params = {
            /* .no_alloc = */ true,
            /* .ctx      = */ &meta_ctx,
@@ -2673,8 +2670,8 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
             uint32_t layer = std::stoi(name.substr(dotpos + 1));
             if (layer == 0) {
                 fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-                ggml_free(meta_ctx);
                 gguf_free(meta_ctx_gguf);
+                ggml_free(meta_ctx);
                 return result;
             }
             if (layer > max_direction_layer) {
@@ -2682,31 +2679,30 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
             }
         } catch (...) {
             fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
             gguf_free(meta_ctx_gguf);
+            ggml_free(meta_ctx);
             return result;
         }
 
         struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
         if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
             fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
             gguf_free(meta_ctx_gguf);
+            ggml_free(meta_ctx);
             return result;
         }
         if (result.n_embd == -1) {
             result.n_embd = ggml_nelements(tensor_meta);
         } else if (ggml_nelements(tensor_meta) != result.n_embd) {
             fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
-            ggml_free(meta_ctx);
             gguf_free(meta_ctx_gguf);
+            ggml_free(meta_ctx);
             return result;
         }
         n_bytes += ggml_nbytes(tensor_meta);
     }
 
-    ggml_free(meta_ctx);
     gguf_free(meta_ctx_gguf);
+    ggml_free(meta_ctx);
     }
 
     if (n_tensors == 0) {
@@ -2715,13 +2711,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
     }
 
     // load and scale tensors into final control vector context
-    struct ggml_init_params ggml_params = {
-        /* .mem_size   = */ ggml_tensor_overhead() * n_tensors + n_bytes,
-        /* .mem_buffer = */ nullptr,
-        /* .no_alloc   = */ false,
-    };
-    struct ggml_context * ctx = ggml_init(ggml_params);
-
+    struct ggml_context * ctx = nullptr;
     struct gguf_init_params params = {
         /*.no_alloc = */ false,
         /*.ctx      = */ &ctx,
@@ -2754,10 +2744,17 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
         }
     }
 
+    gguf_free(ctx_gguf);
+    ggml_free(ctx);
+
+    auto end = ggml_time_ms();
+    printf("control vector load_one took %lldms\n", (long long) (end - start));
     return result;
 }
 
 llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
+    auto start = ggml_time_ms();
+    printf("control vector load...\n");
     llama_control_vector_data result = { -1, {} };
 
     for (const auto & info : load_infos) {
@@ -2767,7 +2764,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ ... @@ struct server_params {
     std::vector<std::string> api_keys;
+    std::vector<llama_control_vector_load_option> control_vector_load_options;
+
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
     std::string ssl_key_file = "";
     std::string ssl_cert_file = "";
@@ -2226,6 +2228,12 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     printf("                            set an alias for the model, will be added as `model` field in completion response\n");
     printf("  --lora FNAME              apply LoRA adapter (implies --no-mmap)\n");
     printf("  --lora-base FNAME         optional model to use as a base for the layers modified by the LoRA adapter\n");
+    printf("  --control-vector FNAME\n");
+    printf("                            add a control vector\n");
+    printf("  --control-vector-scaled FNAME S\n");
+    printf("                            add a control vector with user defined scaling S\n");
+    printf("  --control-vector-layer-range START END\n");
+    printf("                            layer range to apply the control vector(s) to, start and end inclusive\n");
     printf("  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
     printf("  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
     printf("  --path PUBLIC_PATH    path from which to serve static files (default: disabled)\n");
@@ -2711,6 +2719,58 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 break;
             }
             params.kv_overrides.push_back(kvo);
+        } else if (arg == "--control-vector") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back({ 1.0f, argv[i], });
+        } else if (arg == "--control-vector-scaled") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            const char * fname = argv[i];
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vectors.push_back({ std::stof(argv[i]), fname, });
+        } else if (arg == "--control-vector-layer-range") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vector_layer_start = std::stoi(argv[i]);
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.control_vector_layer_end = std::stoi(argv[i]);
+        } else if (arg == "--control-vector-option") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string name = argv[i];
+
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string fname = argv[i];
+
+            // anything that does not end in ".gguf" is treated as a directory of control vectors
+            size_t slen = fname.length();
+            bool is_dir = slen < 5 || fname.compare(slen - 5, 5, ".gguf") != 0;
+
+            // append a path separator to directory names so prefix matching works later
+            if (is_dir && !fname.empty() && fname.back() != '/') {
+                fname += '/';
+            }
+            if (is_dir && !name.empty() && name.back() != '/') {
+                name += '/';
+            }
+            sparams.control_vector_load_options.push_back({ name, fname, is_dir });
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             server_print_usage(argv[0], default_params, default_sparams);
@@ -3159,6 +3219,133 @@ int main(int argc, char ** argv) {
         res.status = 200; // HTTP OK
     };
 
+    const auto handle_control_vector_options = [&sparams](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        json options = json::array();
+
+        for (const auto & opt : sparams.control_vector_load_options) {
+            options.push_back(opt.name);
+        }
+        res.set_content(options.dump(), "application/json; charset=utf-8");
+    };
+
+    const auto handle_get_control_vectors = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        json vectors = json::array();
+
+        for (const auto & vec : ctx_server.params.control_vectors) {
+            vectors.push_back(json {
+                { "fname",    vec.fname },
+                { "strength", vec.strength }
+            });
+        }
+        json data = {
+            { "vectors",     vectors },
+            { "layer_start", ctx_server.params.control_vector_layer_start },
+            { "layer_end",   ctx_server.params.control_vector_layer_end }
+        };
+        res.set_content(data.dump(), "application/json; charset=utf-8");
+    };
+
+    const auto handle_set_control_vectors = [&ctx_server, &sparams, &res_error, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) {
+        json data = json::parse(req.body);
+
+        // vector parameters passed by user
+        std::vector<llama_control_vector_load_info> vec_params;
+        // names translated to real file names
+        std::vector<llama_control_vector_load_info> real_vec_params;
+
+        if (data.contains("vectors") && data["vectors"].is_array()) {
+            for (const auto & item : data["vectors"]) {
+                llama_control_vector_load_info v = item.get<llama_control_vector_load_info>();
+                std::string real_fname = "";
+                std::cout << "Check vec " << v.fname << "\n";
+                // check for path traversal attempt
+                if (v.fname.length() > 0 && v.fname[0] != '/' && v.fname[0] != '\\') {
+                    if (v.fname.find("../") == std::string::npos && v.fname.find("..\\") == std::string::npos &&
+                        v.fname.find("/..") == std::string::npos && v.fname.find("\\..") == std::string::npos) {
+
+                        // check if vector name matches allowed names
+                        for (const auto & opt : sparams.control_vector_load_options) {
+                            std::cout << "check option " << opt.name << " : " << opt.fname << " : " << opt.is_dir << "\n";
+                            if (!opt.is_dir && opt.name == v.fname) {
+                                std::cout << "file exact match\n";
+                                real_fname = opt.fname;
+                                break;
+                            }
+                            if (opt.is_dir && v.fname.rfind(opt.name, 0) == 0) {
+                                std::cout << "directory prefix match\n";
+                                real_fname = opt.fname + v.fname.substr(opt.name.length());
+#if defined(_WIN32)
+                                std::replace(real_fname.begin(), real_fname.end(), '/', '\\');
+#endif
+                                size_t len = real_fname.length();
+                                if (len < 5 || real_fname.compare(len - 5, 5, ".gguf") != 0) {
+                                    real_fname += ".gguf";
+                                }
+                                break;
+                            }
+                        }
+                    }
+                }
+
+                if (real_fname.length() == 0) {
+                    res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+                    res_error(res, format_error_response("Control vector not allowed", ERROR_TYPE_SERVER));
+                    return;
+                }
+
+                std::cout << "Add vector: " << v.fname << " -> " << real_fname << " " << v.strength << "\n";
+                llama_control_vector_load_info real_info = { v.strength, real_fname };
+                vec_params.push_back(v);
+                real_vec_params.push_back(real_info);
+            }
+        } else {
+            std::cerr << "No vectors array passed\n";
+            res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+            res_error(res, format_error_response("No vectors array passed. If you want to reset, send an empty array.", ERROR_TYPE_SERVER));
+            return;
+        }
+
+        const auto cvec = llama_control_vector_load(real_vec_params);
+
+        if (cvec.n_embd == -1) {
+            std::cerr << "Could not load control vector\n";
+            res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+            res_error(res, format_error_response("Could not load control vector", ERROR_TYPE_SERVER));
+            return;
+        }
+
+        if (ctx_server.params.control_vector_layer_start <= 0) {
+            ctx_server.params.control_vector_layer_start = 1;
+        }
+        if (ctx_server.params.control_vector_layer_end <= 0) {
+            ctx_server.params.control_vector_layer_end = llama_n_layer(ctx_server.model);
+        }
+
+        int err = llama_control_vector_apply(ctx_server.ctx,
+                                             cvec.data.data(),
+                                             cvec.data.size(),
+                                             cvec.n_embd,
+                                             ctx_server.params.control_vector_layer_start,
+                                             ctx_server.params.control_vector_layer_end);
+        if (err) {
+            std::cerr << "Could not apply control vector\n";
+            res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+            res_error(res, format_error_response("Could not apply control vector", ERROR_TYPE_SERVER));
+            return;
+        }
+
+        ctx_server.params.control_vectors.clear();
+
+        for (const auto & v : vec_params) {
+            std::cout << "set vector param: " << v.fname << " " << v.strength << "\n";
+            ctx_server.params.control_vectors.push_back(v);
+        }
+
+        handle_get_control_vectors(req, res);
+    };
+
     const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
         res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
         json data = {
@@ -3505,22 +3692,25 @@ int main(int argc, char ** argv) {
             json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
 
     // register API routes
-    svr->Get ("/health",              handle_health);
-    svr->Get ("/slots",               handle_slots);
-    svr->Get ("/metrics",             handle_metrics);
-    svr->Get ("/props",               handle_props);
-    svr->Get ("/v1/models",           handle_models);
-    svr->Post("/completion",          handle_completions); // legacy
-    svr->Post("/completions",         handle_completions);
-    svr->Post("/v1/completions",      handle_completions);
-    svr->Post("/chat/completions",    handle_chat_completions);
-    svr->Post("/v1/chat/completions", handle_chat_completions);
-    svr->Post("/infill",              handle_infill);
-    svr->Post("/embedding",           handle_embeddings); // legacy
-    svr->Post("/embeddings",          handle_embeddings);
-    svr->Post("/v1/embeddings",       handle_embeddings);
-    svr->Post("/tokenize",            handle_tokenize);
-    svr->Post("/detokenize",          handle_detokenize);
+    svr->Get ("/health",                 handle_health);
+    svr->Get ("/slots",                  handle_slots);
+    svr->Get ("/metrics",                handle_metrics);
+    svr->Get ("/props",                  handle_props);
+    svr->Get ("/v1/models",              handle_models);
+    svr->Get ("/control-vectors",        handle_get_control_vectors);
+    svr->Get ("/control-vector-options", handle_control_vector_options);
+    svr->Post("/control-vectors",        handle_set_control_vectors);
+    svr->Post("/completion",             handle_completions); // legacy
+    svr->Post("/completions",            handle_completions);
+    svr->Post("/v1/completions",         handle_completions);
+    svr->Post("/chat/completions",       handle_chat_completions);
+    svr->Post("/v1/chat/completions",    handle_chat_completions);
+    svr->Post("/infill",                 handle_infill);
+    svr->Post("/embedding",              handle_embeddings); // legacy
+    svr->Post("/embeddings",             handle_embeddings);
+    svr->Post("/v1/embeddings",          handle_embeddings);
+    svr->Post("/tokenize",               handle_tokenize);
+    svr->Post("/detokenize",             handle_detokenize);
 
     //
     // Start the server
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index a8d43ac63..d45b9b917 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -655,3 +655,14 @@ static json format_error_response(const std::string & message, const enum error_
         {"type", type_str},
     };
 }
+
+static void from_json(const json & j, llama_control_vector_load_info & l) {
+    j.at("strength").get_to(l.strength);
+    j.at("fname").get_to(l.fname);
+}
+
+struct llama_control_vector_load_option {
+    std::string name;
+    std::string fname;
+    bool is_dir;
+};
\ No newline at end of file
diff --git a/llama.cpp b/llama.cpp
index 217726184..efe493746 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -14668,6 +14668,8 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
 }
 
 static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+    auto start = ggml_time_ms();
+    fprintf(stderr, "control vector init...\n");
     GGML_ASSERT(cvec.tensors.empty());
     GGML_ASSERT(cvec.ctxs.empty());
     GGML_ASSERT(cvec.bufs.empty());
@@ -14690,6 +14692,9 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
         ggml_context * ctx = ggml_init(params);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            auto end = ggml_time_ms();
+            fprintf(stderr, "control vector init took %lldms\n", (long long) (end - start));
             return 1;
         }
         ctx_map[it.first] = ctx;
@@ -14710,6 +14715,9 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
         ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
         if (!buf) {
             LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+            auto end = ggml_time_ms();
+            fprintf(stderr, "control vector init took %lldms\n", (long long) (end - start));
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
@@ -14717,10 +14725,14 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
         cvec.bufs.push_back(buf);
     }
 
+    auto end = ggml_time_ms();
+    fprintf(stderr, "control vector init took %lldms\n", (long long) (end - start));
     return true;
 }
 
 int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
+    auto start = ggml_time_ms();
+    printf("control vector apply...\n");
     const llama_model & model = lctx->model;
     llama_control_vector & cvec = lctx->cvec;
 
@@ -14728,6 +14740,8 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da
         // disable the current control vector (but leave allocated for later)
         cvec.layer_start = -1;
         cvec.layer_end   = -1;
+        auto end = ggml_time_ms();
+        printf("control vector apply took %lldms\n", (long long) (end - start));
         return 0;
     }
 
@@ -14738,6 +14752,7 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da
     if (cvec.tensors.empty()) {
         if (!llama_control_vector_init(cvec, model)) {
+            LLAMA_LOG_ERROR("%s: control vector init failed\n", __func__);
             return 1;
         }
     }
@@ -14754,6 +14769,8 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da
         }
     }
 
+    auto end = ggml_time_ms();
+    printf("control vector apply took %lldms\n", (long long) (end - start));
     return 0;
 }
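
Example usage (illustrative, not part of the patch): expose a directory of control vectors under the option name "vectors", then hot-swap one at runtime through the endpoints added above. The model path, the vectors directory, the vector file name "happy.gguf", and the default port 8080 are placeholders/assumptions.

    # register ./my-vectors/ as an allowed source of control vectors under the name "vectors"
    ./server -m models/model.gguf --control-vector-option vectors ./my-vectors/

    # list the option names the server is willing to load
    curl http://localhost:8080/control-vector-options

    # load ./my-vectors/happy.gguf with strength 0.8 and apply it (layer range defaults to 1..n_layer)
    curl -X POST http://localhost:8080/control-vectors \
         -H "Content-Type: application/json" \
         -d '{"vectors": [{"fname": "vectors/happy", "strength": 0.8}]}'

    # read back the currently applied vectors and layer range
    curl http://localhost:8080/control-vectors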