Moritz Raguschat 2024-04-08 10:51:45 +02:00 committed by GitHub
commit f8b8d2f44f
4 changed files with 254 additions and 37 deletions


@@ -2635,6 +2635,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
//
static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
auto start = ggml_time_ms();
printf("control vector load_one...\n");
int32_t n_tensors;
size_t n_bytes = 0;
@@ -2645,12 +2647,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
// calculate size of ctx needed for tensors, ensure tensors are f32, and find max layer
{
struct ggml_init_params meta_params = {
/* .mem_size = */ ggml_tensor_overhead() * 128 + ggml_graph_overhead(),
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ true,
};
ggml_context * meta_ctx = ggml_init(meta_params);
ggml_context * meta_ctx = nullptr;
struct gguf_init_params meta_gguf_params = {
/* .no_alloc = */ true,
/* .ctx = */ &meta_ctx,
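Note on this change: gguf_init_params carries both a no_alloc flag and a ggml_context ** out-pointer, and gguf_init_from_file allocates that metadata-only context itself, which is why the hand-sized ggml_init above can be dropped; meta_ctx starts out as nullptr and is still released with ggml_free after the gguf context is freed.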
@@ -2673,8 +2670,8 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
uint32_t layer = std::stoi(name.substr(dotpos + 1));
if (layer == 0) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
ggml_free(meta_ctx);
return result;
}
if (layer > max_direction_layer) {
@@ -2682,31 +2679,30 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
}
} catch (...) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
return result;
ggml_free(meta_ctx);
}
}
struct ggml_tensor * tensor_meta = ggml_get_tensor(meta_ctx, name.c_str());
if (tensor_meta->type != GGML_TYPE_F32 || ggml_n_dims(tensor_meta) != 1) {
fprintf(stderr, "%s: direction tensor invalid in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
ggml_free(meta_ctx);
return result;
}
if (result.n_embd == -1) {
result.n_embd = ggml_nelements(tensor_meta);
} else if (ggml_nelements(tensor_meta) != result.n_embd) {
fprintf(stderr, "%s: direction tensor sizes mismatched in %s\n", __func__, load_info.fname.c_str());
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
ggml_free(meta_ctx);
return result;
}
n_bytes += ggml_nbytes(tensor_meta);
}
ggml_free(meta_ctx);
gguf_free(meta_ctx_gguf);
ggml_free(meta_ctx);
}
if (n_tensors == 0) {
@@ -2715,13 +2711,7 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
}
// load and scale tensors into final control vector context
struct ggml_init_params ggml_params = {
/* .mem_size = */ ggml_tensor_overhead() * n_tensors + n_bytes,
/* .mem_buffer = */ nullptr,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(ggml_params);
struct ggml_context * ctx = nullptr;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx,
@@ -2754,10 +2744,17 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
}
}
gguf_free(ctx_gguf);
ggml_free(ctx);
auto end = ggml_time_ms();
printf("control vector load_one took %ums\n", end - start);
return result;
}
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
auto start = ggml_time_ms();
printf("control vector load...\n");
llama_control_vector_data result = { -1, {} };
for (const auto & info : load_infos) {
@@ -2767,7 +2764,7 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
return result;
}
if (result.n_embd != -1 && (result.n_embd != cur.n_embd || result.data.size() != cur.data.size())) {
fprintf(stderr, "%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
printf("%s: control vector in %s does not match previous vector dimensions\n", __func__, info.fname.c_str());
return result;
}
@@ -2781,8 +2778,10 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
}
if (result.n_embd == -1) {
fprintf(stderr, "%s: no vectors passed\n", __func__);
printf("%s: no vectors passed\n", __func__);
}
auto end = ggml_time_ms();
printf("control vector load time: %ums\n", end-start);
return result;
}


@@ -121,6 +121,8 @@ struct server_params {
std::vector<std::string> api_keys;
std::vector<llama_control_vector_load_option> control_vector_load_options;
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
std::string ssl_key_file = "";
std::string ssl_cert_file = "";
@@ -2226,6 +2228,12 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
printf(" set an alias for the model, will be added as `model` field in completion response\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
printf(" --control-vector FNAME\n");
printf(" add a control vector\n");
printf(" --control-vector-scaled FNAME S\n");
printf(" add a control vector with user defined scaling S\n");
printf(" --control-vector-layer-range START END\n");
printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n");
@@ -2711,6 +2719,58 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
break;
}
params.kv_overrides.push_back(kvo);
} else if (arg == "--control-vector") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.control_vectors.push_back({ 1.0f, argv[i], });
} else if (arg == "--control-vector-scaled") {
if (++i >= argc) {
invalid_param = true;
break;
}
const char* fname = argv[i];
if (++i >= argc) {
invalid_param = true;
break;
}
params.control_vectors.push_back({ std::stof(argv[i]), fname, });
} else if (arg == "--control-vector-layer-range") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.control_vector_layer_start = std::stoi(argv[i]);
if (++i >= argc) {
invalid_param = true;
break;
}
params.control_vector_layer_end = std::stoi(argv[i]);
} else if (arg == "--control-vector-option") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::string name = argv[i];
if (++i >= argc) {
invalid_param = true;
break;
}
std::string fname = argv[i];
size_t slen = fname.length();
bool is_dir = slen < 5 || strncmp(argv[i] + slen - 5, ".gguf", 5) != 0;
// Append path separator for dir names
if (is_dir && argv[i][slen - 1] != '/')
fname += '/';
if (is_dir && name[name.length() - 1] != '/')
name += '/';
sparams.control_vector_load_options.push_back({ name, fname, is_dir });
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams);
@@ -3159,6 +3219,133 @@ int main(int argc, char ** argv) {
res.status = 200; // HTTP OK
};
const auto handle_control_vector_options = [&sparams](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json options = json::array();
for (const auto & opt : sparams.control_vector_load_options) {
options.push_back(opt.name);
}
res.set_content(options.dump(), "application/json; charset=utf-8");
};
const auto handle_get_control_vectors = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json vectors = json::array();
for (const auto & vec : ctx_server.params.control_vectors) {
vectors.push_back(json {
{ "fname", vec.fname },
{ "strength", vec.strength }
});
}
json data = {
{ "vectors", vectors },
{ "layer_start", ctx_server.params.control_vector_layer_start },
{ "layer_end", ctx_server.params.control_vector_layer_end }
};
res.set_content(data.dump(), "application/json; charset=utf-8");
};
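With the hypothetical startup flags sketched above, these two GET handlers would answer roughly as follows (values illustrative):

    GET /control-vector-options
    ["moods/"]

    GET /control-vectors
    {
      "vectors": [
        { "fname": "vectors/calm.gguf",  "strength": 1.0 },
        { "fname": "vectors/happy.gguf", "strength": 0.5 }
      ],
      "layer_start": 10,
      "layer_end": 30
    }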
const auto handle_set_control_vectors = [&ctx_server, &sparams, &res_error, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) {
json data = json::parse(req.body);
// vector parameters passed by user
std::vector<llama_control_vector_load_info> vec_params;
// names translated to real file names
std::vector<llama_control_vector_load_info> real_vec_params;
if (data.contains("vectors") && data["vectors"].is_array()) {
for (const auto &item : data["vectors"]) {
llama_control_vector_load_info v = item.get<llama_control_vector_load_info>();
std::string real_fname = "";
std::cout << "Check vec " << v.fname << "\n";
// check for path traversal attempt
if (v.fname.length() > 0 && v.fname[0] != '/' && v.fname[0] != '\\') {
if (v.fname.find("../") == -1 && v.fname.find("..\\") == -1 &&
v.fname.find("/..") == -1 && v.fname.find("\\..") == -1) {
// check if vector name matches allowed names
for (auto opt : sparams.control_vector_load_options) {
std::cout << "check option " << opt.name << " : " << opt.fname << " : " << opt.is_dir << "\n";
if (!opt.is_dir && opt.name == v.fname) {
std::cout << "file exact match\n";
real_fname = opt.fname;
break;
}
if (opt.is_dir && v.fname.rfind(opt.name, 0) == 0) {
std::cout << "file exact match\n";
real_fname = opt.fname + v.fname.substr(opt.name.length());
#if defined(_WIN32)
std::replace(real_fname.begin(), real_fname.end(), '/', '\\');
#endif
size_t len = real_fname.length();
if (len < 5 || real_fname.compare(len - 5, 5, ".gguf") != 0)
real_fname += ".gguf";
break;
}
}
}
}
if (real_fname.length() == 0) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
res_error(res, format_error_response("Control vector not allowed", ERROR_TYPE_SERVER));
return;
}
std::cout << "Add vector: " << v.fname << " -> " << real_fname << " " << v.strength << "\n";
llama_control_vector_load_info real_info = { v.strength, real_fname };
vec_params.push_back(v);
real_vec_params.push_back(real_info);
}
} else {
std::cerr << "No vectors array passed\n";
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
res_error(res, format_error_response("No vectors array passed. If you want reset to 0, send an empty array.", ERROR_TYPE_SERVER));
return;
}
const auto cvec = llama_control_vector_load(real_vec_params);
if (cvec.n_embd == -1) {
std::cerr << "Could not load control vector\n";
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
res_error(res, format_error_response("Could not load control vector", ERROR_TYPE_SERVER));
return;
}
if (ctx_server.params.control_vector_layer_start <= 0) {
ctx_server.params.control_vector_layer_start = 1;
}
if (ctx_server.params.control_vector_layer_end <= 0){
ctx_server.params.control_vector_layer_end = llama_n_layer(ctx_server.model);
}
int err = llama_control_vector_apply(ctx_server.ctx,
cvec.data.data(),
cvec.data.size(),
cvec.n_embd,
ctx_server.params.control_vector_layer_start,
ctx_server.params.control_vector_layer_end);
if (err) {
std::cerr << "Could not apply control vector\n";
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
res_error(res, format_error_response("Could not apply control vector", ERROR_TYPE_SERVER));
return;
}
ctx_server.params.control_vectors.clear();
for (auto v : vec_params) {
std::cout << "set vector param: " << v.fname << " " << v.strength << "\n";
ctx_server.params.control_vectors.push_back(v);
}
handle_get_control_vectors(req, res);
};
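A sketch of the request this handler accepts; fname must match one of the --control-vector-option entries, either exactly or as a directory prefix, and all values here are illustrative:

    POST /control-vectors
    { "vectors": [ { "fname": "moods/happy", "strength": 0.8 } ] }

The handler resolves each fname to a real .gguf path, reloads and re-applies the blended vector over the configured layer range, stores the requested parameters back into params.control_vectors, and answers with the same payload as GET /control-vectors.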
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = {
@@ -3505,22 +3692,25 @@ int main(int argc, char ** argv) {
json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8"));
// register API routes
svr->Get ("/health", handle_health);
svr->Get ("/slots", handle_slots);
svr->Get ("/metrics", handle_metrics);
svr->Get ("/props", handle_props);
svr->Get ("/v1/models", handle_models);
svr->Post("/completion", handle_completions); // legacy
svr->Post("/completions", handle_completions);
svr->Post("/v1/completions", handle_completions);
svr->Post("/chat/completions", handle_chat_completions);
svr->Post("/v1/chat/completions", handle_chat_completions);
svr->Post("/infill", handle_infill);
svr->Post("/embedding", handle_embeddings); // legacy
svr->Post("/embeddings", handle_embeddings);
svr->Post("/v1/embeddings", handle_embeddings);
svr->Post("/tokenize", handle_tokenize);
svr->Post("/detokenize", handle_detokenize);
svr->Get ("/health", handle_health);
svr->Get ("/slots", handle_slots);
svr->Get ("/metrics", handle_metrics);
svr->Get ("/props", handle_props);
svr->Get ("/v1/models", handle_models);
svr->Get ("/control-vectors", handle_get_control_vectors);
svr->Get ("/control-vector-options", handle_control_vector_options);
svr->Post("/control-vectors", handle_set_control_vectors);
svr->Post("/completion", handle_completions); // legacy
svr->Post("/completions", handle_completions);
svr->Post("/v1/completions", handle_completions);
svr->Post("/chat/completions", handle_chat_completions);
svr->Post("/v1/chat/completions", handle_chat_completions);
svr->Post("/infill", handle_infill);
svr->Post("/embedding", handle_embeddings); // legacy
svr->Post("/embeddings", handle_embeddings);
svr->Post("/v1/embeddings", handle_embeddings);
svr->Post("/tokenize", handle_tokenize);
svr->Post("/detokenize", handle_detokenize);
//
// Start the server


@@ -655,3 +655,14 @@ static json format_error_response(const std::string & message, const enum error_
{"type", type_str},
};
}
static void from_json(const json& j, llama_control_vector_load_info& l) {
j.at("strength").get_to(l.strength);
j.at("fname").get_to(l.fname);
}
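This free from_json overload is what lets the server handler call item.get<llama_control_vector_load_info>(); nlohmann::json finds it through argument-dependent lookup. A minimal, illustrative use:

    llama_control_vector_load_info v =
        json::parse(R"({ "fname": "moods/happy", "strength": 0.8 })")
            .get<llama_control_vector_load_info>();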
struct llama_control_vector_load_option {
std::string name;
std::string fname;
bool is_dir;
};


@@ -14668,6 +14668,8 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
}
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
auto start = ggml_time_ms();
fprintf(stderr, "control vector init...\n");
GGML_ASSERT(cvec.tensors.empty());
GGML_ASSERT(cvec.ctxs.empty());
GGML_ASSERT(cvec.bufs.empty());
@@ -14690,6 +14692,9 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
ggml_context * ctx = ggml_init(params);
if (!ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
auto end = ggml_time_ms();
fprintf(stderr, "control vector init took %ums\n", end - start);
return true;
return 1;
}
ctx_map[it.first] = ctx;
@@ -14710,6 +14715,9 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
if (!buf) {
LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
auto end = ggml_time_ms();
fprintf(stderr, "control vector init took %ums\n", end - start);
return true;
return false;
}
ggml_backend_buffer_clear(buf, 0);
@@ -14717,10 +14725,14 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
cvec.bufs.push_back(buf);
}
auto end = ggml_time_ms();
fprintf(stderr, "control vector init took %ums\n", end - start);
return true;
}
int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
auto start = ggml_time_ms();
printf("control vector apply...\n");
const llama_model & model = lctx->model;
llama_control_vector & cvec = lctx->cvec;
@@ -14728,6 +14740,8 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da
// disable the current control vector (but leave allocated for later)
cvec.layer_start = -1;
cvec.layer_end = -1;
auto end = ggml_time_ms();
printf("control vector apply took %ums\n", end - start);
return 0;
}
@@ -14738,6 +14752,7 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da
if (cvec.tensors.empty()) {
if (!llama_control_vector_init(cvec, model)) {
LLAMA_LOG_ERROR("%s: control vector init failed\n", __func__);
return 1;
}
}
@@ -14754,6 +14769,8 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da
}
}
auto end = ggml_time_ms();
printf("control vector apply took %ums\n", end - start);
return 0;
}