clean up failed attempt at implementing control-vector hot-swapping

This commit is contained in:
trollkotze 2024-03-25 17:04:31 +01:00
parent 92070cab2a
commit f0722b1352
3 changed files with 8 additions and 92 deletions

View file

@ -3176,84 +3176,6 @@ int main(int argc, char ** argv) {
res.status = 200; // HTTP OK res.status = 200; // HTTP OK
}; };
// GET /control-vectors: report the control vectors currently recorded in the server
// params, together with the layer range stored alongside them.
// Response body (JSON):
//   { "vectors": [ { "fname": ..., "strength": ... }, ... ], "layer_start": ..., "layer_end": ... }
const auto handle_get_control_vectors = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
    const auto & params = ctx_server.params;

    // One { fname, strength } object per recorded control vector, in stored order.
    json loaded = json::array();
    for (const auto & info : params.control_vectors) {
        json entry = {
            { "fname",    info.fname    },
            { "strength", info.strength }
        };
        loaded.push_back(entry);
    }

    const json body = {
        { "vectors",     loaded                            },
        { "layer_start", params.control_vector_layer_start },
        { "layer_end",   params.control_vector_layer_end   }
    };
    res.set_content(body.dump(), "application/json; charset=utf-8");
};
// POST /control-vectors: replace the active control vectors at runtime.
//
// Expected request body (JSON):
//   { "vectors": [ { "fname": <string>, "strength": <number> }, ... ] }
//
// On success the combined vector is applied to the context, the new vector list and the
// effective layer range are recorded in the server params, and the response is the same
// as for GET /control-vectors. On any failure an error response is sent and the recorded
// server params are left untouched.
//
// NOTE(review): this calls llama_control_vector_apply on ctx_server.ctx directly from the
// HTTP handler; presumably that can race with decoding running elsewhere - confirm before
// relying on this endpoint.
const auto handle_set_control_vectors = [&ctx_server, &res_error, &handle_get_control_vectors](const httplib::Request & req, httplib::Response & res) {
    res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));

    std::vector<llama_control_vector_load_info> vec_params;
    try {
        const json data = json::parse(req.body);
        if (!data.contains("vectors") || !data.at("vectors").is_array()) {
            std::cerr << "No vectors passed\n";
            res_error(res, format_error_response("No vectors passed", ERROR_TYPE_SERVER));
            return;
        }
        for (const auto & item : data.at("vectors")) {
            auto v = item.get<llama_control_vector_load_info>();
            std::cout << "Add vector: " << v.fname << " " << v.strength << "\n";
            vec_params.push_back(v);
        }
    } catch (const json::exception & e) {
        // Malformed JSON, or an entry without the required "fname"/"strength" keys:
        // answer with an error instead of letting the exception escape the handler.
        const std::string msg = std::string("Invalid control vector request: ") + e.what();
        std::cerr << msg << "\n";
        res_error(res, format_error_response(msg, ERROR_TYPE_SERVER));
        return;
    }

    const auto cvec = llama_control_vector_load(vec_params);
    if (cvec.n_embd == -1) {
        std::cerr << "Could not load control vector\n";
        res_error(res, format_error_response("Could not load control vector", ERROR_TYPE_SERVER));
        return;
    }

    // Non-positive bounds mean "unset": default to the full range [1, n_layer].
    // Work on local copies so the server params only change once the apply succeeded.
    const int32_t layer_start = ctx_server.params.control_vector_layer_start <= 0
        ? 1
        : ctx_server.params.control_vector_layer_start;
    const int32_t layer_end = ctx_server.params.control_vector_layer_end <= 0
        ? llama_n_layer(ctx_server.model)
        : ctx_server.params.control_vector_layer_end;

    const int err = llama_control_vector_apply(ctx_server.ctx,
                                               cvec.data.data(),
                                               cvec.data.size(),
                                               cvec.n_embd,
                                               layer_start,
                                               layer_end);
    if (err) {
        std::cerr << "Could not apply control vector\n";
        res_error(res, format_error_response("Could not apply control vector", ERROR_TYPE_SERVER));
        return;
    }

    // Commit the new state, then answer with the same payload as the GET endpoint.
    ctx_server.params.control_vector_layer_start = layer_start;
    ctx_server.params.control_vector_layer_end   = layer_end;
    ctx_server.params.control_vectors            = vec_params;

    handle_get_control_vectors(req, res);
};
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) { const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = { json data = {
@ -3603,10 +3525,8 @@ int main(int argc, char ** argv) {
svr->Get ("/health", handle_health); svr->Get ("/health", handle_health);
svr->Get ("/slots", handle_slots); svr->Get ("/slots", handle_slots);
svr->Get ("/metrics", handle_metrics); svr->Get ("/metrics", handle_metrics);
svr->Get ("/control-vectors", handle_get_control_vectors);
svr->Get ("/props", handle_props); svr->Get ("/props", handle_props);
svr->Get ("/v1/models", handle_models); svr->Get ("/v1/models", handle_models);
svr->Post("/control-vectors", handle_set_control_vectors);
svr->Post("/completion", handle_completions); // legacy svr->Post("/completion", handle_completions); // legacy
svr->Post("/completions", handle_completions); svr->Post("/completions", handle_completions);
svr->Post("/v1/completions", handle_completions); svr->Post("/v1/completions", handle_completions);
@ -3681,3 +3601,4 @@ int main(int argc, char ** argv) {
return 0; return 0;
} }

View file

@ -615,8 +615,3 @@ static json format_error_response(const std::string & message, const enum error_
{"type", type_str}, {"type", type_str},
}; };
} }
// JSON -> llama_control_vector_load_info conversion hook, used by calls such as
// `item.get<llama_control_vector_load_info>()`.
// Both "strength" and "fname" are required: json::at() throws when a key is missing.
// Declared static, like format_error_response in this file, so the function has internal
// linkage and cannot clash with another definition at link time.
static void from_json(const json& j, llama_control_vector_load_info& l) {
    j.at("strength").get_to(l.strength);
    j.at("fname").get_to(l.fname);
}

View file

@ -1950,7 +1950,6 @@ struct llama_control_vector {
} }
~llama_control_vector() { ~llama_control_vector() {
LLAMA_LOG_ERROR("Kill the control vector\n");
for (struct ggml_context * ctx : ctxs) { for (struct ggml_context * ctx : ctxs) {
ggml_free(ctx); ggml_free(ctx);
} }
@ -13995,9 +13994,9 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
} }
static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) { static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
cvec.tensors.clear(); GGML_ASSERT(cvec.tensors.empty());
cvec.ctxs.clear(); GGML_ASSERT(cvec.ctxs.empty());
cvec.bufs.clear(); GGML_ASSERT(cvec.bufs.empty());
// count layer buffer types // count layer buffer types
std::map<ggml_backend_buffer_type_t, int> buft_layer_count; std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
@ -14063,9 +14062,10 @@ int32_t llama_control_vector_apply(struct llama_context * lctx, const float * da
return 1; return 1;
} }
if (!llama_control_vector_init(cvec, model)) { if (cvec.tensors.empty()) {
LLAMA_LOG_ERROR("%s: FUCKING BITCH\n", __func__); if (!llama_control_vector_init(cvec, model)) {
return 1; return 1;
}
} }
cvec.layer_start = il_start; cvec.layer_start = il_start;