llama : refactor src/llama.cpp (#10902)
* llama : scatter llama.cpp into multiple modules (wip)
* llama : control-vector -> adapter
* llama : arch
* llama : mmap ggml-ci
* ci : remove BUILD_SHARED_LIBS=OFF ggml-ci
* llama : arch (cont) ggml-ci
* llama : chat ggml-ci
* llama : model ggml-ci
* llama : hparams ggml-ci
* llama : adapter ggml-ci
* examples : fix ggml-ci
* rebase ggml-ci
* minor
* llama : kv cache ggml-ci
* llama : impl ggml-ci
* llama : batch ggml-ci
* cont ggml-ci
* llama : context ggml-ci
* minor
* llama : context (cont) ggml-ci
* llama : model loader ggml-ci
* common : update lora ggml-ci
* llama : quant ggml-ci
* llama : quant (cont) ggml-ci
* minor [no ci]
Parent: 2f0ee84b9b
Commit: f66f582927

61 changed files with 12193 additions and 11649 deletions
```diff
@@ -98,7 +98,7 @@ struct slot_params {
     int64_t t_max_prompt_ms  = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
 
-    std::vector<common_lora_adapter_container> lora;
+    std::vector<common_lora_adapter_info> lora;
 
     std::vector<std::string> antiprompt;
     std::vector<std::string> response_fields;
```
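The rename from `common_lora_adapter_container` to `common_lora_adapter_info` runs through the rest of the patch. The new struct's definition is not part of this diff; a minimal sketch inferred from the members the server code touches (`path`, `scale`, `ptr`) might look like this, with the authoritative version living in llama.cpp's common headers:

```cpp
#include <string>

struct llama_lora_adapter; // opaque handle from llama.h

// Sketch only - fields inferred from this patch; the real struct may differ.
struct common_lora_adapter_info {
    std::string path;         // adapter file path, echoed by the list endpoint below
    float       scale;        // blending strength; compared in are_lora_equal()
    llama_lora_adapter * ptr; // loaded adapter, compared by pointer identity
};
```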
```diff
@@ -198,7 +198,7 @@ struct server_task {
     bool metrics_reset_bucket = false;
 
     // used by SERVER_TASK_TYPE_SET_LORA
-    std::vector<common_lora_adapter_container> set_lora;
+    std::vector<common_lora_adapter_info> set_lora;
 
     server_task(server_task_type type) : type(type) {}
 
```
```diff
@@ -206,7 +206,6 @@ struct server_task {
             const llama_model * model,
             const llama_context * ctx,
             const common_params & params_base,
-            const std::vector<common_lora_adapter_container> & lora_base,
             const json & data) {
         slot_params params;
 
```
```diff
@@ -265,12 +264,12 @@ struct server_task {
 
         if (data.contains("lora")) {
             if (data.at("lora").is_array()) {
-                params.lora = parse_lora_request(lora_base, data.at("lora"));
+                params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
             } else {
                 throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
             }
         } else {
-            params.lora = lora_base;
+            params.lora = params_base.lora_adapters;
         }
 
         // TODO: add more sanity checks for the input parameters
```
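Per the error message above, a request's `lora` field must be an array of `{id, scale}` objects. A hypothetical request fragment (prompt and values invented for illustration), built with the same nlohmann JSON type the server aliases as `json`:

```cpp
#include <nlohmann/json.hpp>
using json = nlohmann::ordered_json; // the server uses a similar alias

// "lora" is an array of {id, scale} objects, per the check above.
json data = {
    {"prompt", "Hello"},
    {"lora",   json::array({
        { {"id", 0}, {"scale", 0.5} },  // enable adapter 0 at half strength
        { {"id", 1}, {"scale", 0.0} },  // explicitly disable adapter 1
    })},
};
```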
```diff
@@ -1132,7 +1131,7 @@ struct server_slot {
 
     common_speculative * spec = nullptr;
 
-    std::vector<common_lora_adapter_container> lora;
+    std::vector<common_lora_adapter_info> lora;
 
     // the index relative to completion multi-task request
     size_t index = 0;
```
```diff
@@ -1627,11 +1626,15 @@ struct server_response {
 struct server_context {
     common_params params_base;
 
+    // note: keep these alive - they determine the lifetime of the model, context, etc.
+    common_init_result llama_init;
+    common_init_result llama_init_dft;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
-    std::vector<common_lora_adapter_container> lora;
 
     llama_model * model_dft = nullptr;
 
     llama_context_params cparams_dft;
 
     llama_batch batch = {};
```
```diff
@@ -1655,21 +1658,6 @@ struct server_context {
     float slot_prompt_similarity = 0.0f;
 
     ~server_context() {
-        if (ctx) {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-
-        if (model) {
-            llama_free_model(model);
-            model = nullptr;
-        }
-
-        if (model_dft) {
-            llama_free_model(model_dft);
-            model_dft = nullptr;
-        }
-
         // Clear any sampling context
         for (server_slot & slot : slots) {
             common_sampler_free(slot.smpl);
```
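The manual frees can go because ownership moved: the `.get()` calls in the next hunk imply that `common_init_result` now holds the model and context behind smart pointers, so both are released automatically when the `llama_init` / `llama_init_dft` members added to `server_context` above are destroyed. A sketch of the assumed ownership, not the actual header:

```cpp
#include <memory>
#include "llama.h" // provides llama_free() and llama_free_model()

// Sketch under the assumption implied by the .get() calls; the real
// common_init_result may use dedicated pointer aliases or deleter structs.
struct common_init_result {
    std::unique_ptr<llama_model,   decltype(&llama_free_model)> model   {nullptr, &llama_free_model};
    std::unique_ptr<llama_context, decltype(&llama_free)>       context {nullptr, &llama_free};
};
```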
```diff
@@ -1692,11 +1680,10 @@ struct server_context {
 
         params_base = params;
 
-        common_init_result llama_init = common_init_from_params(params_base);
+        llama_init = common_init_from_params(params_base);
 
-        model = llama_init.model;
-        ctx   = llama_init.context;
-        lora  = llama_init.lora_adapters;
+        model = llama_init.model.get();
+        ctx   = llama_init.context.get();
 
         if (model == nullptr) {
             SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
```
```diff
@@ -1719,25 +1706,22 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel   = 1;
 
-            common_init_result llama_init_dft = common_init_from_params(params_dft);
+            llama_init_dft = common_init_from_params(params_dft);
 
-            model_dft = llama_init_dft.model;
+            model_dft = llama_init_dft.model.get();
 
             if (model_dft == nullptr) {
                 SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
                 return false;
             }
 
-            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
                 SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
 
-                llama_free      (llama_init_dft.context);
-                llama_free_model(llama_init_dft.model);
-
                 return false;
             }
 
-            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
 
             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
```
```diff
@@ -1745,9 +1729,6 @@ struct server_context {
             // force F16 KV cache for the draft model for extra performance
             cparams_dft.type_k = GGML_TYPE_F16;
             cparams_dft.type_v = GGML_TYPE_F16;
-
-            // the context is not needed - we will create one for each slot
-            llama_free(llama_init_dft.context);
         }
 
         return true;
```
```diff
@@ -1898,7 +1879,7 @@ struct server_context {
         if (!are_lora_equal(task.params.lora, slot.lora)) {
             // if lora is changed, we cannot reuse cached tokens
             slot.cache_tokens.clear();
-            slot.lora = std::move(task.params.lora);
+            slot.lora = task.params.lora;
         }
 
         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
```
```diff
@@ -2592,7 +2573,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SET_LORA:
                 {
-                    lora = std::move(task.set_lora);
+                    params_base.lora_adapters = std::move(task.set_lora);
                     auto res = std::make_unique<server_task_result_apply_lora>();
                     res->id = task.id;
                     queue_results.send(std::move(res));
```
```diff
@@ -3671,7 +3652,6 @@ int main(int argc, char ** argv) {
                 ctx_server.model,
                 ctx_server.ctx,
                 ctx_server.params_base,
-                ctx_server.lora,
                 data);
             task.id_selected_slot = json_value(data, "id_slot", -1);
 
```
```diff
@@ -4098,8 +4078,9 @@ int main(int argc, char ** argv) {
 
     const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
         json result = json::array();
-        for (size_t i = 0; i < ctx_server.lora.size(); ++i) {
-            auto & lora = ctx_server.lora[i];
+        const auto & loras = ctx_server.params_base.lora_adapters;
+        for (size_t i = 0; i < loras.size(); ++i) {
+            auto & lora = loras[i];
             result.push_back({
                 {"id", i},
                 {"path", lora.path},
```
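For illustration, the list handler above produces a JSON array with one entry per configured adapter. Only the `id` and `path` fields are visible in this hunk (the rest of the `push_back` is truncated), and the paths here are invented:

```cpp
// Hypothetical output shape of handle_lora_adapters_list for two adapters;
// fields beyond "id" and "path" are cut off in the diff above.
json expected = json::array({
    { {"id", 0}, {"path", "adapter-a.gguf"} /* ... */ },
    { {"id", 1}, {"path", "adapter-b.gguf"} /* ... */ },
});
```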
```diff
@@ -4118,7 +4099,7 @@ int main(int argc, char ** argv) {
         }
         server_task task(SERVER_TASK_TYPE_SET_LORA);
         task.id = ctx_server.queue_tasks.get_new_id();
-        task.set_lora = parse_lora_request(ctx_server.lora, body);
+        task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
         ctx_server.queue_results.add_waiting_task_id(task.id);
         ctx_server.queue_tasks.post(task);
 
```
```diff
@@ -799,25 +799,25 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx
 }
 
 static bool are_lora_equal(
-        const std::vector<common_lora_adapter_container> & l1,
-        const std::vector<common_lora_adapter_container> & l2) {
+        const std::vector<common_lora_adapter_info> & l1,
+        const std::vector<common_lora_adapter_info> & l2) {
     if (l1.size() != l2.size()) {
         return false;
     }
     for (size_t i = 0; i < l1.size(); ++i) {
         // we don't check lora.path to reduce the time complexity
-        if (l1[i].scale != l2[i].scale || l1[i].adapter != l2[i].adapter) {
+        if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) {
             return false;
         }
     }
     return true;
 }
 
-// parse lora config from JSON request, returned a copy of base_lora with updated scale
-static std::vector<common_lora_adapter_container> parse_lora_request(
-        const std::vector<common_lora_adapter_container> & base_lora,
+// parse lora config from JSON request, returned a copy of lora_base with updated scale
+static std::vector<common_lora_adapter_info> parse_lora_request(
+        const std::vector<common_lora_adapter_info> & lora_base,
         const json & data) {
-    std::vector<common_lora_adapter_container> lora(base_lora);
+    std::vector<common_lora_adapter_info> lora(lora_base);
     int max_idx = lora.size();
 
     // clear existing value
```
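The hunk cuts off at `// clear existing value`. Based on that comment, the function's documented contract (return a copy of `lora_base` with updated scales), and the `{id, scale}` request format enforced earlier, the remainder plausibly resets every scale and then applies the requested ones. A guess at the shape, not the committed code:

```cpp
// Speculative continuation of parse_lora_request - NOT from the diff.
// Clear existing values, then set scales from the {id, scale} entries,
// rejecting out-of-range ids.
for (auto & entry : lora) {
    entry.scale = 0.0f;
}
for (const auto & item : data) {
    const int   id    = item.at("id").get<int>();
    const float scale = item.at("scale").get<float>();
    if (id < 0 || id >= max_idx) {
        throw std::runtime_error("invalid adapter id");
    }
    lora[id].scale = scale;
}
return lora;
```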