llama : refactor src/llama.cpp (#10902)

* llama : scatter llama.cpp into multiple modules (wip)

* llama : control-vector -> adapter

* llama : arch

* llama : mmap

ggml-ci

* ci : remove BUILD_SHARED_LIBS=OFF

ggml-ci

* llama : arch (cont)

ggml-ci

* llama : chat

ggml-ci

* llama : model

ggml-ci

* llama : hparams

ggml-ci

* llama : adapter

ggml-ci

* examples : fix

ggml-ci

* rebase

ggml-ci

* minor

* llama : kv cache

ggml-ci

* llama : impl

ggml-ci

* llama : batch

ggml-ci

* cont

ggml-ci

* llama : context

ggml-ci

* minor

* llama : context (cont)

ggml-ci

* llama : model loader

ggml-ci

* common : update lora

ggml-ci

* llama : quant

ggml-ci

* llama : quant (cont)

ggml-ci

* minor [no ci]
commit f66f582927
parent 2f0ee84b9b
Author: Georgi Gerganov (committed by GitHub)
Date:   2025-01-03 10:18:53 +02:00
61 changed files with 12193 additions and 11649 deletions
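
Beyond splitting src/llama.cpp into modules, the server changes below remove the manual llama_free / llama_free_model calls: server_context now keeps the common_init_result objects (llama_init, llama_init_dft) alive as members, those objects own the model and context, and the raw model/ctx pointers become non-owning views obtained via .get(). A minimal, self-contained sketch of that ownership pattern follows; the stand-in structs and the llama_model_ptr / llama_context_ptr aliases are illustrative assumptions, not the actual llama.cpp definitions.

#include <memory>

// Illustrative stand-ins for the real llama.cpp types (assumption, not the real API).
struct llama_model   {};
struct llama_context {};

using llama_model_ptr   = std::unique_ptr<llama_model>;
using llama_context_ptr = std::unique_ptr<llama_context>;

// The init result owns the model and context; everything else holds non-owning views.
struct common_init_result {
    llama_model_ptr   model;
    llama_context_ptr context;
};

struct server_context {
    // keeping this member alive keeps model/ctx alive - no manual frees in the destructor
    common_init_result llama_init;

    llama_model   * model = nullptr; // non-owning view
    llama_context * ctx   = nullptr; // non-owning view

    bool load_model() {
        llama_init.model   = std::make_unique<llama_model>();
        llama_init.context = std::make_unique<llama_context>();

        model = llama_init.model.get();
        ctx   = llama_init.context.get();

        return model != nullptr && ctx != nullptr;
    }
};

int main() {
    server_context srv;
    // model and context are released automatically when srv goes out of scope
    return srv.load_model() ? 0 : 1;
}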

examples/server/server.cpp

@@ -98,7 +98,7 @@ struct slot_params {
     int64_t t_max_prompt_ms = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

-    std::vector<common_lora_adapter_container> lora;
+    std::vector<common_lora_adapter_info> lora;

     std::vector<std::string> antiprompt;
     std::vector<std::string> response_fields;
@@ -198,7 +198,7 @@ struct server_task {
     bool metrics_reset_bucket = false;

     // used by SERVER_TASK_TYPE_SET_LORA
-    std::vector<common_lora_adapter_container> set_lora;
+    std::vector<common_lora_adapter_info> set_lora;

     server_task(server_task_type type) : type(type) {}
@@ -206,7 +206,6 @@ struct server_task {
             const llama_model * model,
             const llama_context * ctx,
             const common_params & params_base,
-            const std::vector<common_lora_adapter_container> & lora_base,
             const json & data) {
         slot_params params;
@@ -265,12 +264,12 @@ struct server_task {
         if (data.contains("lora")) {
             if (data.at("lora").is_array()) {
-                params.lora = parse_lora_request(lora_base, data.at("lora"));
+                params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
             } else {
                 throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
             }
         } else {
-            params.lora = lora_base;
+            params.lora = params_base.lora_adapters;
         }

         // TODO: add more sanity checks for the input parameters
@@ -1132,7 +1131,7 @@ struct server_slot {
     common_speculative * spec = nullptr;

-    std::vector<common_lora_adapter_container> lora;
+    std::vector<common_lora_adapter_info> lora;

     // the index relative to completion multi-task request
     size_t index = 0;
@@ -1627,11 +1626,15 @@ struct server_response {
 struct server_context {
     common_params params_base;

+    // note: keep these alive - they determine the lifetime of the model, context, etc.
+    common_init_result llama_init;
+    common_init_result llama_init_dft;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;

-    std::vector<common_lora_adapter_container> lora;
-
     llama_model * model_dft = nullptr;

     llama_context_params cparams_dft;

     llama_batch batch = {};
@@ -1655,21 +1658,6 @@ struct server_context {
     float slot_prompt_similarity = 0.0f;

     ~server_context() {
-        if (ctx) {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-
-        if (model) {
-            llama_free_model(model);
-            model = nullptr;
-        }
-
-        if (model_dft) {
-            llama_free_model(model_dft);
-            model_dft = nullptr;
-        }
-
         // Clear any sampling context
         for (server_slot & slot : slots) {
             common_sampler_free(slot.smpl);
@@ -1692,11 +1680,10 @@ struct server_context {
         params_base = params;

-        common_init_result llama_init = common_init_from_params(params_base);
+        llama_init = common_init_from_params(params_base);

-        model = llama_init.model;
-        ctx = llama_init.context;
-        lora = llama_init.lora_adapters;
+        model = llama_init.model.get();
+        ctx = llama_init.context.get();

         if (model == nullptr) {
             SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
@@ -1719,25 +1706,22 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;

-            common_init_result llama_init_dft = common_init_from_params(params_dft);
+            llama_init_dft = common_init_from_params(params_dft);

-            model_dft = llama_init_dft.model;
+            model_dft = llama_init_dft.model.get();

             if (model_dft == nullptr) {
                 SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
                 return false;
             }

-            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
                 SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());

-                llama_free (llama_init_dft.context);
-                llama_free_model(llama_init_dft.model);
-
                 return false;
             }

-            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());

             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
@@ -1745,9 +1729,6 @@ struct server_context {
             // force F16 KV cache for the draft model for extra performance
             cparams_dft.type_k = GGML_TYPE_F16;
             cparams_dft.type_v = GGML_TYPE_F16;
-
-            // the context is not needed - we will create one for each slot
-            llama_free(llama_init_dft.context);
         }

         return true;
@@ -1898,7 +1879,7 @@ struct server_context {
         if (!are_lora_equal(task.params.lora, slot.lora)) {
             // if lora is changed, we cannot reuse cached tokens
             slot.cache_tokens.clear();
-            slot.lora = std::move(task.params.lora);
+            slot.lora = task.params.lora;
         }

         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
@@ -2592,7 +2573,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SET_LORA:
                 {
-                    lora = std::move(task.set_lora);
+                    params_base.lora_adapters = std::move(task.set_lora);
                     auto res = std::make_unique<server_task_result_apply_lora>();
                     res->id = task.id;
                     queue_results.send(std::move(res));
@@ -3671,7 +3652,6 @@ int main(int argc, char ** argv) {
                         ctx_server.model,
                         ctx_server.ctx,
                         ctx_server.params_base,
-                        ctx_server.lora,
                         data);
                 task.id_selected_slot = json_value(data, "id_slot", -1);
@@ -4098,8 +4078,9 @@ int main(int argc, char ** argv) {
     const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
         json result = json::array();
-        for (size_t i = 0; i < ctx_server.lora.size(); ++i) {
-            auto & lora = ctx_server.lora[i];
+        const auto & loras = ctx_server.params_base.lora_adapters;
+        for (size_t i = 0; i < loras.size(); ++i) {
+            auto & lora = loras[i];
             result.push_back({
                 {"id", i},
                 {"path", lora.path},
@@ -4118,7 +4099,7 @@ int main(int argc, char ** argv) {
         }

         server_task task(SERVER_TASK_TYPE_SET_LORA);
         task.id = ctx_server.queue_tasks.get_new_id();
-        task.set_lora = parse_lora_request(ctx_server.lora, body);
+        task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
         ctx_server.queue_results.add_waiting_task_id(task.id);
         ctx_server.queue_tasks.post(task);

examples/server/utils.hpp

@@ -799,25 +799,25 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx
 }

 static bool are_lora_equal(
-        const std::vector<common_lora_adapter_container> & l1,
-        const std::vector<common_lora_adapter_container> & l2) {
+        const std::vector<common_lora_adapter_info> & l1,
+        const std::vector<common_lora_adapter_info> & l2) {
     if (l1.size() != l2.size()) {
         return false;
     }

     for (size_t i = 0; i < l1.size(); ++i) {
         // we don't check lora.path to reduce the time complexity
-        if (l1[i].scale != l2[i].scale || l1[i].adapter != l2[i].adapter) {
+        if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) {
             return false;
         }
     }

     return true;
 }

-// parse lora config from JSON request, returned a copy of base_lora with updated scale
-static std::vector<common_lora_adapter_container> parse_lora_request(
-        const std::vector<common_lora_adapter_container> & base_lora,
+// parse lora config from JSON request, returned a copy of lora_base with updated scale
+static std::vector<common_lora_adapter_info> parse_lora_request(
+        const std::vector<common_lora_adapter_info> & lora_base,
         const json & data) {
-    std::vector<common_lora_adapter_container> lora(base_lora);
+    std::vector<common_lora_adapter_info> lora(lora_base);
     int max_idx = lora.size();

     // clear existing value