llama : refactor src/llama.cpp (#10902)

* llama : scatter llama.cpp into multiple modules (wip)

* llama : control-vector -> adapter

* llama : arch

* llama : mmap

ggml-ci

* ci : remove BUILD_SHARED_LIBS=OFF

ggml-ci

* llama : arch (cont)

ggml-ci

* llama : chat

ggml-ci

* llama : model

ggml-ci

* llama : hparams

ggml-ci

* llama : adapter

ggml-ci

* examples : fix

ggml-ci

* rebase

ggml-ci

* minor

* llama : kv cache

ggml-ci

* llama : impl

ggml-ci

* llama : batch

ggml-ci

* cont

ggml-ci

* llama : context

ggml-ci

* minor

* llama : context (cont)

ggml-ci

* llama : model loader

ggml-ci

* common : update lora

ggml-ci

* llama : quant

ggml-ci

* llama : quant (cont)

ggml-ci

* minor [no ci]
commit f66f582927
parent 2f0ee84b9b
Author: Georgi Gerganov (committed by GitHub)
Date:   2025-01-03 10:18:53 +02:00
61 changed files with 12193 additions and 11649 deletions
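
Beyond splitting src/llama.cpp into modules, the server changes below remove the manual llama_free / llama_free_model calls: server_context now keeps the common_init_result objects (llama_init, llama_init_dft) alive as members, those objects own the model and context, and the raw model/ctx pointers become non-owning views obtained via .get(). A minimal, self-contained sketch of that ownership pattern follows; the stand-in structs and the llama_model_ptr / llama_context_ptr aliases are illustrative assumptions, not the actual llama.cpp definitions.

#include <memory>

// Illustrative stand-ins for the real llama.cpp types (assumption, not the real API).
struct llama_model   {};
struct llama_context {};

using llama_model_ptr   = std::unique_ptr<llama_model>;
using llama_context_ptr = std::unique_ptr<llama_context>;

// The init result owns the model and context; everything else holds non-owning views.
struct common_init_result {
    llama_model_ptr   model;
    llama_context_ptr context;
};

struct server_context {
    // keeping this member alive keeps model/ctx alive - no manual frees in the destructor
    common_init_result llama_init;

    llama_model   * model = nullptr; // non-owning view
    llama_context * ctx   = nullptr; // non-owning view

    bool load_model() {
        llama_init.model   = std::make_unique<llama_model>();
        llama_init.context = std::make_unique<llama_context>();

        model = llama_init.model.get();
        ctx   = llama_init.context.get();

        return model != nullptr && ctx != nullptr;
    }
};

int main() {
    server_context srv;
    // model and context are released automatically when srv goes out of scope
    return srv.load_model() ? 0 : 1;
}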

examples/server/server.cpp

@@ -98,7 +98,7 @@ struct slot_params {
     int64_t t_max_prompt_ms = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

-    std::vector<common_lora_adapter_container> lora;
+    std::vector<common_lora_adapter_info> lora;

     std::vector<std::string> antiprompt;
     std::vector<std::string> response_fields;
@@ -198,7 +198,7 @@ struct server_task {
     bool metrics_reset_bucket = false;

     // used by SERVER_TASK_TYPE_SET_LORA
-    std::vector<common_lora_adapter_container> set_lora;
+    std::vector<common_lora_adapter_info> set_lora;

     server_task(server_task_type type) : type(type) {}
@@ -206,7 +206,6 @@ struct server_task {
             const llama_model * model,
             const llama_context * ctx,
             const common_params & params_base,
-            const std::vector<common_lora_adapter_container> & lora_base,
             const json & data) {
         slot_params params;
@@ -265,12 +264,12 @@ struct server_task {
         if (data.contains("lora")) {
             if (data.at("lora").is_array()) {
-                params.lora = parse_lora_request(lora_base, data.at("lora"));
+                params.lora = parse_lora_request(params_base.lora_adapters, data.at("lora"));
             } else {
                 throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields");
             }
         } else {
-            params.lora = lora_base;
+            params.lora = params_base.lora_adapters;
         }

         // TODO: add more sanity checks for the input parameters
@@ -1132,7 +1131,7 @@ struct server_slot {
     common_speculative * spec = nullptr;

-    std::vector<common_lora_adapter_container> lora;
+    std::vector<common_lora_adapter_info> lora;

     // the index relative to completion multi-task request
     size_t index = 0;
@@ -1627,11 +1626,15 @@ struct server_response {
 struct server_context {
     common_params params_base;

+    // note: keep these alive - they determine the lifetime of the model, context, etc.
+    common_init_result llama_init;
+    common_init_result llama_init_dft;
+
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;

-    std::vector<common_lora_adapter_container> lora;
-
     llama_model * model_dft = nullptr;

     llama_context_params cparams_dft;

     llama_batch batch = {};
@@ -1655,21 +1658,6 @@ struct server_context {
     float slot_prompt_similarity = 0.0f;

     ~server_context() {
-        if (ctx) {
-            llama_free(ctx);
-            ctx = nullptr;
-        }
-
-        if (model) {
-            llama_free_model(model);
-            model = nullptr;
-        }
-
-        if (model_dft) {
-            llama_free_model(model_dft);
-            model_dft = nullptr;
-        }
-
         // Clear any sampling context
         for (server_slot & slot : slots) {
             common_sampler_free(slot.smpl);
@@ -1692,11 +1680,10 @@ struct server_context {
         params_base = params;

-        common_init_result llama_init = common_init_from_params(params_base);
+        llama_init = common_init_from_params(params_base);

-        model = llama_init.model;
-        ctx = llama_init.context;
-        lora = llama_init.lora_adapters;
+        model = llama_init.model.get();
+        ctx = llama_init.context.get();

         if (model == nullptr) {
             SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
@@ -1719,25 +1706,22 @@ struct server_context {
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel = 1;

-            common_init_result llama_init_dft = common_init_from_params(params_dft);
+            llama_init_dft = common_init_from_params(params_dft);

-            model_dft = llama_init_dft.model;
+            model_dft = llama_init_dft.model.get();

             if (model_dft == nullptr) {
                 SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
                 return false;
             }

-            if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
+            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
                 SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());

-                llama_free (llama_init_dft.context);
-                llama_free_model(llama_init_dft.model);
-
                 return false;
             }

-            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());

             cparams_dft = common_context_params_to_llama(params_dft);
             cparams_dft.n_batch = n_ctx_dft;
@@ -1745,9 +1729,6 @@ struct server_context {
             // force F16 KV cache for the draft model for extra performance
             cparams_dft.type_k = GGML_TYPE_F16;
             cparams_dft.type_v = GGML_TYPE_F16;
-
-            // the context is not needed - we will create one for each slot
-            llama_free(llama_init_dft.context);
         }

         return true;
@@ -1898,7 +1879,7 @@ struct server_context {
         if (!are_lora_equal(task.params.lora, slot.lora)) {
             // if lora is changed, we cannot reuse cached tokens
             slot.cache_tokens.clear();
-            slot.lora = std::move(task.params.lora);
+            slot.lora = task.params.lora;
         }

         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
@@ -2592,7 +2573,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SET_LORA:
                 {
-                    lora = std::move(task.set_lora);
+                    params_base.lora_adapters = std::move(task.set_lora);
                     auto res = std::make_unique<server_task_result_apply_lora>();
                     res->id = task.id;
                     queue_results.send(std::move(res));
@@ -3671,7 +3652,6 @@ int main(int argc, char ** argv) {
                         ctx_server.model,
                         ctx_server.ctx,
                         ctx_server.params_base,
-                        ctx_server.lora,
                         data);
                 task.id_selected_slot = json_value(data, "id_slot", -1);
@@ -4098,8 +4078,9 @@ int main(int argc, char ** argv) {
     const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) {
         json result = json::array();
-        for (size_t i = 0; i < ctx_server.lora.size(); ++i) {
-            auto & lora = ctx_server.lora[i];
+        const auto & loras = ctx_server.params_base.lora_adapters;
+        for (size_t i = 0; i < loras.size(); ++i) {
+            auto & lora = loras[i];
             result.push_back({
                 {"id", i},
                 {"path", lora.path},
@@ -4118,7 +4099,7 @@ int main(int argc, char ** argv) {
         }

         server_task task(SERVER_TASK_TYPE_SET_LORA);
         task.id = ctx_server.queue_tasks.get_new_id();
-        task.set_lora = parse_lora_request(ctx_server.lora, body);
+        task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body);
         ctx_server.queue_results.add_waiting_task_id(task.id);
         ctx_server.queue_tasks.post(task);

examples/server/utils.hpp

@@ -799,25 +799,25 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx
 }

 static bool are_lora_equal(
-        const std::vector<common_lora_adapter_container> & l1,
-        const std::vector<common_lora_adapter_container> & l2) {
+        const std::vector<common_lora_adapter_info> & l1,
+        const std::vector<common_lora_adapter_info> & l2) {
     if (l1.size() != l2.size()) {
         return false;
     }

     for (size_t i = 0; i < l1.size(); ++i) {
         // we don't check lora.path to reduce the time complexity
-        if (l1[i].scale != l2[i].scale || l1[i].adapter != l2[i].adapter) {
+        if (l1[i].scale != l2[i].scale || l1[i].ptr != l2[i].ptr) {
             return false;
         }
     }

     return true;
 }

-// parse lora config from JSON request, returned a copy of base_lora with updated scale
-static std::vector<common_lora_adapter_container> parse_lora_request(
-        const std::vector<common_lora_adapter_container> & base_lora,
+// parse lora config from JSON request, returned a copy of lora_base with updated scale
+static std::vector<common_lora_adapter_info> parse_lora_request(
+        const std::vector<common_lora_adapter_info> & lora_base,
         const json & data) {
-    std::vector<common_lora_adapter_container> lora(base_lora);
+    std::vector<common_lora_adapter_info> lora(lora_base);
     int max_idx = lora.size();

     // clear existing value