llama : add llama_vocab, functions -> methods, naming (#11110)

* llama : functions -> methods (#11110) * llama : add struct llama_vocab to the API (#11156) ggml-ci * hparams : move vocab params to llama_vocab (#11159) ggml-ci * vocab : more pimpl (#11165) ggml-ci * vocab : minor tokenization optimizations (#11160) ggml-ci Co-authored-by: Diego Devesa <slarengh@gmail.com> * lora : update API names (#11167) ggml-ci * llama : update API names to use correct prefix (#11174) * llama : update API names to use correct prefix ggml-ci * cont ggml-ci * cont ggml-ci * minor [no ci] * vocab : llama_vocab_add_[be]os -> llama_vocab_get_add_[be]os (#11174) ggml-ci * vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174) ggml-ci --------- Co-authored-by: Diego Devesa <slarengh@gmail.com>
2025-01-12 11:32:42 +02:00 · 2025-01-12 11:32:42 +02:00 · afa8a9ec9b
commit afa8a9ec9b
parent c05e8c9934
68 changed files with 5855 additions and 5400 deletions
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@ -1,5 +1,7 @@
 #include "llama-adapter.h"

+#include "llama-impl.h"
+#include "llama-mmap.h"
 #include "llama-model.h"

 #include <algorithm>
@ -9,7 +11,7 @@

 // vec

-struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
+struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }
@ -17,7 +19,7 @@ struct ggml_tensor * llama_control_vector::tensor_for(int il) const {
    return tensors[il];
 }

-struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const {
+struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
@ -26,12 +28,12 @@ struct ggml_tensor * llama_control_vector::apply_to(struct ggml_context * ctx, s
    return cur;
 }

-static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+bool llama_adapter_cvec::init(const llama_model & model) {
    const auto & hparams = model.hparams;

-    GGML_ASSERT(cvec.tensors.empty());
-    GGML_ASSERT(cvec.ctxs.empty());
-    GGML_ASSERT(cvec.bufs.empty());
+    GGML_ASSERT(tensors.empty());
+    GGML_ASSERT(ctxs.empty());
+    GGML_ASSERT(bufs.empty());

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@ -50,7 +52,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
            }

            ctx_map[buft] = ctx;
-            cvec.ctxs.emplace_back(ctx);
+            ctxs.emplace_back(ctx);

            return ctx;
        }
@ -59,21 +61,21 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
    };

    // make tensors
-    cvec.tensors.reserve(hparams.n_layer);
-    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    tensors.reserve(hparams.n_layer);
+    tensors.push_back(nullptr); // there's never a tensor for layer 0
    for (size_t il = 1; il < hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
            return false;
        }
        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hparams.n_embd);
-        cvec.tensors.push_back(tensor);
+        tensors.push_back(tensor);
    }

    // allocate tensors / buffers and zero
-    cvec.bufs.reserve(ctx_map.size());
+    bufs.reserve(ctx_map.size());
    for (auto it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context * ctx = it.second;
@ -83,14 +85,13 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
            return false;
        }
        ggml_backend_buffer_clear(buf, 0);
-        cvec.bufs.emplace_back(buf);
+        bufs.emplace_back(buf);
    }

    return true;
 }

-int32_t llama_control_vector_apply(
-        struct llama_control_vector & cvec,
+int32_t llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
@ -101,8 +102,8 @@ int32_t llama_control_vector_apply(

    if (data == nullptr) {
        // disable the current control vector (but leave allocated for later)
-        cvec.layer_start = -1;
-        cvec.layer_end   = -1;
+        layer_start = -1;
+        layer_end   = -1;
        return 0;
    }

@ -111,21 +112,21 @@ int32_t llama_control_vector_apply(
        return 1;
    }

-    if (cvec.tensors.empty()) {
-        if (!llama_control_vector_init(cvec, model)) {
+    if (tensors.empty()) {
+        if (!init(model)) {
            return 1;
        }
    }

-    cvec.layer_start = il_start;
-    cvec.layer_end   = il_end;
+    layer_start = il_start;
+    layer_end   = il_end;

    for (size_t il = 1; il < hparams.n_layer; il++) {
-        assert(cvec.tensors[il] != nullptr);
+        assert(tensors[il] != nullptr);

        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
        if (off + n_embd <= len) {
-            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+            ggml_backend_tensor_set(tensors[il], data + off, 0, n_embd * ggml_element_size(tensors[il]));
        }
    }

@ -134,7 +135,7 @@ int32_t llama_control_vector_apply(

 // lora

-llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
    const std::string name(w->name);

    const auto pos = ab_map.find(name);
@ -145,11 +146,7 @@ llama_lora_weight * llama_lora_adapter::get_weight(struct ggml_tensor * w) {
    return nullptr;
 }

-void llama_lora_adapter_free(struct llama_lora_adapter * adapter) {
-    delete adapter;
-}
-
-static void llama_lora_adapter_init_impl(struct llama_model & model, const char * path_lora, struct llama_lora_adapter & adapter) {
+static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);

    ggml_context * ctx_init;
@ -221,7 +218,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
    };

    // bundle lora_a and lora_b into pairs
-    std::map<std::string, llama_lora_weight> ab_map;
+    std::map<std::string, llama_adapter_lora_weight> ab_map;
    auto str_endswith = [](const std::string & str, const std::string & suffix) {
        return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
    };
@ -231,14 +228,14 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
        if (str_endswith(name, ".lora_a")) {
            replace_all(name, ".lora_a", "");
            if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(cur, nullptr);
+                ab_map[name] = llama_adapter_lora_weight(cur, nullptr);
            } else {
                ab_map[name].a = cur;
            }
        } else if (str_endswith(name, ".lora_b")) {
            replace_all(name, ".lora_b", "");
            if (ab_map.find(name) == ab_map.end()) {
-                ab_map[name] = llama_lora_weight(nullptr, cur);
+                ab_map[name] = llama_adapter_lora_weight(nullptr, cur);
            } else {
                ab_map[name].b = cur;
            }
@ -254,7 +251,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
    // add tensors
    for (auto & it : ab_map) {
        const std::string & name = it.first;
-        llama_lora_weight & w = it.second;
+        llama_adapter_lora_weight & w = it.second;
        bool is_token_embd = str_endswith(name, "token_embd.weight");

        if (!w.a || !w.b) {
@ -262,7 +259,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
        }

        // device buft and device ctx
-        auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+        const auto * model_tensor = model.get_tensor(name.c_str());
        if (!model_tensor) {
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }
@ -288,7 +285,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
-        adapter.ab_map[name] = llama_lora_weight(tensor_a, tensor_b);
+        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
    }

    // allocate tensors / buffers and zero
@ -330,11 +327,11 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }

-struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model, const char * path_lora) {
-    struct llama_lora_adapter * adapter = new llama_lora_adapter();
+struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+    struct llama_adapter_lora * adapter = new llama_adapter_lora();

    try {
-        llama_lora_adapter_init_impl(*model, path_lora, *adapter);
+        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
        return adapter;
    } catch (const std::exception & err) {
        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
@ -344,3 +341,7 @@ struct llama_lora_adapter * llama_lora_adapter_init(struct llama_model * model,

    return nullptr;
 }
+
+void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+    delete adapter;
+}