remove cpp header map/string in llama.h
commit ec9e5c7974 (parent 55fbe831ef)
4 changed files with 65 additions and 96 deletions
@@ -2057,24 +2057,21 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(
         }
     }
 
-    std::map<std::string, llama_model*> derived_models;
     for (unsigned int i = 0; i < params.derived_model_paths.size(); ++i) {
         const auto & derived_model_path = params.derived_model_paths[i];
         const std::string & derived_model_name = std::get<0>(derived_model_path);
         const std::string & derived_model_file = std::get<1>(derived_model_path);
 
-        llama_model * derived_model_ptr = nullptr;
-        derived_model_ptr = llama_load_model_from_file(derived_model_file.c_str(), mparams);
+        llama_model * derived_model = llama_load_model_from_file(derived_model_file.c_str(), mparams);
 
-        if (derived_model_ptr == NULL) {
+        if (derived_model == NULL) {
             fprintf(stderr, "%s: error: failed to load derived model '%s'\n", __func__, derived_model_file.c_str());
         }
 
-        derived_models[derived_model_name] = derived_model_ptr;
+        llama_model_set_name(derived_model, derived_model_name.c_str());
+        llama_ctx_set_derived_model(lctx, derived_model);
     }
 
-    llama_set_derived_models(lctx, derived_models);
-
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
 
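A minimal sketch of the per-model registration flow shown above, assuming the llama_model_set_name / llama_ctx_set_derived_model functions introduced by this commit; the name and path are made up for the example, not taken from the patch.

#include <cstdio>

#include "llama.h"

// Illustrative only: register one task-specific model on an existing context using the
// per-model API added in this commit, instead of collecting models in a std::map.
static bool add_derived_model(struct llama_context * lctx,
                              struct llama_model_params mparams,
                              const char * name,      // e.g. "summarize" (hypothetical)
                              const char * path) {    // e.g. "models/abc-adaptor-task-x.gguf" (hypothetical)
    llama_model * derived = llama_load_model_from_file(path, mparams);
    if (derived == NULL) {
        fprintf(stderr, "failed to load derived model '%s'\n", path);
        return false;
    }
    llama_model_set_name(derived, name);        // tag the model so it can be selected by name later
    llama_ctx_set_derived_model(lctx, derived); // append it to the context's derived-model list
    return true;
}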
@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
         llama_print_derived_models(ctx);
     }
-    llama_switch_derived_model(ctx, "summarize");
+    llama_ctx_switch_derived_model(ctx, "summarize");
 
     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
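For context, a short sketch of how the renamed entry point is meant to be used at runtime, assuming the llama_ctx_switch_derived_model() declared later in this diff; the task names are illustrative.

#include "llama.h"

// Illustrative only: route decoding to a named derived model, then fall back to the base
// model. Unknown names resolve to the base model inside the library (see the
// llama_ctx_switch_derived_model implementation further down in this diff).
static void run_with_task_model(struct llama_context * ctx) {
    llama_ctx_switch_derived_model(ctx, "summarize");    // subsequent graphs use the "summarize" model
    // ... tokenize the prompt and call llama_decode() here ...
    llama_ctx_switch_derived_model(ctx, "no-such-task"); // unknown name: base model is selected again
}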
@@ -266,6 +266,7 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
+    params.prompt = "<|user|>\nhelp summarize the microsoft products.<|end|>\n<|assistant|>\n";
     {
         auto prompt = (params.conversation && params.enable_chat_template)
             ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
@@ -292,20 +293,6 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> guidance_inp;
     int guidance_offset = 0;
     int original_prompt_len = 0;
-    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-        LOG("guidance_offset: %s", log_tostr(guidance_offset));
-    }
 
     if ((int) embd_inp.size() > n_ctx - 4) {
         LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
 
@@ -373,15 +360,6 @@ int main(int argc, char ** argv) {
             LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
-        if (ctx_guidance) {
-            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-            }
-        }
-
         if (params.n_keep > add_bos) {
             LOG_TEE("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
 
@@ -458,15 +436,6 @@ int main(int argc, char ** argv) {
     const int ga_n = params.grp_attn_n;
     const int ga_w = params.grp_attn_w;
 
-    if (ga_n != 1) {
-        GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT
-        GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
-        //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
-        //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
-        LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
-    }
-    LOG_TEE("\n\n");
-
     if (params.interactive) {
         const char * control_message;
         if (params.multiline_input) {
 
@@ -8,8 +8,6 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdbool.h>
-#include <map>
-#include <string>
 
 #ifdef _WIN32
 #else
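Dropping <map> and <string> matters because llama.h is consumed inside an extern "C" block; after this commit the derived-model API uses only C types. A small sketch of calling that surface, assuming the declarations shown in the next hunk; the model name is illustrative.

#include "llama.h"

// Illustrative only: the derived-model entry points now take plain C types
// (struct pointers and const char *), so this code looks the same whether
// llama.h is included from a C or a C++ translation unit.
static void register_and_select(struct llama_context * ctx, struct llama_model * derived) {
    llama_model_set_name(derived, "adaptor-task-x");        // name is made up for the example
    llama_ctx_set_derived_model(ctx, derived);
    llama_print_derived_models(ctx);                        // logs the registered names
    llama_ctx_switch_derived_model(ctx, "adaptor-task-x");  // select it by that name
}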
@@ -432,17 +430,19 @@ extern "C" {
              struct llama_model * model,
      struct llama_context_params params);
 
-    LLAMA_API void llama_print_derived_models(struct llama_context* ctx);
-
-    LLAMA_API void llama_set_derived_models(
-            struct llama_context * ctx,
-            std::map<std::string, struct llama_model *> derived_models);
-
-    static const char* BASE_MODEL = "base";
-
-    LLAMA_API bool llama_switch_derived_model(
+    LLAMA_API void llama_print_derived_models(const struct llama_context* ctx);
+
+    LLAMA_API void llama_model_set_name(struct llama_model * model, const char* name);
+
+    LLAMA_API void llama_ctx_set_derived_model(
+            struct llama_context * ctx,
+            struct llama_model * derived_model);
+
+    LLAMA_API bool llama_ctx_switch_derived_model(
             struct llama_context* ctx,
-            std::string derived_model_name);
+            const char * derived_model_name);
 
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
@@ -2624,7 +2624,7 @@ struct llama_context {
     const llama_model & model;
 
     // derived models
-    std::map<std::string, llama_model *> derived_models;
+    std::vector<llama_model *> derived_models;
 
     // key + value cache for the self attention
     struct llama_kv_cache kv_self;
 
@@ -3549,13 +3549,15 @@ struct llama_model_loader {
 
         char split_prefix[PATH_MAX] = {0};
         char foundation_prefix[PATH_MAX] = { 0 };
-        // Two types of split files are supported:
-        // prefix is abc, postfix is 00001-of-00002, 00002-of-00002
-        // abc-00001-of-00002.gguf, abc-00002-of-00002.gguf
-        // prefix is abc, postfix is foundation, adaptor-task-x, adaptor-task-y
-        // abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf
-        if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)
-            && !llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str())) {
+        // Two split mode:
+        // - abc-00001-of-00002.gguf, abc-00002-of-00002.gguf, prefix is abc, postfix is 00001-of-00002, 00002-of-00002
+        // - abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf, prefix is abc, postfix is -foundation, -adaptor-task-x, -adaptor-task-y
+        bool foundation_mode = false;
+        if (llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str()) && n_split == 2) {
+            foundation_mode = true;
+        }
+
+        if (!foundation_mode && !llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
             throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
         }
 
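The comment block above describes two naming schemes for split files: numeric shards (abc-00001-of-00002.gguf) and a foundation/adaptor layout (abc-foundation.gguf plus abc-adaptor-task-*.gguf). A standalone sketch of telling the two apart by filename; this illustrates the convention only and is not the library's llama_foundation_prefix().

#include <cstdio>
#include <string>

// Illustrative only: classify a gguf filename according to the two split layouts
// described in the comments above.
static bool looks_like_foundation_split(const std::string & fname) {
    const std::string suffix = "-foundation.gguf";
    return fname.size() >= suffix.size() &&
           fname.compare(fname.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main() {
    printf("%d\n", looks_like_foundation_split("abc-foundation.gguf"));     // 1: foundation/adaptor layout
    printf("%d\n", looks_like_foundation_split("abc-00001-of-00002.gguf")); // 0: numeric shard layout
    return 0;
}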
@@ -3565,14 +3567,10 @@ struct llama_model_loader {
 
         char split_path[PATH_MAX] = {0};
         for (idx = 1; idx < n_split; idx++) {
-            llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
-
-            // if split path not exist
-            struct stat model_file_info;
-            std::string str_split_path(split_path);
-            auto file_exists = (stat(str_split_path.c_str(), &model_file_info) == 0);
-            if (!file_exists) {
+            if (foundation_mode) {
+                llama_foundation_split_path(split_path, sizeof(split_path), foundation_prefix);
+            } else {
+                llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+            }
 
             struct gguf_init_params split_params = {
 
@@ -12595,19 +12593,19 @@ static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
      const llama_batch & batch,
                    bool   worst_case) {
-    const auto & foundation_model = lctx.model;
-
+    const auto& foundation_model = lctx.model;
     const llama_model* model_ptr = nullptr;
-    const auto it = lctx.derived_models.find(lctx.cparams.derived_model_name);
-    if (it != lctx.derived_models.end()) {
-        const auto& model_derived = *(it->second);
-        model_ptr = &model_derived;
-    }
-    else {
-        model_ptr = &foundation_model;
-    }
-    const llama_model & model = *model_ptr;
+    const char* model_name = lctx.cparams.derived_model_name.c_str();
+
+    for (const auto& model : lctx.derived_models) {
+        if (model->name == model_name) {
+            model_ptr = model;
+            break;
+        }
+    }
+
+    model_ptr = model_ptr ? model_ptr : &foundation_model;
+    const llama_model& model = *model_ptr;
 
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
 
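The map lookup is replaced here by a linear scan over the vector, matching on the model's stored name and falling back to the foundation model when nothing matches. A minimal standalone sketch of that selection logic, using a simplified stand-in type for illustration (the real llama_model lives in llama.cpp):

#include <string>
#include <vector>

// Simplified stand-in for illustration only.
struct toy_model {
    std::string name;
};

// Mirrors the selection in llama_build_graph above: scan the vector for a model whose
// stored name matches, otherwise fall back to the foundation (base) model.
static const toy_model * select_model(const toy_model & foundation,
                                      const std::vector<const toy_model *> & derived,
                                      const std::string & wanted) {
    const toy_model * ptr = nullptr;
    for (const toy_model * m : derived) {
        if (m->name == wanted) {
            ptr = m;
            break;
        }
    }
    return ptr ? ptr : &foundation;
}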
@@ -17988,17 +17986,6 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-struct llama_context * llama_new_context_with_derived_models(
-        struct llama_model * model,
-        struct llama_context_params params,
-        const std::map<std::string, llama_model*> derived_models) {
-    llama_context * ctx = llama_new_context_with_model(model, params);
-    if (ctx) {
-        ctx->derived_models = derived_models;
-    }
-    return ctx;
-}
-
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
 
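With llama_new_context_with_derived_models() removed, derived models are attached after the context is created. A sketch of the replacement flow, assuming the API from this commit; the derived model is assumed to have been loaded and named elsewhere.

#include "llama.h"

// Illustrative only: create the context from the base model first, then attach an
// already-loaded derived model, instead of passing a std::map at construction time.
static struct llama_context * make_ctx(struct llama_model * base,
                                       struct llama_context_params cparams,
                                       struct llama_model * derived /* may be NULL */) {
    struct llama_context * ctx = llama_new_context_with_model(base, cparams);
    if (ctx != NULL && derived != NULL) {
        llama_ctx_set_derived_model(ctx, derived);  // appends to the context's derived-model list
    }
    return ctx;
}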
@@ -18007,21 +17994,39 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }
 
-void llama_print_derived_models(struct llama_context * ctx) {
-    for (const auto & it : ctx->derived_models) {
-        LLAMA_LOG_INFO("%s: %s\n", __func__, it.first.c_str());
+void llama_model_set_name(struct llama_model * model, const char * model_name) {
+    model->name = model_name;
+}
+
+void llama_print_derived_models(const struct llama_context * ctx) {
+    for (const auto & derived_model : ctx->derived_models) {
+        if (!derived_model->name.empty()) {
+            LLAMA_LOG_INFO("%s: %s\n", __func__, derived_model->name.c_str());
+        }
     }
 }
 
-void llama_set_derived_models(struct llama_context * ctx, const std::map<std::string, llama_model*> derived_models) {
-    ctx->derived_models = derived_models;
+void llama_ctx_set_derived_model(struct llama_context * ctx, struct llama_model * derived_model) {
+    ctx->derived_models.emplace_back(derived_model);
 }
 
-bool llama_switch_derived_model(struct llama_context* ctx, const std::string derived_model_name) {
+bool llama_ctx_switch_derived_model(struct llama_context* ctx, const char * derived_model_name) {
     llama_synchronize(ctx);
 
     auto& cparams = ctx->cparams;
-    cparams.derived_model_name = (ctx->derived_models.find(derived_model_name) == ctx->derived_models.end()) ? BASE_MODEL : derived_model_name;
+    int found = 0;
+
+    const llama_model* model_ptr = nullptr;
+
+    bool is_derived = false;
+    for (const auto& model : ctx->derived_models) {
+        if (model->name == derived_model_name) {
+            model_ptr = model;
+            is_derived = true;
+            break;
+        }
+    }
+
+    cparams.derived_model_name = is_derived ? derived_model_name : BASE_MODEL;
     LLAMA_LOG_INFO("%s: %s\n", __func__, cparams.derived_model_name.c_str());
 
     return true;
 
@@ -20190,8 +20195,6 @@ int llama_foundation_prefix(char* dest, size_t maxlen, const char* split_path) {
     if (pos != NULL) {
         size_t size_prefix = pos - split_path;
         snprintf(dest, std::min((size_t)size_prefix + 1, maxlen), "%s", split_path);
-        // strncpy(dest, split_path, size_prefix);
-        // dest[size_prefix] = '\0';
         return size_prefix;
     }
 
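The commented-out strncpy is dropped in favor of snprintf, which both bounds the copy and guarantees NUL termination in one call. A standalone sketch of the same prefix-copy idiom; using "-foundation" as the marker is an assumption for this example, not necessarily what llama_foundation_prefix() searches for.

#include <algorithm>
#include <cstdio>
#include <cstring>

int main() {
    // Illustrative only: mimic the prefix copy above.
    const char * split_path = "abc-foundation.gguf";
    const char * pos = strstr(split_path, "-foundation");  // assumed marker for the sketch
    if (pos != NULL) {
        char dest[64] = {0};
        size_t size_prefix = pos - split_path;              // 3, the length of "abc"
        // snprintf writes at most size_prefix characters plus the terminating NUL,
        // which is what the removed strncpy + manual '\0' pair did by hand.
        snprintf(dest, std::min(size_prefix + 1, sizeof(dest)), "%s", split_path);
        printf("prefix: '%s'\n", dest);                      // prefix: 'abc'
    }
    return 0;
}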