diff --git a/common/common.cpp b/common/common.cpp
index e9554e4d8..be0656c2a 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -2057,24 +2057,22 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(
         }
     }
 
-    std::map<std::string, llama_model *> derived_models;
     for (unsigned int i = 0; i < params.derived_model_paths.size(); ++i) {
         const auto & derived_model_path = params.derived_model_paths[i];
         const std::string & derived_model_name = std::get<0>(derived_model_path);
         const std::string & derived_model_file = std::get<1>(derived_model_path);
-        llama_model * derived_model_ptr = nullptr;
-        derived_model_ptr = llama_load_model_from_file(derived_model_file.c_str(), mparams);
+        llama_model * derived_model = llama_load_model_from_file(derived_model_file.c_str(), mparams);
 
-        if (derived_model_ptr == NULL) {
+        if (derived_model == NULL) {
             fprintf(stderr, "%s: error: failed to load derived model '%s'\n", __func__, derived_model_file.c_str());
+            continue;
         }
-        derived_models[derived_model_name] = derived_model_ptr;
+        llama_model_set_name(derived_model, derived_model_name.c_str());
+        llama_ctx_set_derived_model(lctx, derived_model);
     }
-    llama_set_derived_models(lctx, derived_models);
-
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
diff --git a/examples/multi-adaptation/multi-adaptation.cpp b/examples/multi-adaptation/multi-adaptation.cpp
index da721b7ca..0dd31a602 100644
--- a/examples/multi-adaptation/multi-adaptation.cpp
+++ b/examples/multi-adaptation/multi-adaptation.cpp
@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
         llama_print_derived_models(ctx);
     }
-    llama_switch_derived_model(ctx, "summarize");
+    llama_ctx_switch_derived_model(ctx, "summarize");
 
     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
@@ -266,6 +266,7 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
+    params.prompt = "<|user|>\nhelp summarize the microsoft products.<|end|>\n<|assistant|>\n";
     {
         auto prompt = (params.conversation && params.enable_chat_template) ?
             chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
@@ -292,20 +293,6 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> guidance_inp;
     int guidance_offset = 0;
     int original_prompt_len = 0;
-    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-        LOG("guidance_offset: %s", log_tostr(guidance_offset));
-    }
 
     if ((int) embd_inp.size() > n_ctx - 4) {
         LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
@@ -373,15 +360,6 @@ int main(int argc, char ** argv) {
             LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
-        if (ctx_guidance) {
-            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-            }
-        }
-
         if (params.n_keep > add_bos) {
             LOG_TEE("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
@@ -458,15 +436,6 @@ int main(int argc, char ** argv) {
     const int ga_n = params.grp_attn_n;
     const int ga_w = params.grp_attn_w;
 
-    if (ga_n != 1) {
-        GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive");  // NOLINT
-        GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n");  // NOLINT
-        //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w");  // NOLINT
-        //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
-        LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
-    }
-    LOG_TEE("\n\n");
-
     if (params.interactive) {
         const char * control_message;
         if (params.multiline_input) {
diff --git a/include/llama.h b/include/llama.h
index f45e6c052..0178b1a8e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -8,8 +8,6 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdbool.h>
-#include <map>
-#include <string>
 
 #ifdef _WIN32
 #else
@@ -432,17 +430,19 @@
                      struct llama_model * model,
             struct llama_context_params   params);
 
-    LLAMA_API void llama_print_derived_models(struct llama_context* ctx);
-
-    LLAMA_API void llama_set_derived_models(
-            struct llama_context * ctx,
-            std::map<std::string, llama_model *> derived_models);
-
     static const char* BASE_MODEL = "base";
-    LLAMA_API bool llama_switch_derived_model(
+    LLAMA_API void llama_print_derived_models(const struct llama_context* ctx);
+
+    LLAMA_API void llama_model_set_name(struct llama_model * model, const char* name);
+
+    LLAMA_API void llama_ctx_set_derived_model(
+            struct llama_context * ctx,
+            struct llama_model * derived_model);
+
+    LLAMA_API bool llama_ctx_switch_derived_model(
             struct llama_context* ctx,
-            std::string derived_model_name);
+            const char * derived_model_name);
 
     // Frees all allocated memory
     LLAMA_API void
     llama_free(struct llama_context * ctx);
diff --git a/src/llama.cpp b/src/llama.cpp
index 1dd263db9..853da0321 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -2624,7 +2624,7 @@ struct llama_context {
     const llama_model & model;
 
     // derived models
-    std::map<std::string, llama_model *> derived_models;
+    std::vector<llama_model *> derived_models;
 
     // key + value cache for the self attention
     struct llama_kv_cache kv_self;
@@ -3549,13 +3549,15 @@ struct llama_model_loader {
 
             char split_prefix[PATH_MAX] = {0};
             char foundation_prefix[PATH_MAX] = { 0 };
-            // Two types of split files are supported:
-            //   prefix is abc, postfix is 00001-of-00002, 00002-of-00002
-            //   abc-00001-of-00002.gguf, abc-00002-of-00002.gguf
-            //   prefix is abc, postfix is foundation, adaptor-task-x, adaptor-task-y
-            //   abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)
-                && !llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str())) {
+            // Two split modes are supported:
+            //   - abc-00001-of-00002.gguf, abc-00002-of-00002.gguf: prefix is abc, postfix is 00001-of-00002, 00002-of-00002
+            //   - abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf: prefix is abc, postfix is -foundation, -adaptor-task-x, -adaptor-task-y
+            bool foundation_mode = false;
+            if (llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str()) && n_split == 2) {
+                foundation_mode = true;
+            }
+
+            if (!foundation_mode && !llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
                 throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
             }
 
@@ -3565,14 +3567,10 @@ struct llama_model_loader {
 
             char split_path[PATH_MAX] = {0};
             for (idx = 1; idx < n_split; idx++) {
-                llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
-
-                // if split path not exist
-                struct stat model_file_info;
-                std::string str_split_path(split_path);
-                auto file_exists = (stat(str_split_path.c_str(), &model_file_info) == 0);
-                if (!file_exists) {
+                if (foundation_mode) {
                     llama_foundation_split_path(split_path, sizeof(split_path), foundation_prefix);
+                } else {
+                    llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
                 }
 
                 struct gguf_init_params split_params = {
@@ -12595,19 +12593,19 @@ static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
     const llama_batch & batch,
                   bool   worst_case) {
 
-    const auto & foundation_model = lctx.model;
-
+    const auto& foundation_model = lctx.model;
     const llama_model* model_ptr = nullptr;
-    const auto it = lctx.derived_models.find(lctx.cparams.derived_model_name);
-    if (it != lctx.derived_models.end()) {
-        const auto& model_derived = *(it->second);
-        model_ptr = &model_derived;
-    }
-    else {
-        model_ptr = &foundation_model;
+    const char* model_name = lctx.cparams.derived_model_name.c_str();
+
+    for (const auto& model : lctx.derived_models) {
+        if (model->name == model_name) {
+            model_ptr = model;
+            break;
+        }
     }
-    const llama_model & model = *model_ptr;
+    model_ptr = model_ptr ? model_ptr : &foundation_model;
+    const llama_model& model = *model_ptr;
 
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
@@ -17988,17 +17986,6 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-struct llama_context * llama_new_context_with_derived_models(
-            struct llama_model * model,
-            struct llama_context_params params,
-            const std::map<std::string, llama_model *> derived_models) {
-    llama_context * ctx = llama_new_context_with_model(model, params);
-    if (ctx) {
-        ctx->derived_models = derived_models;
-    }
-    return ctx;
-}
-
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
@@ -18007,21 +17994,36 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }
 
-void llama_print_derived_models(struct llama_context * ctx) {
-    for (const auto & it : ctx->derived_models) {
-        LLAMA_LOG_INFO("%s: %s\n", __func__, it.first.c_str());
+void llama_model_set_name(struct llama_model * model, const char * model_name) {
+    model->name = model_name;
+}
+
+void llama_print_derived_models(const struct llama_context * ctx) {
+    for (const auto & derived_model : ctx->derived_models) {
+        if (!derived_model->name.empty()) {
+            LLAMA_LOG_INFO("%s: %s\n", __func__, derived_model->name.c_str());
+        }
     }
 }
 
-void llama_set_derived_models(struct llama_context * ctx, const std::map<std::string, llama_model *> derived_models) {
-    ctx->derived_models = derived_models;
+void llama_ctx_set_derived_model(struct llama_context * ctx, struct llama_model * derived_model) {
+    ctx->derived_models.emplace_back(derived_model);
 }
 
-bool llama_switch_derived_model(struct llama_context* ctx, const std::string derived_model_name) {
+bool llama_ctx_switch_derived_model(struct llama_context* ctx, const char * derived_model_name) {
     llama_synchronize(ctx);
-    auto& cparams = ctx->cparams;
-    cparams.derived_model_name = (ctx->derived_models.find(derived_model_name) == ctx->derived_models.end()) ? BASE_MODEL : derived_model_name;
+    auto & cparams = ctx->cparams;
+
+    bool is_derived = false;
+    for (const auto & model : ctx->derived_models) {
+        if (model->name == derived_model_name) {
+            is_derived = true;
+            break;
+        }
+    }
+
+    cparams.derived_model_name = is_derived ? derived_model_name : BASE_MODEL;
     LLAMA_LOG_INFO("%s: %s\n", __func__, cparams.derived_model_name.c_str());
 
     return true;
@@ -20190,8 +20195,6 @@ int llama_foundation_prefix(char* dest, size_t maxlen, const char* split_path) {
     if (pos != NULL) {
         size_t size_prefix = pos - split_path;
         snprintf(dest, std::min((size_t)size_prefix + 1, maxlen), "%s", split_path);
-        // strncpy(dest, split_path, size_prefix);
-        // dest[size_prefix] = '\0';
         return size_prefix;
     }
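
For reference, below is a minimal usage sketch of the derived-model API introduced by this patch (llama_model_set_name, llama_ctx_set_derived_model, llama_ctx_switch_derived_model, llama_print_derived_models). The GGUF file names and the "summarize" task label are placeholders chosen to mirror the -foundation / -adaptor-task-x naming in the loader comment, and the rest of the llama.cpp API (llama_load_model_from_file, llama_new_context_with_model, etc.) is assumed unchanged by this patch; this is not part of the diff itself.

// Hypothetical driver for the derived-model API added by this patch.
// GGUF file names and the "summarize" task label are illustrative only.
#include "llama.h"

#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * base = llama_load_model_from_file("phi-3-foundation.gguf", mparams);
    if (base == NULL) {
        fprintf(stderr, "failed to load the foundation model\n");
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(base, cparams);

    // Load a task-specific derived model, give it a lookup name, and attach it to the context.
    llama_model * summarizer = llama_load_model_from_file("phi-3-adaptor-summarize.gguf", mparams);
    if (summarizer != NULL) {
        llama_model_set_name(summarizer, "summarize");
        llama_ctx_set_derived_model(ctx, summarizer);
    }

    llama_print_derived_models(ctx);                  // logs the registered derived-model names

    llama_ctx_switch_derived_model(ctx, "summarize"); // subsequent graph builds use the derived weights
    // ... tokenize and call llama_decode() here as usual ...

    llama_ctx_switch_derived_model(ctx, "base");      // unregistered names fall back to BASE_MODEL

    llama_free(ctx);
    llama_free_model(summarizer);
    llama_free_model(base);
    llama_backend_free();
    return 0;
}

Because llama_ctx_switch_derived_model() resolves unknown names to BASE_MODEL, passing "base" (or any unregistered name) restores the foundation weights for the next call to llama_build_graph().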