remove cpp header map/string in llama.h
commit ec9e5c7974 (parent 55fbe831ef)
4 changed files with 65 additions and 96 deletions
@@ -2057,24 +2057,21 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(
         }
     }
 
-    std::map<std::string, llama_model*> derived_models;
     for (unsigned int i = 0; i < params.derived_model_paths.size(); ++i) {
         const auto & derived_model_path = params.derived_model_paths[i];
         const std::string & derived_model_name = std::get<0>(derived_model_path);
         const std::string & derived_model_file = std::get<1>(derived_model_path);
 
-        llama_model * derived_model_ptr = nullptr;
-        derived_model_ptr = llama_load_model_from_file(derived_model_file.c_str(), mparams);
+        llama_model * derived_model = llama_load_model_from_file(derived_model_file.c_str(), mparams);
 
-        if (derived_model_ptr == NULL) {
+        if (derived_model == NULL) {
             fprintf(stderr, "%s: error: failed to load derived model '%s'\n", __func__, derived_model_file.c_str());
         }
 
-        derived_models[derived_model_name] = derived_model_ptr;
+        llama_model_set_name(derived_model, derived_model_name.c_str());
+        llama_ctx_set_derived_model(lctx, derived_model);
     }
 
-    llama_set_derived_models(lctx, derived_models);
-
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);
 
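A minimal sketch of the per-model registration flow shown above, assuming the llama_model_set_name / llama_ctx_set_derived_model functions introduced by this commit; the name and path are made up for the example, not taken from the patch.

#include <cstdio>

#include "llama.h"

// Illustrative only: register one task-specific model on an existing context using the
// per-model API added in this commit, instead of collecting models in a std::map.
static bool add_derived_model(struct llama_context * lctx,
                              struct llama_model_params mparams,
                              const char * name,      // e.g. "summarize" (hypothetical)
                              const char * path) {    // e.g. "models/abc-adaptor-task-x.gguf" (hypothetical)
    llama_model * derived = llama_load_model_from_file(path, mparams);
    if (derived == NULL) {
        fprintf(stderr, "failed to load derived model '%s'\n", path);
        return false;
    }
    llama_model_set_name(derived, name);        // tag the model so it can be selected by name later
    llama_ctx_set_derived_model(lctx, derived); // append it to the context's derived-model list
    return true;
}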
@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
         llama_print_derived_models(ctx);
     }
-    llama_switch_derived_model(ctx, "summarize");
+    llama_ctx_switch_derived_model(ctx, "summarize");
 
     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
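For context, a short sketch of how the renamed entry point is meant to be used at runtime, assuming the llama_ctx_switch_derived_model() declared later in this diff; the task names are illustrative.

#include "llama.h"

// Illustrative only: route decoding to a named derived model, then fall back to the base
// model. Unknown names resolve to the base model inside the library (see the
// llama_ctx_switch_derived_model implementation further down in this diff).
static void run_with_task_model(struct llama_context * ctx) {
    llama_ctx_switch_derived_model(ctx, "summarize");    // subsequent graphs use the "summarize" model
    // ... tokenize the prompt and call llama_decode() here ...
    llama_ctx_switch_derived_model(ctx, "no-such-task"); // unknown name: base model is selected again
}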
@@ -266,6 +266,7 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd_inp;
 
+    params.prompt = "<|user|>\nhelp summarize the microsoft products.<|end|>\n<|assistant|>\n";
     {
         auto prompt = (params.conversation && params.enable_chat_template)
             ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
@@ -292,20 +293,6 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> guidance_inp;
     int guidance_offset = 0;
     int original_prompt_len = 0;
-    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-        LOG("guidance_offset: %s", log_tostr(guidance_offset));
-    }
 
     if ((int) embd_inp.size() > n_ctx - 4) {
         LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
 
@@ -373,15 +360,6 @@ int main(int argc, char ** argv) {
             LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
 
-        if (ctx_guidance) {
-            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-            }
-        }
-
         if (params.n_keep > add_bos) {
             LOG_TEE("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
 
@@ -458,15 +436,6 @@ int main(int argc, char ** argv) {
     const int ga_n = params.grp_attn_n;
     const int ga_w = params.grp_attn_w;
 
-    if (ga_n != 1) {
-        GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT
-        GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
-        //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
-        //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
-        LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
-    }
-    LOG_TEE("\n\n");
-
     if (params.interactive) {
         const char * control_message;
         if (params.multiline_input) {
 
@@ -8,8 +8,6 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdbool.h>
-#include <map>
-#include <string>
 
 #ifdef _WIN32
 #else
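Dropping <map> and <string> matters because llama.h is consumed inside an extern "C" block; after this commit the derived-model API uses only C types. A small sketch of calling that surface, assuming the declarations shown in the next hunk; the model name is illustrative.

#include "llama.h"

// Illustrative only: the derived-model entry points now take plain C types
// (struct pointers and const char *), so this code looks the same whether
// llama.h is included from a C or a C++ translation unit.
static void register_and_select(struct llama_context * ctx, struct llama_model * derived) {
    llama_model_set_name(derived, "adaptor-task-x");        // name is made up for the example
    llama_ctx_set_derived_model(ctx, derived);
    llama_print_derived_models(ctx);                        // logs the registered names
    llama_ctx_switch_derived_model(ctx, "adaptor-task-x");  // select it by that name
}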
@@ -432,17 +430,19 @@ extern "C" {
              struct llama_model * model,
      struct llama_context_params params);
 
-    LLAMA_API void llama_print_derived_models(struct llama_context* ctx);
-
-    LLAMA_API void llama_set_derived_models(
-            struct llama_context * ctx,
-            std::map<std::string, struct llama_model *> derived_models);
-
-    static const char* BASE_MODEL = "base";
-
-    LLAMA_API bool llama_switch_derived_model(
+    LLAMA_API void llama_print_derived_models(const struct llama_context* ctx);
+
+    LLAMA_API void llama_model_set_name(struct llama_model * model, const char* name);
+
+    LLAMA_API void llama_ctx_set_derived_model(
+            struct llama_context * ctx,
+            struct llama_model * derived_model);
+
+    LLAMA_API bool llama_ctx_switch_derived_model(
             struct llama_context* ctx,
-            std::string derived_model_name);
+            const char * derived_model_name);
 
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
@@ -2624,7 +2624,7 @@ struct llama_context {
     const llama_model & model;
 
     // derived models
-    std::map<std::string, llama_model *> derived_models;
+    std::vector<llama_model *> derived_models;
 
     // key + value cache for the self attention
     struct llama_kv_cache kv_self;
 
@@ -3549,13 +3549,15 @@ struct llama_model_loader {
 
         char split_prefix[PATH_MAX] = {0};
         char foundation_prefix[PATH_MAX] = { 0 };
-        // Two types of split files are supported:
-        // prefix is abc, postfix is 00001-of-00002, 00002-of-00002
-        // abc-00001-of-00002.gguf, abc-00002-of-00002.gguf
-        // prefix is abc, postfix is foundation, adaptor-task-x, adaptor-task-y
-        // abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf
-        if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)
-            && !llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str())) {
+        // Two split mode:
+        // - abc-00001-of-00002.gguf, abc-00002-of-00002.gguf, prefix is abc, postfix is 00001-of-00002, 00002-of-00002
+        // - abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf, prefix is abc, postfix is -foundation, -adaptor-task-x, -adaptor-task-y
+        bool foundation_mode = false;
+        if (llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str()) && n_split == 2) {
+            foundation_mode = true;
+        }
+
+        if (!foundation_mode && !llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
             throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
         }
 
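The comment block above describes two naming schemes for split files: numeric shards (abc-00001-of-00002.gguf) and a foundation/adaptor layout (abc-foundation.gguf plus abc-adaptor-task-*.gguf). A standalone sketch of telling the two apart by filename; this illustrates the convention only and is not the library's llama_foundation_prefix().

#include <cstdio>
#include <string>

// Illustrative only: classify a gguf filename according to the two split layouts
// described in the comments above.
static bool looks_like_foundation_split(const std::string & fname) {
    const std::string suffix = "-foundation.gguf";
    return fname.size() >= suffix.size() &&
           fname.compare(fname.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main() {
    printf("%d\n", looks_like_foundation_split("abc-foundation.gguf"));     // 1: foundation/adaptor layout
    printf("%d\n", looks_like_foundation_split("abc-00001-of-00002.gguf")); // 0: numeric shard layout
    return 0;
}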
@@ -3565,14 +3567,10 @@ struct llama_model_loader {
 
         char split_path[PATH_MAX] = {0};
         for (idx = 1; idx < n_split; idx++) {
-            llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
-
-            // if split path not exist
-            struct stat model_file_info;
-            std::string str_split_path(split_path);
-            auto file_exists = (stat(str_split_path.c_str(), &model_file_info) == 0);
-            if (!file_exists) {
+            if (foundation_mode) {
+                llama_foundation_split_path(split_path, sizeof(split_path), foundation_prefix);
+            } else {
+                llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
+            }
 
             struct gguf_init_params split_params = {
 
@@ -12595,19 +12593,19 @@ static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
      const llama_batch & batch,
                    bool   worst_case) {
-    const auto & foundation_model = lctx.model;
-
+    const auto& foundation_model = lctx.model;
     const llama_model* model_ptr = nullptr;
-    const auto it = lctx.derived_models.find(lctx.cparams.derived_model_name);
-    if (it != lctx.derived_models.end()) {
-        const auto& model_derived = *(it->second);
-        model_ptr = &model_derived;
-    }
-    else {
-        model_ptr = &foundation_model;
-    }
-    const llama_model & model = *model_ptr;
+    const char* model_name = lctx.cparams.derived_model_name.c_str();
+
+    for (const auto& model : lctx.derived_models) {
+        if (model->name == model_name) {
+            model_ptr = model;
+            break;
+        }
+    }
+
+    model_ptr = model_ptr ? model_ptr : &foundation_model;
+    const llama_model& model = *model_ptr;
 
     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
 
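The map lookup is replaced here by a linear scan over the vector, matching on the model's stored name and falling back to the foundation model when nothing matches. A minimal standalone sketch of that selection logic, using a simplified stand-in type for illustration (the real llama_model lives in llama.cpp):

#include <string>
#include <vector>

// Simplified stand-in for illustration only.
struct toy_model {
    std::string name;
};

// Mirrors the selection in llama_build_graph above: scan the vector for a model whose
// stored name matches, otherwise fall back to the foundation (base) model.
static const toy_model * select_model(const toy_model & foundation,
                                      const std::vector<const toy_model *> & derived,
                                      const std::string & wanted) {
    const toy_model * ptr = nullptr;
    for (const toy_model * m : derived) {
        if (m->name == wanted) {
            ptr = m;
            break;
        }
    }
    return ptr ? ptr : &foundation;
}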
@@ -17988,17 +17986,6 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }
 
-struct llama_context * llama_new_context_with_derived_models(
-        struct llama_model * model,
-        struct llama_context_params params,
-        const std::map<std::string, llama_model*> derived_models) {
-    llama_context * ctx = llama_new_context_with_model(model, params);
-    if (ctx) {
-        ctx->derived_models = derived_models;
-    }
-    return ctx;
-}
-
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
 
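With llama_new_context_with_derived_models() removed, derived models are attached after the context is created. A sketch of the replacement flow, assuming the API from this commit; the derived model is assumed to have been loaded and named elsewhere.

#include "llama.h"

// Illustrative only: create the context from the base model first, then attach an
// already-loaded derived model, instead of passing a std::map at construction time.
static struct llama_context * make_ctx(struct llama_model * base,
                                       struct llama_context_params cparams,
                                       struct llama_model * derived /* may be NULL */) {
    struct llama_context * ctx = llama_new_context_with_model(base, cparams);
    if (ctx != NULL && derived != NULL) {
        llama_ctx_set_derived_model(ctx, derived);  // appends to the context's derived-model list
    }
    return ctx;
}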
@@ -18007,21 +17994,39 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }
 
-void llama_print_derived_models(struct llama_context * ctx) {
-    for (const auto & it : ctx->derived_models) {
-        LLAMA_LOG_INFO("%s: %s\n", __func__, it.first.c_str());
+void llama_model_set_name(struct llama_model * model, const char * model_name) {
+    model->name = model_name;
+}
+
+void llama_print_derived_models(const struct llama_context * ctx) {
+    for (const auto & derived_model : ctx->derived_models) {
+        if (!derived_model->name.empty()) {
+            LLAMA_LOG_INFO("%s: %s\n", __func__, derived_model->name.c_str());
+        }
     }
 }
 
-void llama_set_derived_models(struct llama_context * ctx, const std::map<std::string, llama_model*> derived_models) {
-    ctx->derived_models = derived_models;
+void llama_ctx_set_derived_model(struct llama_context * ctx, struct llama_model * derived_model) {
+    ctx->derived_models.emplace_back(derived_model);
 }
 
-bool llama_switch_derived_model(struct llama_context* ctx, const std::string derived_model_name) {
+bool llama_ctx_switch_derived_model(struct llama_context* ctx, const char * derived_model_name) {
     llama_synchronize(ctx);
 
     auto& cparams = ctx->cparams;
-    cparams.derived_model_name = (ctx->derived_models.find(derived_model_name) == ctx->derived_models.end()) ? BASE_MODEL : derived_model_name;
+    int found = 0;
+
+    const llama_model* model_ptr = nullptr;
+
+    bool is_derived = false;
+    for (const auto& model : ctx->derived_models) {
+        if (model->name == derived_model_name) {
+            model_ptr = model;
+            is_derived = true;
+            break;
+        }
+    }
+
+    cparams.derived_model_name = is_derived ? derived_model_name : BASE_MODEL;
     LLAMA_LOG_INFO("%s: %s\n", __func__, cparams.derived_model_name.c_str());
 
     return true;
 
@@ -20190,8 +20195,6 @@ int llama_foundation_prefix(char* dest, size_t maxlen, const char* split_path) {
     if (pos != NULL) {
         size_t size_prefix = pos - split_path;
         snprintf(dest, std::min((size_t)size_prefix + 1, maxlen), "%s", split_path);
-        // strncpy(dest, split_path, size_prefix);
-        // dest[size_prefix] = '\0';
         return size_prefix;
     }
 
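The commented-out strncpy is dropped in favor of snprintf, which both bounds the copy and guarantees NUL termination in one call. A standalone sketch of the same prefix-copy idiom; using "-foundation" as the marker is an assumption for this example, not necessarily what llama_foundation_prefix() searches for.

#include <algorithm>
#include <cstdio>
#include <cstring>

int main() {
    // Illustrative only: mimic the prefix copy above.
    const char * split_path = "abc-foundation.gguf";
    const char * pos = strstr(split_path, "-foundation");  // assumed marker for the sketch
    if (pos != NULL) {
        char dest[64] = {0};
        size_t size_prefix = pos - split_path;              // 3, the length of "abc"
        // snprintf writes at most size_prefix characters plus the terminating NUL,
        // which is what the removed strncpy + manual '\0' pair did by hand.
        snprintf(dest, std::min(size_prefix + 1, sizeof(dest)), "%s", split_path);
        printf("prefix: '%s'\n", dest);                      // prefix: 'abc'
    }
    return 0;
}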