remove cpp header map/string in llama.h

zhhan 2024-07-09 16:57:07 -07:00
parent 55fbe831ef
commit ec9e5c7974
4 changed files with 65 additions and 96 deletions

@@ -2057,24 +2057,21 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         }
     }
-    std::map<std::string, llama_model*> derived_models;
     for (unsigned int i = 0; i < params.derived_model_paths.size(); ++i) {
         const auto & derived_model_path = params.derived_model_paths[i];
         const std::string & derived_model_name = std::get<0>(derived_model_path);
         const std::string & derived_model_file = std::get<1>(derived_model_path);
-        llama_model * derived_model_ptr = nullptr;
-        derived_model_ptr = llama_load_model_from_file(derived_model_file.c_str(), mparams);
-        if (derived_model_ptr == NULL) {
+        llama_model * derived_model = llama_load_model_from_file(derived_model_file.c_str(), mparams);
+        if (derived_model == NULL) {
             fprintf(stderr, "%s: error: failed to load derived model '%s'\n", __func__, derived_model_file.c_str());
         }
-        derived_models[derived_model_name] = derived_model_ptr;
+        llama_model_set_name(derived_model, derived_model_name.c_str());
+        llama_ctx_set_derived_model(lctx, derived_model);
     }
-    llama_set_derived_models(lctx, derived_models);
     for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
         const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
         float lora_scale = std::get<1>(params.lora_adapter[i]);

@@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
         LOG_TEE("\n");
         llama_print_derived_models(ctx);
     }
-    llama_switch_derived_model(ctx, "summarize");
+    llama_ctx_switch_derived_model(ctx, "summarize");

     std::string path_session = params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
@@ -266,6 +266,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd_inp;
+    params.prompt = "<|user|>\nhelp summarize the microsoft products.<|end|>\n<|assistant|>\n";

     {
         auto prompt = (params.conversation && params.enable_chat_template)
             ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
@@ -292,20 +293,6 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> guidance_inp;
     int guidance_offset = 0;
     int original_prompt_len = 0;
-    if (ctx_guidance) {
-        LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-        guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true);
-        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-        std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true, true);
-        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-        original_prompt_len = original_inp.size();
-        guidance_offset = (int)guidance_inp.size() - original_prompt_len;
-        LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
-        LOG("guidance_offset: %s", log_tostr(guidance_offset));
-    }

     if ((int) embd_inp.size() > n_ctx - 4) {
         LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
@@ -373,15 +360,6 @@ int main(int argc, char ** argv) {
             LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
         }
-        if (ctx_guidance) {
-            LOG_TEE("\n");
-            LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
-            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
-            for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
-            }
-        }

         if (params.n_keep > add_bos) {
             LOG_TEE("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
@@ -458,15 +436,6 @@ int main(int argc, char ** argv) {
     const int ga_n = params.grp_attn_n;
     const int ga_w = params.grp_attn_w;
-    if (ga_n != 1) {
-        GGML_ASSERT(ga_n > 0 && "grp_attn_n must be positive"); // NOLINT
-        GGML_ASSERT(ga_w % ga_n == 0 && "grp_attn_w must be a multiple of grp_attn_n"); // NOLINT
-        //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of grp_attn_w"); // NOLINT
-        //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * grp_attn_n"); // NOLINT
-        LOG_TEE("self-extend: n_ctx_train = %d, grp_attn_n = %d, grp_attn_w = %d\n", n_ctx_train, ga_n, ga_w);
-    }
-    LOG_TEE("\n\n");

     if (params.interactive) {
         const char * control_message;
         if (params.multiline_input) {

@@ -8,8 +8,6 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdbool.h>
-#include <map>
-#include <string>

 #ifdef _WIN32
 #else
@@ -432,17 +430,19 @@
             struct llama_model * model,
             struct llama_context_params params);

-    LLAMA_API void llama_print_derived_models(struct llama_context* ctx);
-    LLAMA_API void llama_set_derived_models(
-            struct llama_context * ctx,
-            std::map<std::string, struct llama_model *> derived_models);
     static const char* BASE_MODEL = "base";
-    LLAMA_API bool llama_switch_derived_model(
+    LLAMA_API void llama_print_derived_models(const struct llama_context* ctx);
+    LLAMA_API void llama_model_set_name(struct llama_model * model, const char* name);
+    LLAMA_API void llama_ctx_set_derived_model(
+            struct llama_context * ctx,
+            struct llama_model * derived_model);
+    LLAMA_API bool llama_ctx_switch_derived_model(
             struct llama_context* ctx,
-            std::string derived_model_name);
+            const char * derived_model_name);

     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
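
For context, here is a minimal usage sketch of the reworked C-style derived-model API declared above, as application code might call it once this commit is applied. The GGUF file paths, the "summarize" model name, and the teardown order (including whether the context takes ownership of derived models) are illustrative assumptions, not part of the commit; error handling is trimmed.

// sketch only: placeholder model files, no error handling beyond NULL checks
#include "llama.h"

int load_with_derived_models(void) {
    struct llama_model_params   mparams = llama_model_default_params();
    struct llama_context_params cparams = llama_context_default_params();

    struct llama_model * base    = llama_load_model_from_file("base.gguf", mparams);       // placeholder path
    struct llama_model * derived = llama_load_model_from_file("summarize.gguf", mparams);  // placeholder path
    if (base == NULL || derived == NULL) {
        return 1;
    }

    struct llama_context * ctx = llama_new_context_with_model(base, cparams);

    // register the derived model by name instead of passing a std::map across the API
    llama_model_set_name(derived, "summarize");
    llama_ctx_set_derived_model(ctx, derived);
    llama_print_derived_models(ctx);

    // route graph building to the named derived model
    llama_ctx_switch_derived_model(ctx, "summarize");

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(derived);  // assumption: the context does not free derived models
    llama_free_model(base);
    return 0;
}

Per the llama.cpp hunk further below, a name that was never registered makes llama_ctx_switch_derived_model fall back to BASE_MODEL rather than fail.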

@@ -2624,7 +2624,7 @@ struct llama_context {
     const llama_model & model;

     // derived models
-    std::map<std::string, llama_model *> derived_models;
+    std::vector<llama_model *> derived_models;

     // key + value cache for the self attention
     struct llama_kv_cache kv_self;
@@ -3549,13 +3549,15 @@ struct llama_model_loader {
             char split_prefix[PATH_MAX] = {0};
             char foundation_prefix[PATH_MAX] = { 0 };

-            // Two types of split files are supported:
-            // prefix is abc, postfix is 00001-of-00002, 00002-of-00002
-            // abc-00001-of-00002.gguf, abc-00002-of-00002.gguf
-            // prefix is abc, postfix is foundation, adaptor-task-x, adaptor-task-y
-            // abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf
-            if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)
-                && !llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str())) {
+            // Two split mode:
+            // - abc-00001-of-00002.gguf, abc-00002-of-00002.gguf, prefix is abc, postfix is 00001-of-00002, 00002-of-00002
+            // - abc-foundation.gguf, abc-adaptor-task-x.gguf, abc-adaptor-task-y.gguf, prefix is abc, postfix is -foundation, -adaptor-task-x, -adaptor-task-y
+            bool foundation_mode = false;
+            if (llama_foundation_prefix(foundation_prefix, sizeof(foundation_prefix), fname.c_str()) && n_split == 2) {
+                foundation_mode = true;
+            }
+            if (!foundation_mode && !llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) {
                 throw std::runtime_error(format("invalid split file: %s", fname.c_str()));
             }
@@ -3565,14 +3567,10 @@ struct llama_model_loader {
             char split_path[PATH_MAX] = {0};
             for (idx = 1; idx < n_split; idx++) {
-                llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
-                // if split path not exist
-                struct stat model_file_info;
-                std::string str_split_path(split_path);
-                auto file_exists = (stat(str_split_path.c_str(), &model_file_info) == 0);
-                if (!file_exists) {
+                if (foundation_mode) {
                     llama_foundation_split_path(split_path, sizeof(split_path), foundation_prefix);
+                } else {
+                    llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split);
                 }

                 struct gguf_init_params split_params = {
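
To make the two naming schemes concrete, here is a small standalone sketch of the file names the loader is expected to resolve in each mode. The format strings are assumptions inferred from the comments in the hunk above, not copied from the implementations of llama_split_path and llama_foundation_split_path.

// illustrative only: assumed "%s-%05d-of-%05d.gguf" and "%s-foundation.gguf" layouts
#include <stdio.h>

int main(void) {
    char path[1024];

    // regular split mode: abc-00001-of-00002.gguf, abc-00002-of-00002.gguf
    const char * split_prefix = "abc";
    for (int idx = 0; idx < 2; idx++) {
        snprintf(path, sizeof(path), "%s-%05d-of-%05d.gguf", split_prefix, idx + 1, 2);
        printf("split      : %s\n", path);
    }

    // foundation mode (n_split == 2): the file passed on the command line is assumed to be
    // the adaptor split, and the remaining split resolves to <prefix>-foundation.gguf
    const char * foundation_prefix = "abc";
    snprintf(path, sizeof(path), "%s-foundation.gguf", foundation_prefix);
    printf("foundation : %s\n", path);
    return 0;
}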
@@ -12595,19 +12593,19 @@ static struct ggml_cgraph * llama_build_graph(
          llama_context & lctx,
     const llama_batch & batch,
                   bool worst_case) {
-    const auto & foundation_model = lctx.model;
+    const auto& foundation_model = lctx.model;
     const llama_model* model_ptr = nullptr;
-    const auto it = lctx.derived_models.find(lctx.cparams.derived_model_name);
-    if (it != lctx.derived_models.end()) {
-        const auto& model_derived = *(it->second);
-        model_ptr = &model_derived;
-    }
-    else {
-        model_ptr = &foundation_model;
-    }
-    const llama_model & model = *model_ptr;
+    const char* model_name = lctx.cparams.derived_model_name.c_str();
+
+    for (const auto& model : lctx.derived_models) {
+        if (model->name == model_name) {
+            model_ptr = model;
+            break;
+        }
+    }
+    model_ptr = model_ptr ? model_ptr : &foundation_model;
+    const llama_model& model = *model_ptr;

     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
@@ -17988,17 +17986,6 @@ struct llama_context * llama_new_context_with_model(
     return ctx;
 }

-struct llama_context * llama_new_context_with_derived_models(
-        struct llama_model * model,
-        struct llama_context_params params,
-        const std::map<std::string, llama_model*> derived_models) {
-    llama_context * ctx = llama_new_context_with_model(model, params);
-    if (ctx) {
-        ctx->derived_models = derived_models;
-    }
-    return ctx;
-}
-
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
@@ -18007,21 +17994,39 @@ const llama_model * llama_get_model(const struct llama_context * ctx) {
     return &ctx->model;
 }

-void llama_print_derived_models(struct llama_context * ctx) {
-    for (const auto & it : ctx->derived_models) {
-        LLAMA_LOG_INFO("%s: %s\n", __func__, it.first.c_str());
+void llama_model_set_name(struct llama_model * model, const char * model_name) {
+    model->name = model_name;
+}
+
+void llama_print_derived_models(const struct llama_context * ctx) {
+    for (const auto & derived_model : ctx->derived_models) {
+        if (!derived_model->name.empty()) {
+            LLAMA_LOG_INFO("%s: %s\n", __func__, derived_model->name.c_str());
+        }
     }
 }

-void llama_set_derived_models(struct llama_context * ctx, const std::map<std::string, llama_model*> derived_models) {
-    ctx->derived_models = derived_models;
+void llama_ctx_set_derived_model(struct llama_context * ctx, struct llama_model * derived_model) {
+    ctx->derived_models.emplace_back(derived_model);
 }

-bool llama_switch_derived_model(struct llama_context* ctx, const std::string derived_model_name) {
+bool llama_ctx_switch_derived_model(struct llama_context* ctx, const char * derived_model_name) {
     llama_synchronize(ctx);

     auto& cparams = ctx->cparams;
-    cparams.derived_model_name = (ctx->derived_models.find(derived_model_name) == ctx->derived_models.end()) ? BASE_MODEL : derived_model_name;
+    int found = 0;
+    const llama_model* model_ptr = nullptr;
+    bool is_derived = false;
+    for (const auto& model : ctx->derived_models) {
+        if (model->name == derived_model_name) {
+            model_ptr = model;
+            is_derived = true;
+            break;
+        }
+    }
+    cparams.derived_model_name = is_derived ? derived_model_name : BASE_MODEL;
     LLAMA_LOG_INFO("%s: %s\n", __func__, cparams.derived_model_name.c_str());

     return true;
@@ -20190,8 +20195,6 @@ int llama_foundation_prefix(char* dest, size_t maxlen, const char* split_path) {
     if (pos != NULL) {
         size_t size_prefix = pos - split_path;
         snprintf(dest, std::min((size_t)size_prefix + 1, maxlen), "%s", split_path);
-        // strncpy(dest, split_path, size_prefix);
-        // dest[size_prefix] = '\0';
         return size_prefix;
     }