main: replaced fprintf with LOG_TEE, added some trace logging

Author: staviq
Date:   2023-08-25 05:22:23 +02:00
Parent: 360b36c921
Commit: 181e8a9902
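Context for the change (editor's note, not part of the commit): judging by how the macros are used below, LOG(...) writes trace output only to the current log target, while LOG_TEE(...) also mirrors ("tees") the message to stderr. That is why the user-facing fprintf(stderr, ...) calls become LOG_TEE, while the new trace-only lines use LOG. A minimal sketch of that split, assuming a single global log file and C-style varargs (illustrative only, not the PR's actual log.h implementation):

// Illustrative sketch only -- not the log.h from this PR. Assumes one global
// log file; LOG() writes there, LOG_TEE() also mirrors the message to stderr.
#include <cstdarg>
#include <cstdio>

static FILE * g_logfile = nullptr;

static void log_write(FILE * tee, const char * fmt, ...) {
    va_list args;
    if (g_logfile) {
        va_start(args, fmt);
        vfprintf(g_logfile, fmt, args);
        va_end(args);
        fflush(g_logfile);
    }
    if (tee) {
        va_start(args, fmt);
        vfprintf(tee, fmt, args);
        va_end(args);
    }
}

// Semicolons inside the macros mimic the semicolon-less call style seen in
// the diff, e.g. LOG("n_ctx: %d\n", n_ctx) without a trailing ';'.
#define LOG(...)     log_write(nullptr, __VA_ARGS__);
#define LOG_TEE(...) log_write(stderr,  __VA_ARGS__);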


@@ -56,14 +56,20 @@ void sigint_handler(int signo) {
 #endif
 
 int main(int argc, char ** argv) {
+    LOG("Log start - start of main()\n")
     gpt_params params;
 
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
 
+    log_set_target(LOG_FILENAME_GENERATOR("main", "log"));
+    LOG_TEE("Log start\n")
+    log_dump_cmdline(argc,argv);
+    // TODO: Dump params ?
+    //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity))
 
     // save choice to use color for later
     // (note for later: this is a slightly awkward choice)
     console::init(params.simple_io, params.use_color);
@@ -86,34 +92,35 @@ int main(int argc, char ** argv) {
     }
 
     if (params.rope_freq_base != 10000.0) {
-        fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
+        LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
     }
 
     if (params.rope_freq_scale != 1.0) {
-        fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
+        LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
     }
 
     if (params.n_ctx > 2048) {
         // TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
-        fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
+        LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
     } else if (params.n_ctx < 8) {
-        fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
+        LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
         params.n_ctx = 8;
     }
 
-    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
+    LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
 
-    fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
+    LOG_TEE("%s: seed = %u\n", __func__, params.seed);
 
     std::mt19937 rng(params.seed);
     if (params.random_prompt) {
         params.prompt = gpt_random_prompt(rng);
     }
 
+    LOG("llama backend init\n")
     llama_backend_init(params.numa);
 
     llama_model * model;
@@ -122,6 +129,7 @@ int main(int argc, char ** argv) {
     g_ctx = &ctx;
 
     // load the model and apply lora adapter, if any
+    LOG("load the model and apply lora adapter, if any\n")
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
     if (params.cfg_scale > 1.f) {
         struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
@@ -129,14 +137,14 @@ int main(int argc, char ** argv) {
     }
 
     if (model == NULL) {
-        fprintf(stderr, "%s: error: unable to load model\n", __func__);
+        LOG_TEE("%s: error: unable to load model\n", __func__);
         return 1;
     }
 
     // print system information
     {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+        LOG_TEE("\n");
+        LOG_TEE("system_info: n_threads = %d / %d | %s\n",
                 params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }
@@ -144,7 +152,7 @@ int main(int argc, char ** argv) {
     // uncomment the "used_mem" line in llama.cpp to see the results
     if (params.mem_test) {
         {
-            fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
+            LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
 
             const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
             llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
@@ -170,7 +178,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> session_tokens;
 
     if (!path_session.empty()) {
-        fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
+        LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
 
         // fopen to check for existing session
         FILE * fp = std::fopen(path_session.c_str(), "rb");
@@ -180,21 +188,22 @@ int main(int argc, char ** argv) {
             session_tokens.resize(params.n_ctx);
             size_t n_token_count_out = 0;
             if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
-                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+                LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                 return 1;
             }
             session_tokens.resize(n_token_count_out);
             llama_set_rng_seed(ctx, params.seed);
 
-            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
+            LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
         } else {
-            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
+            LOG_TEE("%s: session file does not exist, will create\n", __func__);
         }
     }
 
     const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
 
     // tokenize the prompt
+    LOG("Tokenize the prompt\n")
     std::vector<llama_token> embd_inp;
     if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
         embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
@@ -202,9 +211,14 @@ int main(int argc, char ** argv) {
         embd_inp = session_tokens;
     }
 
+    LOG("prompt: \"%s\"\n", LOG_TOSTR(params.prompt))
+    //LOG("Prompt tokenized: %s\n", LOG_TOSTR(embd_inp))
+    LOG("embd_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(embd_inp,ctx))
+
     // Should not run without any tokens
     if (embd_inp.empty()) {
         embd_inp.push_back(llama_token_bos(ctx));
+        LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(embd_inp,ctx))
     }
 
     // Tokenize negative prompt
@@ -213,17 +227,24 @@ int main(int argc, char ** argv) {
     int original_prompt_len = 0;
     if (ctx_guidance) {
         params.cfg_negative_prompt.insert(0, 1, ' ');
+        LOG("cfg_negative_prompt: \"%s\"\n", LOG_TOSTR(params.cfg_negative_prompt))
         guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
+        LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(guidance_inp,ctx_guidance))
 
         std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
+        LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(original_inp,ctx))
 
         original_prompt_len = original_inp.size();
         guidance_offset = (int)guidance_inp.size() - original_prompt_len;
+        LOG("original_prompt_len: %s", LOG_TOSTR(original_prompt_len))
+        LOG("guidance_offset: %s", LOG_TOSTR(guidance_offset))
     }
 
     const int n_ctx = llama_n_ctx(ctx);
+    LOG("n_ctx: %d\n", n_ctx)
+
     if ((int) embd_inp.size() > n_ctx - 4) {
-        fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
         return 1;
     }
@@ -237,14 +258,14 @@ int main(int argc, char ** argv) {
             n_matching_session_tokens++;
         }
         if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
-            fprintf(stderr, "%s: using full prompt from session file\n", __func__);
+            LOG_TEE("%s: using full prompt from session file\n", __func__);
         } else if (n_matching_session_tokens >= embd_inp.size()) {
-            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+            LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
         } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
-            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+            LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
                 __func__, n_matching_session_tokens, embd_inp.size());
         } else {
-            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
+            LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
                 __func__, n_matching_session_tokens, embd_inp.size());
         }
     }
@@ -253,6 +274,9 @@ int main(int argc, char ** argv) {
     // reevaluation of the last token token to recalculate the cached logits
     if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
             session_tokens.size() > embd_inp.size()) {
+        LOG(
+            "recalculate the cached logits: embd_inp.empty() %s, n_matching_session_tokens %lu, embd_inp.size() %lu, session_tokens.size() %lu, embd_inp.size() %lu, session_tokens.resize( %lu )",
+            LOG_TOSTR(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size(), embd_inp.size() - 1)
         session_tokens.resize(embd_inp.size() - 1);
     }
@@ -277,30 +301,30 @@ int main(int argc, char ** argv) {
     }
 
     if (params.verbose_prompt) {
-        fprintf(stderr, "\n");
-        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        LOG_TEE("\n");
+        LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
         for (int i = 0; i < (int) embd_inp.size(); i++) {
-            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
+            LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
         }
 
         if (ctx_guidance) {
-            fprintf(stderr, "\n");
-            fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
-            fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
+            LOG_TEE("\n");
+            LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
+            LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
             for (int i = 0; i < (int) guidance_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
+                LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
             }
         }
 
         if (params.n_keep > 0) {
-            fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
+            LOG_TEE("%s: static prompt based on n_keep: '", __func__);
             for (int i = 0; i < params.n_keep; i++) {
-                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
+                LOG_TEE("%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
             }
-            fprintf(stderr, "'\n");
+            LOG_TEE("'\n");
         }
-        fprintf(stderr, "\n");
+        LOG_TEE("\n");
     }
 
     if (params.interactive) {
@@ -317,30 +341,30 @@ int main(int argc, char ** argv) {
         SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
 #endif
 
-        fprintf(stderr, "%s: interactive mode on.\n", __func__);
+        LOG_TEE("%s: interactive mode on.\n", __func__);
 
         if (params.antiprompt.size()) {
             for (auto antiprompt : params.antiprompt) {
-                fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
+                LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
             }
         }
 
         if (params.input_prefix_bos) {
-            fprintf(stderr, "Input prefix with BOS\n");
+            LOG_TEE("Input prefix with BOS\n");
        }
 
         if (!params.input_prefix.empty()) {
-            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
+            LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
         }
 
         if (!params.input_suffix.empty()) {
-            fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
+            LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
-    fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
+    LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
             params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
-    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
-    fprintf(stderr, "\n\n");
+    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
+    LOG_TEE("\n\n");
 
     grammar_parser::parse_state parsed_grammar;
     llama_grammar * grammar = NULL;
@@ -350,14 +374,14 @@ int main(int argc, char ** argv) {
         if (parsed_grammar.rules.empty()) {
             return 1;
         }
-        fprintf(stderr, "%s: grammar:\n", __func__);
+        LOG_TEE("%s: grammar:\n", __func__);
         grammar_parser::print_grammar(stderr, parsed_grammar);
-        fprintf(stderr, "\n");
+        LOG_TEE("\n");
 
         {
             auto it = params.logit_bias.find(llama_token_eos(ctx));
             if (it != params.logit_bias.end() && it->second == -INFINITY) {
-                fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
+                LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
             }
         }
@@ -434,7 +458,7 @@ int main(int argc, char ** argv) {
             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
             if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
                 if (params.n_predict == -2) {
-                    fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__);
+                    LOG_TEE("\n\n%s: context full, stopping generation\n", __func__);
                     break;
                 }
@@ -503,11 +527,11 @@ int main(int argc, char ** argv) {
                 input_buf = embd_guidance.data();
                 input_size = embd_guidance.size();
-                //fprintf(stderr, "\n---------------------\n");
+                //LOG_TEE("\n---------------------\n");
                 //for (int i = 0; i < (int) embd_guidance.size(); i++) {
-                //fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
+                //LOG_TEE("%s", llama_token_to_str(ctx, embd_guidance[i]));
                 //}
-                //fprintf(stderr, "\n---------------------\n");
+                //LOG_TEE("\n---------------------\n");
             } else {
                 input_buf = embd.data();
                 input_size = embd.size();
@@ -516,7 +540,7 @@ int main(int argc, char ** argv) {
             for (int i = 0; i < input_size; i += params.n_batch) {
                 int n_eval = std::min(input_size - i, params.n_batch);
                 if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
-                    fprintf(stderr, "%s : failed to eval\n", __func__);
+                    LOG_TEE("%s : failed to eval\n", __func__);
                     return 1;
                 }
@@ -530,7 +554,7 @@ int main(int argc, char ** argv) {
                 n_eval = params.n_batch;
             }
             if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
+                LOG_TEE("%s : failed to eval\n", __func__);
                 return 1;
             }
             n_past += n_eval;
@@ -796,7 +820,7 @@ int main(int argc, char ** argv) {
         // end of text token
         if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
-            fprintf(stderr, " [end of text]\n");
+            LOG_TEE(" [end of text]\n");
             break;
         }
@@ -809,7 +833,7 @@ int main(int argc, char ** argv) {
     }
 
     if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
-        fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
+        LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
         llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }
@@ -823,7 +847,7 @@ int main(int argc, char ** argv) {
     }
     llama_backend_free();
 
-    LOG("Log end - end of main()\n")
+    LOG_TEE("Log end\n")
 
     return 0;
 }
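Editor's note on the setup near the top of main(): log_set_target(LOG_FILENAME_GENERATOR("main", "log")) points the logger at a generated file name, and log_dump_cmdline(argc,argv) records the exact invocation there, so after this change a run keeps its usual stderr output (everything routed through LOG_TEE) and additionally leaves a log file containing the LOG-only trace lines. A hypothetical stand-in for the filename generator, assuming it simply derives a name from a prefix and an extension (the PR's macro may also mix in a PID or timestamp):

// Hypothetical stand-in for LOG_FILENAME_GENERATOR -- an assumption for
// illustration, not the PR's definition.
#include <string>

static std::string log_filename_generator(const char * prefix, const char * ext) {
    return std::string(prefix) + "." + ext;   // e.g. "main.log"
}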