main: replaced fprintf/LOG_TEE, some trace logging
This commit is contained in:
parent
360b36c921
commit
181e8a9902
1 changed files with 77 additions and 53 deletions
|
@ -56,14 +56,20 @@ void sigint_handler(int signo) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
LOG("Log start - start of main()\n")
|
|
||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
if (gpt_params_parse(argc, argv, params) == false) {
|
if (gpt_params_parse(argc, argv, params) == false) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
log_set_target(LOG_FILENAME_GENERATOR("main", "log"));
|
||||||
|
LOG_TEE("Log start\n")
|
||||||
|
|
||||||
|
log_dump_cmdline(argc,argv);
|
||||||
|
|
||||||
|
// TODO: Dump params ?
|
||||||
|
//LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity))
|
||||||
|
|
||||||
// save choice to use color for later
|
// save choice to use color for later
|
||||||
// (note for later: this is a slightly awkward choice)
|
// (note for later: this is a slightly awkward choice)
|
||||||
console::init(params.simple_io, params.use_color);
|
console::init(params.simple_io, params.use_color);
|
||||||
|
@ -86,34 +92,35 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.rope_freq_base != 10000.0) {
|
if (params.rope_freq_base != 10000.0) {
|
||||||
fprintf(stderr, "%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
|
LOG_TEE("%s: warning: changing RoPE frequency base to %g (default 10000.0)\n", __func__, params.rope_freq_base);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.rope_freq_scale != 1.0) {
|
if (params.rope_freq_scale != 1.0) {
|
||||||
fprintf(stderr, "%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
|
LOG_TEE("%s: warning: scaling RoPE frequency by %g (default 1.0)\n", __func__, params.rope_freq_scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_ctx > 2048) {
|
if (params.n_ctx > 2048) {
|
||||||
// TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
|
// TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
|
||||||
fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
|
LOG_TEE("%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
|
||||||
} else if (params.n_ctx < 8) {
|
} else if (params.n_ctx < 8) {
|
||||||
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||||
params.n_ctx = 8;
|
params.n_ctx = 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
LOG_TEE("%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||||
params.seed = time(NULL);
|
params.seed = time(NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
|
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
|
||||||
|
|
||||||
std::mt19937 rng(params.seed);
|
std::mt19937 rng(params.seed);
|
||||||
if (params.random_prompt) {
|
if (params.random_prompt) {
|
||||||
params.prompt = gpt_random_prompt(rng);
|
params.prompt = gpt_random_prompt(rng);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG("llama backend init\n")
|
||||||
llama_backend_init(params.numa);
|
llama_backend_init(params.numa);
|
||||||
|
|
||||||
llama_model * model;
|
llama_model * model;
|
||||||
|
@ -122,6 +129,7 @@ int main(int argc, char ** argv) {
|
||||||
g_ctx = &ctx;
|
g_ctx = &ctx;
|
||||||
|
|
||||||
// load the model and apply lora adapter, if any
|
// load the model and apply lora adapter, if any
|
||||||
|
LOG("load the model and apply lora adapter, if any\n")
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
if (params.cfg_scale > 1.f) {
|
if (params.cfg_scale > 1.f) {
|
||||||
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
|
||||||
|
@ -129,14 +137,14 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (model == NULL) {
|
if (model == NULL) {
|
||||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
LOG_TEE("%s: error: unable to load model\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
fprintf(stderr, "\n");
|
LOG_TEE("\n");
|
||||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
LOG_TEE("system_info: n_threads = %d / %d | %s\n",
|
||||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -144,7 +152,7 @@ int main(int argc, char ** argv) {
|
||||||
// uncomment the "used_mem" line in llama.cpp to see the results
|
// uncomment the "used_mem" line in llama.cpp to see the results
|
||||||
if (params.mem_test) {
|
if (params.mem_test) {
|
||||||
{
|
{
|
||||||
fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
|
LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
|
||||||
|
|
||||||
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
|
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
|
||||||
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
|
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
|
||||||
|
@ -170,7 +178,7 @@ int main(int argc, char ** argv) {
|
||||||
std::vector<llama_token> session_tokens;
|
std::vector<llama_token> session_tokens;
|
||||||
|
|
||||||
if (!path_session.empty()) {
|
if (!path_session.empty()) {
|
||||||
fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
|
||||||
|
|
||||||
// fopen to check for existing session
|
// fopen to check for existing session
|
||||||
FILE * fp = std::fopen(path_session.c_str(), "rb");
|
FILE * fp = std::fopen(path_session.c_str(), "rb");
|
||||||
|
@ -180,21 +188,22 @@ int main(int argc, char ** argv) {
|
||||||
session_tokens.resize(params.n_ctx);
|
session_tokens.resize(params.n_ctx);
|
||||||
size_t n_token_count_out = 0;
|
size_t n_token_count_out = 0;
|
||||||
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
|
||||||
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
session_tokens.resize(n_token_count_out);
|
session_tokens.resize(n_token_count_out);
|
||||||
llama_set_rng_seed(ctx, params.seed);
|
llama_set_rng_seed(ctx, params.seed);
|
||||||
|
|
||||||
fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
|
LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
|
LOG_TEE("%s: session file does not exist, will create\n", __func__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
|
LOG("Tokenize the prompt\n")
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||||
embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
||||||
|
@ -202,9 +211,14 @@ int main(int argc, char ** argv) {
|
||||||
embd_inp = session_tokens;
|
embd_inp = session_tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG("prompt: \"%s\"\n", LOG_TOSTR(params.prompt))
|
||||||
|
//LOG("Prompt tokenized: %s\n", LOG_TOSTR(embd_inp))
|
||||||
|
LOG("embd_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(embd_inp,ctx))
|
||||||
|
|
||||||
// Should not run without any tokens
|
// Should not run without any tokens
|
||||||
if (embd_inp.empty()) {
|
if (embd_inp.empty()) {
|
||||||
embd_inp.push_back(llama_token_bos(ctx));
|
embd_inp.push_back(llama_token_bos(ctx));
|
||||||
|
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(embd_inp,ctx))
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tokenize negative prompt
|
// Tokenize negative prompt
|
||||||
|
@ -213,17 +227,24 @@ int main(int argc, char ** argv) {
|
||||||
int original_prompt_len = 0;
|
int original_prompt_len = 0;
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
params.cfg_negative_prompt.insert(0, 1, ' ');
|
params.cfg_negative_prompt.insert(0, 1, ' ');
|
||||||
|
LOG("cfg_negative_prompt: \"%s\"\n", LOG_TOSTR(params.cfg_negative_prompt))
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
|
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
|
||||||
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(guidance_inp,ctx_guidance))
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
||||||
|
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(original_inp,ctx))
|
||||||
original_prompt_len = original_inp.size();
|
original_prompt_len = original_inp.size();
|
||||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||||
|
LOG("original_prompt_len: %s", LOG_TOSTR(original_prompt_len))
|
||||||
|
LOG("guidance_offset: %s", LOG_TOSTR(guidance_offset))
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
LOG("n_ctx: %d\n", n_ctx)
|
||||||
|
|
||||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||||
fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -237,14 +258,14 @@ int main(int argc, char ** argv) {
|
||||||
n_matching_session_tokens++;
|
n_matching_session_tokens++;
|
||||||
}
|
}
|
||||||
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
|
if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) {
|
||||||
fprintf(stderr, "%s: using full prompt from session file\n", __func__);
|
LOG_TEE("%s: using full prompt from session file\n", __func__);
|
||||||
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
} else if (n_matching_session_tokens >= embd_inp.size()) {
|
||||||
fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
|
LOG_TEE("%s: session file has exact match for prompt!\n", __func__);
|
||||||
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
} else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
|
||||||
fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
|
||||||
__func__, n_matching_session_tokens, embd_inp.size());
|
__func__, n_matching_session_tokens, embd_inp.size());
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
|
LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n",
|
||||||
__func__, n_matching_session_tokens, embd_inp.size());
|
__func__, n_matching_session_tokens, embd_inp.size());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -253,6 +274,9 @@ int main(int argc, char ** argv) {
|
||||||
// reevaluation of the last token token to recalculate the cached logits
|
// reevaluation of the last token token to recalculate the cached logits
|
||||||
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
|
if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
|
||||||
session_tokens.size() > embd_inp.size()) {
|
session_tokens.size() > embd_inp.size()) {
|
||||||
|
LOG(
|
||||||
|
"recalculate the cached logits: embd_inp.empty() %s, n_matching_session_tokens %lu, embd_inp.size() %lu, session_tokens.size() %lu, embd_inp.size() %lu, session_tokens.resize( %lu )",
|
||||||
|
LOG_TOSTR(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size(), embd_inp.size() - 1)
|
||||||
session_tokens.resize(embd_inp.size() - 1);
|
session_tokens.resize(embd_inp.size() - 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -277,30 +301,30 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.verbose_prompt) {
|
if (params.verbose_prompt) {
|
||||||
fprintf(stderr, "\n");
|
LOG_TEE("\n");
|
||||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
|
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
fprintf(stderr, "\n");
|
LOG_TEE("\n");
|
||||||
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
|
||||||
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
|
||||||
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
for (int i = 0; i < (int) guidance_inp.size(); i++) {
|
||||||
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
|
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_keep > 0) {
|
if (params.n_keep > 0) {
|
||||||
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
|
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
|
||||||
for (int i = 0; i < params.n_keep; i++) {
|
for (int i = 0; i < params.n_keep; i++) {
|
||||||
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
|
LOG_TEE("%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
|
||||||
}
|
}
|
||||||
fprintf(stderr, "'\n");
|
LOG_TEE("'\n");
|
||||||
}
|
}
|
||||||
fprintf(stderr, "\n");
|
LOG_TEE("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
|
@ -317,30 +341,30 @@ int main(int argc, char ** argv) {
|
||||||
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
fprintf(stderr, "%s: interactive mode on.\n", __func__);
|
LOG_TEE("%s: interactive mode on.\n", __func__);
|
||||||
|
|
||||||
if (params.antiprompt.size()) {
|
if (params.antiprompt.size()) {
|
||||||
for (auto antiprompt : params.antiprompt) {
|
for (auto antiprompt : params.antiprompt) {
|
||||||
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
|
LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.input_prefix_bos) {
|
if (params.input_prefix_bos) {
|
||||||
fprintf(stderr, "Input prefix with BOS\n");
|
LOG_TEE("Input prefix with BOS\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.input_prefix.empty()) {
|
if (!params.input_prefix.empty()) {
|
||||||
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
|
LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!params.input_suffix.empty()) {
|
if (!params.input_suffix.empty()) {
|
||||||
fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
|
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
|
||||||
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
|
||||||
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||||
fprintf(stderr, "\n\n");
|
LOG_TEE("\n\n");
|
||||||
|
|
||||||
grammar_parser::parse_state parsed_grammar;
|
grammar_parser::parse_state parsed_grammar;
|
||||||
llama_grammar * grammar = NULL;
|
llama_grammar * grammar = NULL;
|
||||||
|
@ -350,14 +374,14 @@ int main(int argc, char ** argv) {
|
||||||
if (parsed_grammar.rules.empty()) {
|
if (parsed_grammar.rules.empty()) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%s: grammar:\n", __func__);
|
LOG_TEE("%s: grammar:\n", __func__);
|
||||||
grammar_parser::print_grammar(stderr, parsed_grammar);
|
grammar_parser::print_grammar(stderr, parsed_grammar);
|
||||||
fprintf(stderr, "\n");
|
LOG_TEE("\n");
|
||||||
|
|
||||||
{
|
{
|
||||||
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
auto it = params.logit_bias.find(llama_token_eos(ctx));
|
||||||
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
if (it != params.logit_bias.end() && it->second == -INFINITY) {
|
||||||
fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -434,7 +458,7 @@ int main(int argc, char ** argv) {
|
||||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||||
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
|
||||||
if (params.n_predict == -2) {
|
if (params.n_predict == -2) {
|
||||||
fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__);
|
LOG_TEE("\n\n%s: context full, stopping generation\n", __func__);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -503,11 +527,11 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
input_buf = embd_guidance.data();
|
input_buf = embd_guidance.data();
|
||||||
input_size = embd_guidance.size();
|
input_size = embd_guidance.size();
|
||||||
//fprintf(stderr, "\n---------------------\n");
|
//LOG_TEE("\n---------------------\n");
|
||||||
//for (int i = 0; i < (int) embd_guidance.size(); i++) {
|
//for (int i = 0; i < (int) embd_guidance.size(); i++) {
|
||||||
//fprintf(stderr, "%s", llama_token_to_str(ctx, embd_guidance[i]));
|
//LOG_TEE("%s", llama_token_to_str(ctx, embd_guidance[i]));
|
||||||
//}
|
//}
|
||||||
//fprintf(stderr, "\n---------------------\n");
|
//LOG_TEE("\n---------------------\n");
|
||||||
} else {
|
} else {
|
||||||
input_buf = embd.data();
|
input_buf = embd.data();
|
||||||
input_size = embd.size();
|
input_size = embd.size();
|
||||||
|
@ -516,7 +540,7 @@ int main(int argc, char ** argv) {
|
||||||
for (int i = 0; i < input_size; i += params.n_batch) {
|
for (int i = 0; i < input_size; i += params.n_batch) {
|
||||||
int n_eval = std::min(input_size - i, params.n_batch);
|
int n_eval = std::min(input_size - i, params.n_batch);
|
||||||
if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
|
if (llama_eval(ctx_guidance, input_buf + i, n_eval, n_past_guidance, params.n_threads)) {
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
LOG_TEE("%s : failed to eval\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -530,7 +554,7 @@ int main(int argc, char ** argv) {
|
||||||
n_eval = params.n_batch;
|
n_eval = params.n_batch;
|
||||||
}
|
}
|
||||||
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
|
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
|
||||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
LOG_TEE("%s : failed to eval\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
n_past += n_eval;
|
n_past += n_eval;
|
||||||
|
@ -796,7 +820,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// end of text token
|
// end of text token
|
||||||
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
|
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
|
||||||
fprintf(stderr, " [end of text]\n");
|
LOG_TEE(" [end of text]\n");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -809,7 +833,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) {
|
||||||
fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
|
||||||
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -823,7 +847,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
LOG("Log end - end of main()\n")
|
LOG_TEE("Log end\n")
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue