diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index c096f110b..14a4698e0 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -485,7 +485,7 @@ int main(int argc, char ** argv) {
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
         if (!embd.empty()) {
-            // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via
+            // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via
             // --prompt or --file which uses the same value.
             int max_embd_size = n_ctx - 4;
 
@@ -500,10 +500,11 @@ int main(int argc, char ** argv) {
                 fflush(stdout);
             }
 
-            // infinite text generation via context swapping
+            // infinite text generation via context swapping UNLESS n_predict == -2
             // if we run out of context:
             // - take the n_keep first tokens from the original prompt (via n_past)
             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
+
             if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) {
                 if (params.n_predict == -2) {
                     LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
@@ -611,12 +612,16 @@ int main(int argc, char ** argv) {
                 n_past += n_eval;
 
                 LOG("n_past = %d\n", n_past);
+                // added 20240110: print context usage every 256 tokens
+                if (n_past % 256 == 0) {
+                    printf("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
+                }
             }
 
             if (!embd.empty() && !path_session.empty()) {
                 session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
                 n_session_consumed = session_tokens.size();
-            }
+            }
         }
 
         embd.clear();
diff --git a/llama.cpp b/llama.cpp
index 3bb056dba..951da4f9e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10881,7 +10881,7 @@ void llama_print_timings(struct llama_context * ctx) {
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s:       total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_sample + timings.n_p_eval + timings.n_eval));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {
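
The comment block in the second hunk describes llama.cpp's context-swap scheme: when the context fills up (and n_predict != -2), the first n_keep tokens are kept and half of the remaining (n_ctx - n_keep) tokens are discarded before generation continues. A standalone sketch of that arithmetic is below; the variable names are borrowed from main.cpp, the n_ctx/n_keep values are hypothetical, and the real code operates on the KV cache rather than a plain vector.

#include <cstdio>
#include <vector>

// Sketch of the context-swap arithmetic described in the hunk above:
// keep the first n_keep tokens, discard half of the movable tail.
int main() {
    const int n_ctx  = 512;
    const int n_keep = 32;
    std::vector<int> ctx_tokens(n_ctx, 0);       // pretend the context is full

    const int n_left    = (int) ctx_tokens.size() - n_keep;
    const int n_discard = n_left / 2;            // half of (n_ctx - n_keep)

    // drop the oldest half of the non-kept tokens; the rest are re-evaluated
    ctx_tokens.erase(ctx_tokens.begin() + n_keep,
                     ctx_tokens.begin() + n_keep + n_discard);

    printf("kept %d prompt tokens, re-using %d, freed %d slots\n",
           n_keep, (int) ctx_tokens.size() - n_keep, n_discard);
    return 0;
}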
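
The progress line added in the third hunk relies on ANSI escape sequences: "\033[31m" switches the terminal foreground to red and "\033[0m" resets it, so the counter stands out from normal output. A minimal standalone version (example values; assumes an ANSI-capable terminal):

#include <cstdio>

int main() {
    int n_past = 256, n_ctx = 4096;   // example values
    // "\033[31m" = red foreground, "\033[0m" = reset attributes
    printf("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx);
    return 0;
}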
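
The llama.cpp hunk extends the total-time line with a token count, which is simply the sum of the three counters llama_print_timings already reports: sampled tokens, prompt-eval tokens, and eval tokens. A toy illustration with made-up numbers follows; the struct is a stand-in that mirrors only the fields used in the diff, not the real llama_timings definition in llama.h.

#include <cstdio>

// Toy stand-in for the timing fields referenced in the diff; the real
// llama_timings struct has more members.
struct toy_timings {
    double t_start_ms, t_end_ms;
    int    n_sample, n_p_eval, n_eval;
};

int main() {
    toy_timings timings = { 0.0, 9876.54, 200, 15, 185 };   // made-up values
    printf("total time = %10.2f ms / %5d tokens\n",
           timings.t_end_ms - timings.t_start_ms,
           timings.n_sample + timings.n_p_eval + timings.n_eval);
    return 0;
}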