diff --git a/examples/common.cpp b/examples/common.cpp
index a0b6f10ad..f87c18b76 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -199,6 +199,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.input_prefix = argv[i];
+        } else if (arg == "--forceendtoken") {
+            params.forceendtoken = true;
+        } else if (arg == "--eot_token") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.eot_token = argv[i];
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             gpt_print_usage(argc, argv, default_params);
diff --git a/examples/common.h b/examples/common.h
index 0470368d5..188e1199a 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -36,6 +36,7 @@ struct gpt_params {
 
     std::string lora_adapter = ""; // lora adapter path
     std::string lora_base = ""; // base model path for the lora adapter
+    std::string eot_token = "[end of text]"; // text printed for the end-of-text token
 
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
@@ -52,6 +53,7 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
     bool mem_test = false; // compute maximum memory usage
     bool verbose_prompt = false; // print prompt tokens before generation
+    bool forceendtoken = true; // force printing the end-of-text token after generation
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index decf41a9f..3cd9afd90 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -245,7 +245,8 @@ int main(int argc, char ** argv) {
                " - Press Ctrl+C to interject at any time.\n"
 #endif
                " - Press Return to return control to LLaMa.\n"
-               " - If you want to submit another line, end your input in '\\'.\n\n");
+               " - If you want to submit another line, end your input in '\\'.\n"
+               "[model ready]\n");
 
         is_interacting = params.interactive_start;
     }
@@ -388,6 +389,9 @@ int main(int argc, char ** argv) {
                     is_antiprompt = true;
                     set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
                     fflush(stdout);
+                    if (params.forceendtoken) {
+                        fprintf(stderr, "%s\n", params.eot_token.c_str());
+                    }
                     break;
                 }
             }
@@ -470,10 +474,13 @@ int main(int argc, char ** argv) {
 
         // end of text token
         if (!embd.empty() && embd.back() == llama_token_eos()) {
+            if (params.forceendtoken || !params.instruct) {
+                fprintf(stderr, "%s\n", params.eot_token.c_str());
+            }
             if (params.instruct) {
                 is_interacting = true;
-            } else {
-                fprintf(stderr, " [end of text]\n");
+            }
+            else {
                 break;
             }
         }
@@ -481,6 +488,9 @@
         // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
         if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
             n_remain = params.n_predict;
+            if (params.forceendtoken) {
+                fprintf(stderr, "%s\n", params.eot_token.c_str());
+            }
             is_interacting = true;
         }
     }
diff --git a/llama.cpp b/llama.cpp
index bc0ef1281..03b2dbe44 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -938,6 +938,7 @@ static void llama_model_load_internal(
 
         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+        fprintf(stderr, "[model loading]\n");
     }
 
     // create the ggml context
@@ -1749,7 +1750,7 @@ struct llama_context * llama_init_from_file(
             unsigned percentage = (unsigned) (100 * progress);
             while (percentage > *cur_percentage_p) {
                 ++*cur_percentage_p;
-                fprintf(stderr, ".");
+                fprintf(stderr, "[percentage] %u%%\n", *cur_percentage_p);
                 fflush(stderr);
                 if (percentage >= 100) {
                     fprintf(stderr, "\n");
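
For reference, a hypothetical invocation exercising the new options could look like the following (binary path, model path, and prompt are placeholders, not taken from this change):

    ./main -m ./models/7B/ggml-model.bin -p "Hello" -i --forceendtoken --eot_token "<|end|>"

With --eot_token the printed end marker replaces the default "[end of text]" string; when forceendtoken is set (this patch defaults it to true), the marker is also printed when generation stops at a reverse prompt or when the n_predict limit is reached in interactive mode.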