split behavior into --session and --prompt-cache

Evan Jones 2023-05-06 20:27:15 -04:00
parent 4c76d52bb8
commit 56758f033c
4 changed files with 25 additions and 14 deletions
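For orientation before the diffs: --prompt-cache only persists the prompt evaluation state so later runs start faster, while --session additionally keeps appending the generated tokens so a run can be continued. A minimal C++ sketch of that split follows; the helper and its name are hypothetical, only the two path fields mirror gpt_params.

// Illustrative sketch only, not part of this commit.
#include <string>
#include <utility>

// Decide which file backs the llama state and whether generations should
// keep being written to it (only --session keeps saving after the prompt).
static std::pair<std::string, bool> resolve_state_file(
        const std::string & path_prompt_cache,   // set by --prompt-cache FNAME
        const std::string & path_session) {      // set by --session FNAME
    const bool save_generations = !path_session.empty();
    return { save_generations ? path_session : path_prompt_cache, save_generations };
}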

examples/common.cpp

@@ -118,14 +118,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.prompt = argv[i];
         } else if (arg == "-e") {
             escape_prompt = true;
+        } else if (arg == "--prompt-cache") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.path_prompt_cache = argv[i];
         } else if (arg == "--session") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.path_session = argv[i];
-        } else if (arg == "--session-full") {
-            params.session_full = true;
         } else if (arg == "-f" || arg == "--file") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -344,6 +348,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         gpt_print_usage(argc, argv, default_params);
         exit(1);
     }
+    if (!params.path_session.empty() && !params.path_prompt_cache.empty()) {
+        fprintf(stderr, "error: only one of --prompt-cache or --session may be specified\n");
+        gpt_print_usage(argc, argv, default_params);
+        exit(1);
+    }
     if (escape_prompt) {
         process_escapes(params.prompt);
     }
@@ -369,8 +378,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
     fprintf(stderr, " prompt to start generation with (default: empty)\n");
     fprintf(stderr, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
-    fprintf(stderr, " --session-full if specified, saves output to the session file in addition to prompt\n");
+    fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
+    fprintf(stderr, " --session FNAME file to store prompt and generations, allowing continuation (default: none)\n");
     fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
     fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");

examples/common.h

@@ -46,9 +46,10 @@ struct gpt_params {
     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
     std::string prompt = "";
-    std::string path_session = ""; // path to file for saving/loading model eval state
-    std::string input_prefix = ""; // string to prefix user inputs with
-    std::string input_suffix = ""; // string to suffix user inputs with
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+    std::string path_session = ""; // file for saving/loading prompt and generations
+    std::string input_prefix = ""; // string to prefix user inputs with
+    std::string input_suffix = ""; // string to suffix user inputs with
     std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

     std::string lora_adapter = ""; // lora adapter path
@@ -58,7 +59,6 @@ struct gpt_params {
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
-    bool session_full = false; // save the output to the session file in addition to prompt
     bool embedding = false; // get only sentence embedding
     bool interactive_first = false; // wait for user input immediately

examples/main/README.md

@@ -270,9 +270,9 @@ These options help improve the performance and memory usage of the LLaMA models.
 - `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.

-### Session Caching
+### Prompt Caching

-- `--session FNAME`: Specify a file to load/save the session, which caches the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The session file is created during the first run and is reused in subsequent runs. If you change your prompt such that 75% or less of the session is reusable, the existing session file will be overwritten with a new, updated version to maintain optimal performance.
+- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs.

 ### Quantization
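As a rough illustration of the load-or-create flow the new README text describes, here is a condensed, simplified sketch of what main.cpp does with the cache file. llama_save_session_file appears in the main.cpp hunks below and llama_load_session_file is the library's matching loader; the wrapper function and its name are assumptions made for the example.

#include <string>
#include <vector>
#include "llama.h"

// First run: the file does not exist yet, so start empty and create it after
// evaluating the prompt. Later runs: load the cached tokens/state and reuse them.
static std::vector<llama_token> load_or_init_cache(llama_context * ctx, const std::string & path, int n_ctx) {
    std::vector<llama_token> session_tokens(n_ctx);
    size_t n_token_count = 0;
    if (llama_load_session_file(ctx, path.c_str(),
                                session_tokens.data(), session_tokens.size(), &n_token_count)) {
        session_tokens.resize(n_token_count);   // reuse cached prompt evaluation
    } else {
        session_tokens.clear();                 // nothing cached yet
    }
    return session_tokens;
}
// ... after evaluating the prompt (and, with --session, the generations):
// llama_save_session_file(ctx, path.c_str(), session_tokens.data(), session_tokens.size());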

examples/main/main.cpp

@@ -139,8 +139,10 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');

-    std::string path_session = params.path_session;
+    std::string path_session =
+        !params.path_session.empty() ? params.path_session : params.path_prompt_cache;
     std::vector<llama_token> session_tokens;
+    bool resume_session = !params.path_session.empty();

     if (!path_session.empty()) {
         fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str());
@@ -323,8 +325,8 @@ int main(int argc, char ** argv) {
                 // insert n_left/2 tokens at the start of embd from last_n_tokens
                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());

-                // stop saving session if we run out of context
-                if (!path_session.empty() && params.session_full) {
+                // stop saving session if we run out of context, saving whatever was evaled
+                if (!path_session.empty() && resume_session) {
                     llama_save_session_file(ctx, path_session.c_str(),
                         session_tokens.data(), session_tokens.size());
                 }
@@ -603,7 +605,7 @@ int main(int argc, char ** argv) {
         }
     }

-    if (!path_session.empty() && params.session_full) {
+    if (!path_session.empty() && resume_session) {
         fprintf(stderr, "\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str());
         llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size());
     }