From e87c104dfd5c0710166fb5f7193c4a81128829b2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 4 Jun 2024 14:53:54 +0300
Subject: [PATCH] common : passkey use gpt_params

---
 common/common.cpp            | 38 +++++++++++++++-----
 common/common.h              |  4 +++
 examples/passkey/README.md   |  2 +-
 examples/passkey/passkey.cpp | 68 +++++++++++++-----------------------
 4 files changed, 59 insertions(+), 53 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 11fe888c3..460a4440c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -408,7 +408,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-n" || arg == "--n-predict") {
+    if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
         }
@@ -965,26 +965,26 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.use_mlock = true;
         return true;
     }
-    if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+    if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
         }
         params.n_gpu_layers = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
         }
         return true;
     }
-    if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
         }
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
         }
         return true;
     }
@@ -1521,6 +1521,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.chunk_separator = argv[i];
         return true;
     }
+    if (arg == "--junk") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_junk = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--pos") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.i_pos = std::stoi(argv[i]);
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1613,7 +1629,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
 
     options.push_back({ "*",   "-c,    --ctx-size N",    "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
-    options.push_back({ "*",   "-n,    --n-predict N",   "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
+    options.push_back({ "*",   "-n,    --predict N",     "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
     options.push_back({ "*",   "-b,    --batch-size N",  "logical maximum batch size (default: %d)", params.n_batch });
     options.push_back({ "*",   "-ub,   --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
--ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); @@ -1743,8 +1759,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "see https://github.com/ggerganov/llama.cpp/issues/1437" }); if (llama_supports_gpu_offload()) { - options.push_back({ "*", "-ngl, --n-gpu-layers N", "number of layers to store in VRAM" }); - options.push_back({ "*", "-ngld, --n-gpu-layers-draft N", "number of layers to store in VRAM for the draft model" }); + options.push_back({ "*", "-ngl, --gpu-layers N", + "number of layers to store in VRAM" }); + options.push_back({ "*", "-ngld, --gpu-layers-draft N", + "number of layers to store in VRAM for the draft model" }); options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", "how to split the model across multiple GPUs, one of:\n" " - none: use one GPU only\n" @@ -1782,6 +1800,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "retrieval", " --chunk-separator STRING", "separator between chunks (default: '%s')", params.chunk_separator.c_str() }); + options.push_back({ "passkey" }); + options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); + options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); + options.push_back({ "bench" }); options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" }); options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); diff --git a/common/common.h b/common/common.h index b7a7ff2af..e0a08a61b 100644 --- a/common/common.h +++ b/common/common.h @@ -215,6 +215,10 @@ struct gpt_params { int32_t chunk_size = 64; // chunk size for context embedding std::string chunk_separator = "\n"; // chunk separator for context embedding + + // passkey params + int32_t n_junk = 250; // number of times to repeat the junk text + int32_t i_pos = -1; // position of the passkey in the junk text }; void gpt_params_handle_model_default(gpt_params & params); diff --git a/examples/passkey/README.md b/examples/passkey/README.md index 4a22bb559..9e7a119ba 100644 --- a/examples/passkey/README.md +++ b/examples/passkey/README.md @@ -8,5 +8,5 @@ See the following PRs for more info: ### Usage ```bash -make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250 +make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 ``` diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index f2ef9ca10..d03215cd1 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -6,46 +6,32 @@ #include #include +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); + LOG_TEE("\n"); +} + int main(int argc, char ** argv) { gpt_params params; - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]); - return 1 ; + params.n_junk = 250; + params.n_keep = 32; + params.i_pos = -1; + + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; } - int seed = -1; + srand(params.seed == 
 
-    int n_junk = 250; // number of times to repeat the junk text
-    int n_keep = 32;  // number of tokens in the prompt prefix
-    int n_grp  = 1;   // if more than 1 - perform LongLM SelfExtend
-    int i_pos  = -1;  // position of the passkey in the junk text
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        n_junk = std::stoi(argv[2]);
-    }
-
-    if (argc >= 4) {
-        n_grp = std::stoi(argv[3]);
-    }
-
-    if (argc >= 5) {
-        i_pos = std::stoi(argv[4]);
-    }
-
-    if (argc >= 6) {
-        seed = std::stoi(argv[5]);
-    }
-
-    if (seed == -1) {
-        seed = time(NULL);
-    }
-
-    srand(seed);
+    int n_junk = params.n_junk;
+    int n_keep = params.n_keep;
+    int n_grp  = params.grp_attn_n;
+    int i_pos  = params.i_pos;
 
     if (i_pos == -1) {
         i_pos = rand() % n_junk;
@@ -76,9 +62,7 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_default_params();
-
-    model_params.n_gpu_layers = 99; // offload all layers to the GPU
+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
@@ -89,13 +73,9 @@ int main(int argc, char ** argv) {
 
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_default_params();
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
 
-    ctx_params.seed    = seed;
-    ctx_params.n_ctx   = llama_n_ctx_train(model)*n_grp + n_keep;
-    ctx_params.n_batch = 512;
-    ctx_params.n_threads = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
 
     GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
 
@@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("prompt tokens: %d\n", n_tokens_all);
     //LOG_TEE("prompt: %s\n", params.prompt.c_str());
 
-    llama_batch batch = llama_batch_init(512, 0, 1);
+    llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
 
     int n_past = 0;
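
With the new arguments wired into gpt_params, the example is driven entirely by common CLI flags. A usage sketch based on the README change and the print_usage() string added in this patch (the model path, --pos value, and --seed value below are placeholders):

```bash
make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf \
    --junk 250 --pos 90 --keep 32 --grp-attn-n 2 --seed 1234
```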