common : passkey use gpt_params

Georgi Gerganov 2024-06-04 14:53:54 +03:00
parent 4df81854ab
commit e87c104dfd
4 changed files with 59 additions and 53 deletions

common/common.cpp

@@ -408,7 +408,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-n" || arg == "--n-predict") {
+    if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
@@ -965,26 +965,26 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.use_mlock = true;
         return true;
     }
-    if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+    if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
         }
         params.n_gpu_layers = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
         }
         return true;
     }
-    if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
         }
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
         }
         return true;
@@ -1521,6 +1521,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.chunk_separator = argv[i];
         return true;
     }
+    if (arg == "--junk") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_junk = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--pos") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.i_pos = std::stoi(argv[i]);
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1613,7 +1629,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                        "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
     options.push_back({ "*", "-c,    --ctx-size N",    "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
-    options.push_back({ "*", "-n,    --n-predict N",   "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
+    options.push_back({ "*", "-n,    --predict N",     "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
     options.push_back({ "*", "-b,    --batch-size N",  "logical maximum batch size (default: %d)", params.n_batch });
     options.push_back({ "*", "-ub,   --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
     options.push_back({ "*", "       --keep N",        "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
@@ -1743,8 +1759,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                        "see https://github.com/ggerganov/llama.cpp/issues/1437" });
     if (llama_supports_gpu_offload()) {
-        options.push_back({ "*", "-ngl,  --n-gpu-layers N",       "number of layers to store in VRAM" });
-        options.push_back({ "*", "-ngld, --n-gpu-layers-draft N", "number of layers to store in VRAM for the draft model" });
+        options.push_back({ "*", "-ngl,  --gpu-layers N",
+                            "number of layers to store in VRAM" });
+        options.push_back({ "*", "-ngld, --gpu-layers-draft N",
+                            "number of layers to store in VRAM for the draft model" });
         options.push_back({ "*", "-sm,   --split-mode SPLIT_MODE",
                            "how to split the model across multiple GPUs, one of:\n"
                            "  - none: use one GPU only\n"
@@ -1782,6 +1800,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "retrieval", "       --chunk-separator STRING",
                        "separator between chunks (default: '%s')", params.chunk_separator.c_str() });
 
+    options.push_back({ "passkey" });
+    options.push_back({ "passkey", "       --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
+    options.push_back({ "passkey", "       --pos N",  "position of the passkey in the junk text (default: %d)", params.i_pos });
+
     options.push_back({ "bench" });
     options.push_back({ "bench", "-pps",           "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
     options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });

common/common.h

@@ -215,6 +215,10 @@ struct gpt_params {
     int32_t chunk_size = 64;            // chunk size for context embedding
     std::string chunk_separator = "\n"; // chunk separator for context embedding
 
+    // passkey params
+    int32_t n_junk = 250; // number of times to repeat the junk text
+    int32_t i_pos  = -1;  // position of the passkey in the junk text
+
 };
 
 void gpt_params_handle_model_default(gpt_params & params);
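
The two new fields are the whole surface of the change on the struct side: n_junk sets how many times the filler text is repeated and i_pos picks which repetition holds the passkey, with -1 meaning "choose at random". Below is a standalone illustration of that default resolution, mirroring the logic in passkey.cpp further down but with no llama.cpp dependencies.

```cpp
// Standalone illustration (not from the commit) of the defaults' semantics:
// n_junk = 250 junk repetitions, i_pos = -1 means a random passkey position.
#include <cstdio>
#include <cstdlib>
#include <ctime>

int main() {
    int n_junk = 250; // number of times to repeat the junk text
    int i_pos  = -1;  // position of the passkey in the junk text

    srand((unsigned) time(NULL));

    if (i_pos == -1) {
        i_pos = rand() % n_junk; // same resolution as in passkey.cpp
    }

    printf("passkey goes into junk chunk %d of %d\n", i_pos, n_junk);
    return 0;
}
```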

examples/passkey/README.md

@@ -8,5 +8,5 @@ See the following PRs for more info:
 ### Usage
 
 ```bash
-make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250
+make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
 ```
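
With the new flags the passkey position can also be pinned rather than randomized; going by the usage string added to the example below, an invocation along the lines of `make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 --pos 90` should place the passkey in junk chunk 90.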

examples/passkey/passkey.cpp

@@ -6,46 +6,32 @@
 #include <string>
 #include <vector>
 
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+    LOG_TEE("\n");
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
-        return 1 ;
-    }
-
-    int seed = -1;
-
-    int n_junk = 250; // number of times to repeat the junk text
-    int n_keep = 32;  // number of tokens in the prompt prefix
-    int n_grp  = 1;   // if more than 1 - perform LongLM SelfExtend
-    int i_pos  = -1;  // position of the passkey in the junk text
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        n_junk = std::stoi(argv[2]);
-    }
-
-    if (argc >= 4) {
-        n_grp = std::stoi(argv[3]);
-    }
-
-    if (argc >= 5) {
-        i_pos = std::stoi(argv[4]);
-    }
-
-    if (argc >= 6) {
-        seed = std::stoi(argv[5]);
-    }
-
-    if (seed == -1) {
-        seed = time(NULL);
-    }
-
-    srand(seed);
+    params.n_junk = 250;
+    params.n_keep = 32;
+    params.i_pos  = -1;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
+    }
+
+    srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
+
+    int n_junk = params.n_junk;
+    int n_keep = params.n_keep;
+    int n_grp  = params.grp_attn_n;
+    int i_pos  = params.i_pos;
 
     if (i_pos == -1) {
         i_pos = rand() % n_junk;
@@ -76,9 +62,7 @@ int main(int argc, char ** argv) {
     // initialize the model
 
-    llama_model_params model_params = llama_model_default_params();
-
-    model_params.n_gpu_layers = 99; // offload all layers to the GPU
+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -89,13 +73,9 @@ int main(int argc, char ** argv) {
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_default_params();
-
-    ctx_params.seed = seed;
-    ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
-    ctx_params.n_batch = 512;
-    ctx_params.n_threads = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+
+    ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
 
     GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
@@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("prompt tokens: %d\n", n_tokens_all);
     //LOG_TEE("prompt: %s\n", params.prompt.c_str());
 
-    llama_batch batch = llama_batch_init(512, 0, 1);
+    llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
 
     int n_past = 0;
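
Condensed, the example's initialization path after this commit looks roughly like the sketch below. Error handling is trimmed, and the calls that do not appear in this diff (backend init/teardown and llama_new_context_with_model) are assumed to be carried over unchanged from the original example rather than taken from it.

```cpp
// Condensed sketch (not verbatim from passkey.cpp) of the example's setup
// after this commit: every knob now flows in through gpt_params instead of
// positional argv.
#include "common.h"
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    gpt_params params;

    // example-specific defaults, overridable via --junk / --keep / --pos
    params.n_junk = 250;
    params.n_keep = 32;
    params.i_pos  = -1;

    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    llama_backend_init(); // assumed unchanged from the original example

    // model: -m, -ngl, etc. are applied through the shared helper
    llama_model_params model_params = llama_model_params_from_gpt_params(params);
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr, "%s: failed to load model '%s'\n", __func__, params.model.c_str());
        return 1;
    }

    // context: start from the shared defaults, then widen n_ctx for self-extend
    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
    ctx_params.n_ctx = llama_n_ctx_train(model)*params.grp_attn_n + params.n_keep;

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL) {
        fprintf(stderr, "%s: failed to create the llama_context\n", __func__);
        return 1;
    }

    // ... prompt construction and decoding as in the full example ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}
```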