From e87c104dfd5c0710166fb5f7193c4a81128829b2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 4 Jun 2024 14:53:54 +0300
Subject: [PATCH] common : passkey use gpt_params

---
 common/common.cpp            | 38 +++++++++++++++-----
 common/common.h              |  4 +++
 examples/passkey/README.md   |  2 +-
 examples/passkey/passkey.cpp | 68 +++++++++++++-----------------------
 4 files changed, 59 insertions(+), 53 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 11fe888c3..460a4440c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -408,7 +408,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-n" || arg == "--n-predict") {
+    if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
         }
@@ -965,26 +965,26 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.use_mlock = true;
         return true;
     }
-    if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+    if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
         }
         params.n_gpu_layers = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
         }
         return true;
     }
-    if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
         }
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
         }
         return true;
     }
@@ -1521,6 +1521,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.chunk_separator = argv[i];
         return true;
     }
+    if (arg == "--junk") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_junk = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--pos") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.i_pos = std::stoi(argv[i]);
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1613,7 +1629,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
 
     options.push_back({ "*",   "-c,    --ctx-size N",    "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
-    options.push_back({ "*",   "-n,    --n-predict N",   "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
+    options.push_back({ "*",   "-n,    --predict N",     "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
     options.push_back({ "*",   "-b,    --batch-size N",  "logical maximum batch size (default: %d)", params.n_batch });
     options.push_back({ "*",   "-ub,   --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
--ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); @@ -1743,8 +1759,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "see https://github.com/ggerganov/llama.cpp/issues/1437" }); if (llama_supports_gpu_offload()) { - options.push_back({ "*", "-ngl, --n-gpu-layers N", "number of layers to store in VRAM" }); - options.push_back({ "*", "-ngld, --n-gpu-layers-draft N", "number of layers to store in VRAM for the draft model" }); + options.push_back({ "*", "-ngl, --gpu-layers N", + "number of layers to store in VRAM" }); + options.push_back({ "*", "-ngld, --gpu-layers-draft N", + "number of layers to store in VRAM for the draft model" }); options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", "how to split the model across multiple GPUs, one of:\n" " - none: use one GPU only\n" @@ -1782,6 +1800,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "retrieval", " --chunk-separator STRING", "separator between chunks (default: '%s')", params.chunk_separator.c_str() }); + options.push_back({ "passkey" }); + options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); + options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); + options.push_back({ "bench" }); options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" }); options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); diff --git a/common/common.h b/common/common.h index b7a7ff2af..e0a08a61b 100644 --- a/common/common.h +++ b/common/common.h @@ -215,6 +215,10 @@ struct gpt_params { int32_t chunk_size = 64; // chunk size for context embedding std::string chunk_separator = "\n"; // chunk separator for context embedding + + // passkey params + int32_t n_junk = 250; // number of times to repeat the junk text + int32_t i_pos = -1; // position of the passkey in the junk text }; void gpt_params_handle_model_default(gpt_params & params); diff --git a/examples/passkey/README.md b/examples/passkey/README.md index 4a22bb559..9e7a119ba 100644 --- a/examples/passkey/README.md +++ b/examples/passkey/README.md @@ -8,5 +8,5 @@ See the following PRs for more info: ### Usage ```bash -make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250 +make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 ``` diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index f2ef9ca10..d03215cd1 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -6,46 +6,32 @@ #include #include +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); + LOG_TEE("\n"); +} + int main(int argc, char ** argv) { gpt_params params; - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]); - return 1 ; + params.n_junk = 250; + params.n_keep = 32; + params.i_pos = -1; + + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; } - int seed = -1; + srand(params.seed == 
 
-    int n_junk = 250; // number of times to repeat the junk text
-    int n_keep = 32;  // number of tokens in the prompt prefix
-    int n_grp  = 1;   // if more than 1 - perform LongLM SelfExtend
-    int i_pos  = -1;  // position of the passkey in the junk text
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        n_junk = std::stoi(argv[2]);
-    }
-
-    if (argc >= 4) {
-        n_grp = std::stoi(argv[3]);
-    }
-
-    if (argc >= 5) {
-        i_pos = std::stoi(argv[4]);
-    }
-
-    if (argc >= 6) {
-        seed = std::stoi(argv[5]);
-    }
-
-    if (seed == -1) {
-        seed = time(NULL);
-    }
-
-    srand(seed);
+    int n_junk = params.n_junk;
+    int n_keep = params.n_keep;
+    int n_grp  = params.grp_attn_n;
+    int i_pos  = params.i_pos;
 
     if (i_pos == -1) {
         i_pos = rand() % n_junk;
@@ -76,9 +62,7 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_default_params();
-
-    model_params.n_gpu_layers = 99; // offload all layers to the GPU
+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
@@ -89,13 +73,9 @@ int main(int argc, char ** argv) {
 
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_default_params();
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
 
-    ctx_params.seed    = seed;
-    ctx_params.n_ctx   = llama_n_ctx_train(model)*n_grp + n_keep;
-    ctx_params.n_batch = 512;
-    ctx_params.n_threads = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
 
     GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
 
@@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("prompt tokens: %d\n", n_tokens_all);
     //LOG_TEE("prompt: %s\n", params.prompt.c_str());
 
-    llama_batch batch = llama_batch_init(512, 0, 1);
+    llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
 
     int n_past = 0;
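
With the new arguments wired into gpt_params, the example is driven entirely by common CLI flags. A usage sketch based on the README change and the print_usage() string added in this patch (the model path, --pos value, and --seed value below are placeholders):

```bash
make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf \
    --junk 250 --pos 90 --keep 32 --grp-attn-n 2 --seed 1234
```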