common : passkey use gpt_params

Georgi Gerganov 2024-06-04 14:53:54 +03:00
parent 4df81854ab
commit e87c104dfd
4 changed files with 59 additions and 53 deletions

common/common.cpp

@@ -408,7 +408,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-n" || arg == "--n-predict") {
+    if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
@@ -965,26 +965,26 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.use_mlock = true;
         return true;
     }
-    if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+    if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
         }
         params.n_gpu_layers = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
         }
         return true;
     }
-    if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
         }
         params.n_gpu_layers_draft = std::stoi(argv[i]);
         if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
             fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
         }
         return true;
@@ -1521,6 +1521,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.chunk_separator = argv[i];
         return true;
     }
+    if (arg == "--junk") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_junk = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--pos") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.i_pos = std::stoi(argv[i]);
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
@@ -1613,7 +1629,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                        "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
     options.push_back({ "*", "-c,    --ctx-size N",    "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
-    options.push_back({ "*", "-n,    --n-predict N",   "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
+    options.push_back({ "*", "-n,    --predict N",     "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
     options.push_back({ "*", "-b,    --batch-size N",  "logical maximum batch size (default: %d)", params.n_batch });
     options.push_back({ "*", "-ub,   --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
     options.push_back({ "*", "       --keep N",        "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
@@ -1743,8 +1759,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                        "see https://github.com/ggerganov/llama.cpp/issues/1437" });
     if (llama_supports_gpu_offload()) {
-        options.push_back({ "*", "-ngl,  --n-gpu-layers N",       "number of layers to store in VRAM" });
-        options.push_back({ "*", "-ngld, --n-gpu-layers-draft N", "number of layers to store in VRAM for the draft model" });
+        options.push_back({ "*", "-ngl,  --gpu-layers N",
+                            "number of layers to store in VRAM" });
+        options.push_back({ "*", "-ngld, --gpu-layers-draft N",
+                            "number of layers to store in VRAM for the draft model" });
         options.push_back({ "*", "-sm,   --split-mode SPLIT_MODE",
                            "how to split the model across multiple GPUs, one of:\n"
                            "  - none: use one GPU only\n"
@@ -1782,6 +1800,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "retrieval", "       --chunk-separator STRING",
                        "separator between chunks (default: '%s')", params.chunk_separator.c_str() });
 
+    options.push_back({ "passkey" });
+    options.push_back({ "passkey", "       --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
+    options.push_back({ "passkey", "       --pos N",  "position of the passkey in the junk text (default: %d)", params.i_pos });
+
     options.push_back({ "bench" });
     options.push_back({ "bench", "-pps",           "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
     options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });

common/common.h

@@ -215,6 +215,10 @@ struct gpt_params {
     int32_t chunk_size = 64;            // chunk size for context embedding
     std::string chunk_separator = "\n"; // chunk separator for context embedding
 
+    // passkey params
+    int32_t n_junk = 250; // number of times to repeat the junk text
+    int32_t i_pos  = -1;  // position of the passkey in the junk text
+
 };
 
 void gpt_params_handle_model_default(gpt_params & params);
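
The two new fields are the whole surface of the change on the struct side: n_junk sets how many times the filler text is repeated and i_pos picks which repetition holds the passkey, with -1 meaning "choose at random". Below is a standalone illustration of that default resolution, mirroring the logic in passkey.cpp further down but with no llama.cpp dependencies.

```cpp
// Standalone illustration (not from the commit) of the defaults' semantics:
// n_junk = 250 junk repetitions, i_pos = -1 means a random passkey position.
#include <cstdio>
#include <cstdlib>
#include <ctime>

int main() {
    int n_junk = 250; // number of times to repeat the junk text
    int i_pos  = -1;  // position of the passkey in the junk text

    srand((unsigned) time(NULL));

    if (i_pos == -1) {
        i_pos = rand() % n_junk; // same resolution as in passkey.cpp
    }

    printf("passkey goes into junk chunk %d of %d\n", i_pos, n_junk);
    return 0;
}
```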

examples/passkey/README.md

@@ -8,5 +8,5 @@ See the following PRs for more info:
 ### Usage
 
 ```bash
-make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250
+make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
 ```
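
With the new flags the passkey position can also be pinned rather than randomized; going by the usage string added to the example below, an invocation along the lines of `make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 --pos 90` should place the passkey in junk chunk 90.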

examples/passkey/passkey.cpp

@@ -6,46 +6,32 @@
 #include <string>
 #include <vector>
 
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+    LOG_TEE("\n");
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
-        return 1 ;
-    }
-
-    int seed = -1;
-
-    int n_junk = 250; // number of times to repeat the junk text
-    int n_keep = 32;  // number of tokens in the prompt prefix
-    int n_grp  = 1;   // if more than 1 - perform LongLM SelfExtend
-    int i_pos  = -1;  // position of the passkey in the junk text
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        n_junk = std::stoi(argv[2]);
-    }
-
-    if (argc >= 4) {
-        n_grp = std::stoi(argv[3]);
-    }
-
-    if (argc >= 5) {
-        i_pos = std::stoi(argv[4]);
-    }
-
-    if (argc >= 6) {
-        seed = std::stoi(argv[5]);
-    }
-
-    if (seed == -1) {
-        seed = time(NULL);
-    }
-
-    srand(seed);
+    params.n_junk = 250;
+    params.n_keep = 32;
+    params.i_pos  = -1;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
+    }
+
+    srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
+
+    int n_junk = params.n_junk;
+    int n_keep = params.n_keep;
+    int n_grp  = params.grp_attn_n;
+    int i_pos  = params.i_pos;
 
     if (i_pos == -1) {
         i_pos = rand() % n_junk;
@@ -76,9 +62,7 @@ int main(int argc, char ** argv) {
     // initialize the model
 
-    llama_model_params model_params = llama_model_default_params();
-
-    model_params.n_gpu_layers = 99; // offload all layers to the GPU
+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -89,13 +73,9 @@ int main(int argc, char ** argv) {
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_default_params();
-
-    ctx_params.seed = seed;
-    ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
-    ctx_params.n_batch = 512;
-    ctx_params.n_threads = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+
+    ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
 
     GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
@@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("prompt tokens: %d\n", n_tokens_all);
     //LOG_TEE("prompt: %s\n", params.prompt.c_str());
 
-    llama_batch batch = llama_batch_init(512, 0, 1);
+    llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
 
     int n_past = 0;
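
Condensed, the example's initialization path after this commit looks roughly like the sketch below. Error handling is trimmed, and the calls that do not appear in this diff (backend init/teardown and llama_new_context_with_model) are assumed to be carried over unchanged from the original example rather than taken from it.

```cpp
// Condensed sketch (not verbatim from passkey.cpp) of the example's setup
// after this commit: every knob now flows in through gpt_params instead of
// positional argv.
#include "common.h"
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    gpt_params params;

    // example-specific defaults, overridable via --junk / --keep / --pos
    params.n_junk = 250;
    params.n_keep = 32;
    params.i_pos  = -1;

    if (!gpt_params_parse(argc, argv, params)) {
        return 1;
    }

    llama_backend_init(); // assumed unchanged from the original example

    // model: -m, -ngl, etc. are applied through the shared helper
    llama_model_params model_params = llama_model_params_from_gpt_params(params);
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
    if (model == NULL) {
        fprintf(stderr, "%s: failed to load model '%s'\n", __func__, params.model.c_str());
        return 1;
    }

    // context: start from the shared defaults, then widen n_ctx for self-extend
    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
    ctx_params.n_ctx = llama_n_ctx_train(model)*params.grp_attn_n + params.n_keep;

    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (ctx == NULL) {
        fprintf(stderr, "%s: failed to create the llama_context\n", __func__);
        return 1;
    }

    // ... prompt construction and decoding as in the full example ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}
```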