common : passkey use gpt_params
parent 4df81854ab
commit e87c104dfd

4 changed files with 59 additions and 53 deletions
@@ -408,7 +408,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         }
         return true;
     }
-    if (arg == "-n" || arg == "--n-predict") {
+    if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
         if (++i >= argc) {
             invalid_param = true;
             return true;
@@ -965,26 +965,26 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.use_mlock = true;
         return true;
     }
-    if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+    if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
         if (++i >= argc) {
            invalid_param = true;
            return true;
        }
        params.n_gpu_layers = std::stoi(argv[i]);
        if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
        }
        return true;
    }
-    if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+    if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--n-gpu-layers-draft") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
        params.n_gpu_layers_draft = std::stoi(argv[i]);
        if (!llama_supports_gpu_offload()) {
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+            fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
        }
        return true;
@@ -1521,6 +1521,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.chunk_separator = argv[i];
         return true;
     }
+    if (arg == "--junk") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.n_junk = std::stoi(argv[i]);
+        return true;
+    }
+    if (arg == "--pos") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.i_pos = std::stoi(argv[i]);
+        return true;
+    }
 #ifndef LOG_DISABLE_LOGS
     // Parse args for logging parameters
     if (log_param_single_parse(argv[i])) {
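The two new handlers follow the parser's existing convention: match the flag, consume the next argv entry (setting invalid_param when the value is missing), convert it with std::stoi, and return true once the argument is recognized. Below is a minimal standalone sketch of that convention, for reference only; it is not taken from the commit, and gpt_params_sketch / find_arg_sketch are hypothetical stand-ins for the real gpt_params and gpt_params_find_arg.

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

// Illustrative stand-in for the passkey fields this commit adds to gpt_params.
struct gpt_params_sketch {
    int32_t n_junk = 250; // number of times to repeat the junk text
    int32_t i_pos  = -1;  // position of the passkey in the junk text
};

// Same shape as the handlers in gpt_params_find_arg: consume the next argv
// entry, mark invalid_param if the value is missing, return true on a match.
static bool find_arg_sketch(int argc, char ** argv, const std::string & arg,
                            gpt_params_sketch & params, int & i, bool & invalid_param) {
    if (arg == "--junk") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
        params.n_junk = std::stoi(argv[i]);
        return true;
    }
    if (arg == "--pos") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
        params.i_pos = std::stoi(argv[i]);
        return true;
    }
    return false; // not an argument handled by this sketch
}

int main(int argc, char ** argv) {
    gpt_params_sketch params;
    bool invalid_param = false;
    for (int i = 1; i < argc; i++) {
        const std::string arg = argv[i];
        if (!find_arg_sketch(argc, argv, arg, params, i, invalid_param) || invalid_param) {
            fprintf(stderr, "error: invalid or unknown argument: %s\n", arg.c_str());
            return 1;
        }
    }
    printf("n_junk = %d, i_pos = %d\n", params.n_junk, params.i_pos);
    return 0;
}
```

Compiled on its own, running the sketch with `--junk 100 --pos 7` prints `n_junk = 100, i_pos = 7`, which mirrors how the real parser fills the new fields.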
@@ -1613,7 +1629,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                        "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
 
     options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
-    options.push_back({ "*", "-n, --n-predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
+    options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
     options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
     options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
     options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
@@ -1743,8 +1759,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
                                        "see https://github.com/ggerganov/llama.cpp/issues/1437" });
 
     if (llama_supports_gpu_offload()) {
-        options.push_back({ "*", "-ngl, --n-gpu-layers N", "number of layers to store in VRAM" });
-        options.push_back({ "*", "-ngld, --n-gpu-layers-draft N", "number of layers to store in VRAM for the draft model" });
+        options.push_back({ "*", "-ngl, --gpu-layers N",
+                                 "number of layers to store in VRAM" });
+        options.push_back({ "*", "-ngld, --gpu-layers-draft N",
+                                 "number of layers to store in VRAM for the draft model" });
         options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
                                  "how to split the model across multiple GPUs, one of:\n"
                                  " - none: use one GPU only\n"
@@ -1782,6 +1800,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "retrieval", " --chunk-separator STRING",
                                      "separator between chunks (default: '%s')", params.chunk_separator.c_str() });
 
+    options.push_back({ "passkey" });
+    options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
+    options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
+
     options.push_back({ "bench" });
     options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
     options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
@@ -215,6 +215,10 @@ struct gpt_params {
     int32_t chunk_size = 64; // chunk size for context embedding
 
     std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+    // passkey params
+    int32_t n_junk = 250; // number of times to repeat the junk text
+    int32_t i_pos  = -1;  // position of the passkey in the junk text
 };
 
 void gpt_params_handle_model_default(gpt_params & params);
@@ -8,5 +8,5 @@ See the following PRs for more info:
 ### Usage
 
 ```bash
-make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250
+make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
 ```
@@ -6,46 +6,32 @@
 #include <string>
 #include <vector>
 
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+    LOG_TEE("\n");
+}
+
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
-        return 1 ;
+    params.n_junk = 250;
+    params.n_keep = 32;
+    params.i_pos  = -1;
+
+    if (!gpt_params_parse(argc, argv, params)) {
+        print_usage(argc, argv, params);
+        return 1;
     }
 
-    int seed = -1;
+    srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
 
-    int n_junk = 250; // number of times to repeat the junk text
-    int n_keep = 32;  // number of tokens in the prompt prefix
-    int n_grp  = 1;   // if more than 1 - perform LongLM SelfExtend
-    int i_pos  = -1;  // position of the passkey in the junk text
-
-    if (argc >= 2) {
-        params.model = argv[1];
-    }
-
-    if (argc >= 3) {
-        n_junk = std::stoi(argv[2]);
-    }
-
-    if (argc >= 4) {
-        n_grp = std::stoi(argv[3]);
-    }
-
-    if (argc >= 5) {
-        i_pos = std::stoi(argv[4]);
-    }
-
-    if (argc >= 6) {
-        seed = std::stoi(argv[5]);
-    }
-
-    if (seed == -1) {
-        seed = time(NULL);
-    }
-
-    srand(seed);
+    int n_junk = params.n_junk;
+    int n_keep = params.n_keep;
+    int n_grp  = params.grp_attn_n;
+    int i_pos  = params.i_pos;
 
     if (i_pos == -1) {
         i_pos = rand() % n_junk;
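The rewritten main() keeps the example's defaults (n_junk = 250, n_keep = 32, i_pos = -1) but assigns them to gpt_params before calling the shared gpt_params_parse, so --junk, --pos and --keep can override them. The hand-rolled positional seed is replaced by params.seed: when it is still LLAMA_DEFAULT_SEED the clock is used, and an unset passkey position is then drawn at random. Below is a compressed, self-contained sketch of that seeding flow, assuming a stand-in DEFAULT_SEED_SKETCH for LLAMA_DEFAULT_SEED (whose actual value comes from llama.h).

```cpp
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <ctime>

// Stand-in for LLAMA_DEFAULT_SEED; the real constant is defined in llama.h.
static const uint32_t DEFAULT_SEED_SKETCH = 0xFFFFFFFF;

int main() {
    // Example defaults, as set by the passkey example before parsing.
    uint32_t seed   = DEFAULT_SEED_SKETCH; // params.seed, possibly overridden by --seed
    int      n_junk = 250;                 // params.n_junk, possibly overridden by --junk
    int      i_pos  = -1;                  // params.i_pos, possibly overridden by --pos

    // Same logic as the new srand(...) call: fall back to the clock when the
    // user did not pass an explicit seed.
    srand(seed == DEFAULT_SEED_SKETCH ? (unsigned) time(NULL) : seed);

    // Unset passkey position -> pick a random spot inside the junk text.
    if (i_pos == -1) {
        i_pos = rand() % n_junk;
    }

    printf("passkey will be inserted at junk position %d of %d\n", i_pos, n_junk);
    return 0;
}
```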
@@ -76,9 +62,7 @@ int main(int argc, char ** argv) {
 
     // initialize the model
 
-    llama_model_params model_params = llama_model_default_params();
-
-    model_params.n_gpu_layers = 99; // offload all layers to the GPU
+    llama_model_params model_params = llama_model_params_from_gpt_params(params);
 
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
@@ -89,13 +73,9 @@ int main(int argc, char ** argv) {
 
     // initialize the context
 
-    llama_context_params ctx_params = llama_context_default_params();
+    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
 
-    ctx_params.seed = seed;
     ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
-    ctx_params.n_batch = 512;
-    ctx_params.n_threads = params.n_threads;
-    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
 
     GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
 
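With batch and thread settings now coming from gpt_params, the example still overrides n_ctx so the LongLM SelfExtend-stretched prompt fits: llama_n_ctx_train(model)*n_grp plus the kept prefix, with the assert requiring the batch size to divide evenly into the groups. A small worked check of that sizing follows; the 4096 training context and the other concrete values are assumed examples, not taken from the commit.

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    // Assumed example values; only the formula comes from the hunk above.
    const uint32_t n_ctx_train = 4096; // hypothetical training context of the model
    const uint32_t n_grp       = 2;    // LongLM SelfExtend group size (--grp-attn-n 2)
    const uint32_t n_keep      = 32;   // tokens kept from the initial prompt (--keep 32)
    const uint32_t n_batch     = 512;  // default logical batch size

    // Mirrors ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep.
    const uint32_t n_ctx = n_ctx_train*n_grp + n_keep; // 4096*2 + 32 = 8224

    // Mirrors the GGML_ASSERT: prompt chunks are shifted in groups,
    // so the batch must be divisible by the group size.
    assert(n_batch % n_grp == 0);

    printf("n_ctx = %u\n", n_ctx);
    return 0;
}
```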
@@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
     LOG_TEE("prompt tokens: %d\n", n_tokens_all);
     //LOG_TEE("prompt: %s\n", params.prompt.c_str());
 
-    llama_batch batch = llama_batch_init(512, 0, 1);
+    llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
 
     int n_past = 0;
 