common : refactor cli arg parsing (#7675)
* common : gpt_params_parse do not print usage * common : rework usage print (wip) * common : valign * common : rework print_usage * infill : remove cfg support * common : reorder args * server : deduplicate parameters ggml-ci * common : add missing header ggml-ci * common : remote --random-prompt usages ggml-ci * examples : migrate to gpt_params ggml-ci * batched-bench : migrate to gpt_params * retrieval : migrate to gpt_params * common : change defaults for escape and n_ctx * common : remove chatml and instruct params ggml-ci * common : passkey use gpt_params
This commit is contained in:
parent
554c247caf
commit
1442677f92
34 changed files with 899 additions and 1455 deletions
|
@ -10,16 +10,16 @@ There are 2 modes of operation:
|
|||
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
|
||||
|
||||
```bash
|
||||
./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
|
||||
./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
|
||||
|
||||
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
|
||||
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
|
||||
./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
|
||||
|
||||
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
|
||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
|
||||
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
|
||||
|
||||
# custom set of batches
|
||||
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
|
||||
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
|
||||
```
|
||||
|
||||
## Sample results
|
||||
|
|
|
@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||
gpt_params_print_usage(argc, argv, params);
|
||||
|
||||
LOG_TEE("\nexample usage:\n");
|
||||
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
|
||||
LOG_TEE("\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
|
||||
if (argc == 1 || argv[1][0] == '-') {
|
||||
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
|
||||
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
|
||||
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
|
||||
return 1 ;
|
||||
if (!gpt_params_parse(argc, argv, params)) {
|
||||
print_usage(argc, argv, params);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int n_kv_max = 2048;
|
||||
int n_batch = 2048;
|
||||
int n_ubatch = 512;
|
||||
bool flash_attn = false;
|
||||
int is_pp_shared = 0;
|
||||
int n_gpu_layers = 0;
|
||||
int is_pp_shared = params.is_pp_shared;
|
||||
|
||||
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
|
||||
std::vector<int> n_tg = { 128, 256, };
|
||||
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
|
||||
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
|
||||
|
||||
if (argc >= 2) {
|
||||
params.model = argv[1];
|
||||
}
|
||||
|
||||
if (argc >= 3) {
|
||||
n_kv_max = std::atoi(argv[2]);
|
||||
}
|
||||
|
||||
if (argc >= 4) {
|
||||
n_batch = std::atoi(argv[3]);
|
||||
}
|
||||
|
||||
if (argc >= 5) {
|
||||
n_ubatch = std::atoi(argv[4]);
|
||||
}
|
||||
|
||||
if (argc >= 6) {
|
||||
flash_attn = std::atoi(argv[5]);
|
||||
}
|
||||
|
||||
if (argc >= 7) {
|
||||
is_pp_shared = std::atoi(argv[6]);
|
||||
}
|
||||
|
||||
if (argc >= 8) {
|
||||
n_gpu_layers = std::atoi(argv[7]);
|
||||
}
|
||||
|
||||
if (argc >= 9) {
|
||||
n_pp = parse_list(argv[8]);
|
||||
}
|
||||
|
||||
if (argc >= 10) {
|
||||
n_tg = parse_list(argv[9]);
|
||||
}
|
||||
|
||||
if (argc >= 11) {
|
||||
n_pl = parse_list(argv[10]);
|
||||
}
|
||||
std::vector<int> n_pp = params.n_pp;
|
||||
std::vector<int> n_tg = params.n_tg;
|
||||
std::vector<int> n_pl = params.n_pl;
|
||||
|
||||
// init LLM
|
||||
|
||||
|
@ -97,12 +57,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// initialize the model
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
|
||||
const std::vector<float> t_split(llama_max_devices(), 0.0f);
|
||||
|
||||
model_params.n_gpu_layers = n_gpu_layers;
|
||||
model_params.tensor_split = t_split.data();
|
||||
llama_model_params model_params = llama_model_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||
|
||||
|
@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
|
||||
ctx_params.seed = 1234;
|
||||
ctx_params.n_ctx = n_kv_max;
|
||||
ctx_params.n_batch = n_batch;
|
||||
ctx_params.n_ubatch = n_ubatch;
|
||||
ctx_params.flash_attn = flash_attn;
|
||||
|
||||
ctx_params.n_threads = params.n_threads;
|
||||
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
|
||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
|
||||
|
||||
// ensure enough sequences are available
|
||||
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
|
||||
|
@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
const int32_t n_kv_max = llama_n_ctx(ctx);
|
||||
|
||||
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
|
||||
|
||||
// decode in batches of ctx_params.n_batch tokens
|
||||
|
@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
|
||||
LOG_TEE("\n");
|
||||
|
||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue