llama-bench: turn threadpool params into vectors, add output headers, etc

This commit is contained in:
Max Krasnyansky 2024-08-26 17:07:36 -07:00 committed by fmz
parent 658f16c330
commit 8d5ab9a58e

View file

@ -225,6 +225,9 @@ struct cmd_params {
std::vector<ggml_type> type_k; std::vector<ggml_type> type_k;
std::vector<ggml_type> type_v; std::vector<ggml_type> type_v;
std::vector<int> n_threads; std::vector<int> n_threads;
std::vector<std::string> cpu_mask;
std::vector<bool> cpu_strict;
std::vector<int> poll;
std::vector<int> n_gpu_layers; std::vector<int> n_gpu_layers;
std::vector<std::string> rpc_servers; std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode; std::vector<llama_split_mode> split_mode;
@ -235,8 +238,8 @@ struct cmd_params {
std::vector<bool> use_mmap; std::vector<bool> use_mmap;
std::vector<bool> embeddings; std::vector<bool> embeddings;
ggml_numa_strategy numa; ggml_numa_strategy numa;
cpu_params cpuparams;
int reps; int reps;
int prio;
bool verbose; bool verbose;
output_formats output_format; output_formats output_format;
output_formats output_format_stderr; output_formats output_format_stderr;
@ -252,6 +255,9 @@ static const cmd_params cmd_params_defaults = {
/* type_k */ {GGML_TYPE_F16}, /* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()}, /* n_threads */ {cpu_get_num_math()},
/* cpu_mask */ {"0x0"},
/* cpu_strict */ {false},
/* poll */ {50},
/* n_gpu_layers */ {99}, /* n_gpu_layers */ {99},
/* rpc_servers */ {""}, /* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
@ -262,8 +268,8 @@ static const cmd_params cmd_params_defaults = {
/* use_mmap */ {true}, /* use_mmap */ {true},
/* embeddings */ {false}, /* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED, /* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* cpuparams */ {},
/* reps */ 5, /* reps */ 5,
/* prio */ 0,
/* verbose */ false, /* verbose */ false,
/* output_format */ MARKDOWN, /* output_format */ MARKDOWN,
/* output_format_stderr */ NONE, /* output_format_stderr */ NONE,
@ -283,6 +289,9 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
@ -291,13 +300,10 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n"); printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -C, --cpu-mask <hex> (default: 0x0)\n");
printf(" --cpu-strict <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
printf(" --priority <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority);
printf(" --poll <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.poll);
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n"); printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
@ -344,6 +350,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.output_format_stderr = cmd_params_defaults.output_format_stderr; params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps; params.reps = cmd_params_defaults.reps;
params.numa = cmd_params_defaults.numa; params.numa = cmd_params_defaults.numa;
params.prio = cmd_params_defaults.prio;
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
arg = argv[i]; arg = argv[i];
@ -439,6 +446,33 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = string_split<int>(argv[i], split_delim); auto p = string_split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-C" || arg == "--cpu-mask") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
} else if (arg == "--cpu-strict") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
} else if (arg == "--poll") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.poll.insert(params.poll.end(), p.begin(), p.end());
} else if (arg == "--prio") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.prio = std::stoi(argv[i]);
} else if (arg == "-ngl" || arg == "--n-gpu-layers") { } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -498,32 +532,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
else { invalid_param = true; break; } else { invalid_param = true; break; }
} }
} else if (arg == "-C" || arg == "--cpu-mask") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::string mask = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
} else if (arg == "--prio") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cpuparams.priority = std::stoul(argv[i]);
} else if (arg == "--cpu-strict") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cpuparams.strict_cpu = std::stoul(argv[i]);
} else if (arg == "--poll") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cpuparams.poll = std::stoul(argv[i]);
} else if (arg == "-fa" || arg == "--flash-attn") { } else if (arg == "-fa" || arg == "--flash-attn") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -617,6 +625,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
return params; return params;
} }
@ -630,6 +641,9 @@ struct cmd_params_instance {
ggml_type type_k; ggml_type type_k;
ggml_type type_v; ggml_type type_v;
int n_threads; int n_threads;
std::string cpu_mask;
bool cpu_strict;
int poll;
int n_gpu_layers; int n_gpu_layers;
std::string rpc_servers; std::string rpc_servers;
llama_split_mode split_mode; llama_split_mode split_mode;
@ -699,7 +713,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
for (const auto & tv : params.type_v) for (const auto & tv : params.type_v)
for (const auto & nkvo : params.no_kv_offload) for (const auto & nkvo : params.no_kv_offload)
for (const auto & fa : params.flash_attn) for (const auto & fa : params.flash_attn)
for (const auto & nt : params.n_threads) { for (const auto & nt : params.n_threads)
for (const auto & cm : params.cpu_mask)
for (const auto & cs : params.cpu_strict)
for (const auto & pl : params.poll) {
for (const auto & n_prompt : params.n_prompt) { for (const auto & n_prompt : params.n_prompt) {
if (n_prompt == 0) { if (n_prompt == 0) {
continue; continue;
@ -713,6 +730,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk, /* .type_k = */ tk,
/* .type_v = */ tv, /* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc, /* .rpc_servers = */ rpc,
/* .split_mode = */ sm, /* .split_mode = */ sm,
@ -739,6 +759,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk, /* .type_k = */ tk,
/* .type_v = */ tv, /* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc, /* .rpc_servers = */ rpc,
/* .split_mode = */ sm, /* .split_mode = */ sm,
@ -765,6 +788,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_k = */ tk, /* .type_k = */ tk,
/* .type_v = */ tv, /* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .cpu_mask = */ cm,
/* .cpu_strict = */ cs,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc, /* .rpc_servers = */ rpc,
/* .split_mode = */ sm, /* .split_mode = */ sm,
@ -801,6 +827,9 @@ struct test {
int n_batch; int n_batch;
int n_ubatch; int n_ubatch;
int n_threads; int n_threads;
std::string cpu_mask;
bool cpu_strict;
int poll;
bool has_rpc; bool has_rpc;
ggml_type type_k; ggml_type type_k;
ggml_type type_v; ggml_type type_v;
@ -827,6 +856,9 @@ struct test {
n_batch = inst.n_batch; n_batch = inst.n_batch;
n_ubatch = inst.n_ubatch; n_ubatch = inst.n_ubatch;
n_threads = inst.n_threads; n_threads = inst.n_threads;
cpu_mask = inst.cpu_mask;
cpu_strict = inst.cpu_strict;
poll = inst.poll;
has_rpc = !inst.rpc_servers.empty(); has_rpc = !inst.rpc_servers.empty();
type_k = inst.type_k; type_k = inst.type_k;
type_v = inst.type_v; type_v = inst.type_v;
@ -904,13 +936,14 @@ struct test {
"cpu_info", "gpu_info", "cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch", "n_batch", "n_ubatch",
"n_threads", "type_k", "type_v", "n_threads", "cpu_mask", "cpu_strict", "poll",
"type_k", "type_v",
"n_gpu_layers", "split_mode", "n_gpu_layers", "split_mode",
"main_gpu", "no_kv_offload", "flash_attn", "main_gpu", "no_kv_offload", "flash_attn",
"tensor_split", "use_mmap", "embeddings", "tensor_split", "use_mmap", "embeddings",
"n_prompt", "n_gen", "test_time", "n_prompt", "n_gen", "test_time",
"avg_ns", "stddev_ns", "avg_ns", "stddev_ns",
"avg_ts", "stddev_ts" "avg_ts", "stddev_ts",
}; };
return fields; return fields;
} }
@ -919,7 +952,7 @@ struct test {
static field_type get_field_type(const std::string & field) { static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
field == "n_threads" || field == "n_threads" || field == "poll" ||
field == "model_size" || field == "model_n_params" || field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" || field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" || field == "n_prompt" || field == "n_gen" ||
@ -928,6 +961,7 @@ struct test {
} }
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" || if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "cpu_strict" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") { field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL; return BOOL;
} }
@ -960,7 +994,8 @@ struct test {
cpu_info, gpu_info, cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch), std::to_string(n_batch), std::to_string(n_ubatch),
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
ggml_type_name(type_k), ggml_type_name(type_v),
std::to_string(n_gpu_layers), split_mode_str(split_mode), std::to_string(n_gpu_layers), split_mode_str(split_mode),
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@ -1099,7 +1134,7 @@ struct markdown_printer : public printer {
return -30; return -30;
} }
if (field == "t/s") { if (field == "t/s") {
return 16; return 20;
} }
if (field == "size" || field == "params") { if (field == "size" || field == "params") {
return 10; return 10;
@ -1181,6 +1216,15 @@ struct markdown_printer : public printer {
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
fields.emplace_back("n_threads"); fields.emplace_back("n_threads");
} }
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
fields.emplace_back("cpu_mask");
}
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
fields.emplace_back("cpu_strict");
}
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
fields.emplace_back("poll");
}
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
fields.emplace_back("n_batch"); fields.emplace_back("n_batch");
} }
@ -1434,8 +1478,6 @@ int main(int argc, char ** argv) {
llama_model * lmodel = nullptr; llama_model * lmodel = nullptr;
const cmd_params_instance * prev_inst = nullptr; const cmd_params_instance * prev_inst = nullptr;
postprocess_cpu_params(params.cpuparams);
for (const auto & inst : params_instances) { for (const auto & inst : params_instances) {
// keep the same model between tests when possible // keep the same model between tests when possible
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) { if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@ -1463,12 +1505,13 @@ int main(int argc, char ** argv) {
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx);
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads); struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
tpp.strict_cpu = params.cpuparams.strict_cpu; if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
tpp.prio = params.cpuparams.priority; LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
tpp.poll = params.cpuparams.poll; exit(1);
if (params.cpuparams.mask_valid) {
std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_MAX_N_THREADS);
} }
tpp.strict_cpu = t.cpu_strict;
tpp.poll = t.poll;
tpp.prio = params.prio;
struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp); struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
if (!threadpool) { if (!threadpool) {