llama-bench: turn threadpool params into vectors, add output headers, etc
This commit is contained in:
parent
658f16c330
commit
8d5ab9a58e
1 changed files with 88 additions and 45 deletions
|
@ -225,6 +225,9 @@ struct cmd_params {
|
|||
std::vector<ggml_type> type_k;
|
||||
std::vector<ggml_type> type_v;
|
||||
std::vector<int> n_threads;
|
||||
std::vector<std::string> cpu_mask;
|
||||
std::vector<bool> cpu_strict;
|
||||
std::vector<int> poll;
|
||||
std::vector<int> n_gpu_layers;
|
||||
std::vector<std::string> rpc_servers;
|
||||
std::vector<llama_split_mode> split_mode;
|
||||
|
@ -235,8 +238,8 @@ struct cmd_params {
|
|||
std::vector<bool> use_mmap;
|
||||
std::vector<bool> embeddings;
|
||||
ggml_numa_strategy numa;
|
||||
cpu_params cpuparams;
|
||||
int reps;
|
||||
int prio;
|
||||
bool verbose;
|
||||
output_formats output_format;
|
||||
output_formats output_format_stderr;
|
||||
|
@ -252,6 +255,9 @@ static const cmd_params cmd_params_defaults = {
|
|||
/* type_k */ {GGML_TYPE_F16},
|
||||
/* type_v */ {GGML_TYPE_F16},
|
||||
/* n_threads */ {cpu_get_num_math()},
|
||||
/* cpu_mask */ {"0x0"},
|
||||
/* cpu_strict */ {false},
|
||||
/* poll */ {50},
|
||||
/* n_gpu_layers */ {99},
|
||||
/* rpc_servers */ {""},
|
||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||
|
@ -262,8 +268,8 @@ static const cmd_params cmd_params_defaults = {
|
|||
/* use_mmap */ {true},
|
||||
/* embeddings */ {false},
|
||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||
/* cpuparams */ {},
|
||||
/* reps */ 5,
|
||||
/* prio */ 0,
|
||||
/* verbose */ false,
|
||||
/* output_format */ MARKDOWN,
|
||||
/* output_format_stderr */ NONE,
|
||||
|
@ -283,6 +289,9 @@ static void print_usage(int /* argc */, char ** argv) {
|
|||
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
||||
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
|
||||
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
|
||||
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||
|
@ -291,13 +300,10 @@ static void print_usage(int /* argc */, char ** argv) {
|
|||
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
|
||||
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
||||
printf(" -C, --cpu-mask <hex> (default: 0x0)\n");
|
||||
printf(" --cpu-strict <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
|
||||
printf(" --priority <0|1|2|3> (default: %d)\n", cmd_params_defaults.cpuparams.priority);
|
||||
printf(" --poll <0|1> (default: %d)\n", cmd_params_defaults.cpuparams.poll);
|
||||
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
||||
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
|
@ -344,6 +350,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
||||
params.reps = cmd_params_defaults.reps;
|
||||
params.numa = cmd_params_defaults.numa;
|
||||
params.prio = cmd_params_defaults.prio;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
@ -439,6 +446,33 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
||||
} else if (arg == "-C" || arg == "--cpu-mask") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<std::string>(argv[i], split_delim);
|
||||
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
|
||||
} else if (arg == "--cpu-strict") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<bool>(argv[i], split_delim);
|
||||
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
|
||||
} else if (arg == "--poll") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.poll.insert(params.poll.end(), p.begin(), p.end());
|
||||
} else if (arg == "--prio") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.prio = std::stoi(argv[i]);
|
||||
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -498,32 +532,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
} else if (arg == "-C" || arg == "--cpu-mask") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::string mask = argv[i];
|
||||
params.cpuparams.mask_valid = true;
|
||||
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
|
||||
} else if (arg == "--prio") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.cpuparams.priority = std::stoul(argv[i]);
|
||||
} else if (arg == "--cpu-strict") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.cpuparams.strict_cpu = std::stoul(argv[i]);
|
||||
} else if (arg == "--poll") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.cpuparams.poll = std::stoul(argv[i]);
|
||||
} else if (arg == "-fa" || arg == "--flash-attn") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -617,6 +625,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
|||
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
||||
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
|
||||
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
|
||||
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
|
||||
|
||||
return params;
|
||||
}
|
||||
|
@ -630,6 +641,9 @@ struct cmd_params_instance {
|
|||
ggml_type type_k;
|
||||
ggml_type type_v;
|
||||
int n_threads;
|
||||
std::string cpu_mask;
|
||||
bool cpu_strict;
|
||||
int poll;
|
||||
int n_gpu_layers;
|
||||
std::string rpc_servers;
|
||||
llama_split_mode split_mode;
|
||||
|
@ -699,7 +713,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||
for (const auto & tv : params.type_v)
|
||||
for (const auto & nkvo : params.no_kv_offload)
|
||||
for (const auto & fa : params.flash_attn)
|
||||
for (const auto & nt : params.n_threads) {
|
||||
for (const auto & nt : params.n_threads)
|
||||
for (const auto & cm : params.cpu_mask)
|
||||
for (const auto & cs : params.cpu_strict)
|
||||
for (const auto & pl : params.poll) {
|
||||
for (const auto & n_prompt : params.n_prompt) {
|
||||
if (n_prompt == 0) {
|
||||
continue;
|
||||
|
@ -713,6 +730,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||
/* .type_k = */ tk,
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .cpu_mask = */ cm,
|
||||
/* .cpu_strict = */ cs,
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
|
@ -739,6 +759,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||
/* .type_k = */ tk,
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .cpu_mask = */ cm,
|
||||
/* .cpu_strict = */ cs,
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
|
@ -765,6 +788,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
|||
/* .type_k = */ tk,
|
||||
/* .type_v = */ tv,
|
||||
/* .n_threads = */ nt,
|
||||
/* .cpu_mask = */ cm,
|
||||
/* .cpu_strict = */ cs,
|
||||
/* .poll = */ pl,
|
||||
/* .n_gpu_layers = */ nl,
|
||||
/* .rpc_servers = */ rpc,
|
||||
/* .split_mode = */ sm,
|
||||
|
@ -801,6 +827,9 @@ struct test {
|
|||
int n_batch;
|
||||
int n_ubatch;
|
||||
int n_threads;
|
||||
std::string cpu_mask;
|
||||
bool cpu_strict;
|
||||
int poll;
|
||||
bool has_rpc;
|
||||
ggml_type type_k;
|
||||
ggml_type type_v;
|
||||
|
@ -827,6 +856,9 @@ struct test {
|
|||
n_batch = inst.n_batch;
|
||||
n_ubatch = inst.n_ubatch;
|
||||
n_threads = inst.n_threads;
|
||||
cpu_mask = inst.cpu_mask;
|
||||
cpu_strict = inst.cpu_strict;
|
||||
poll = inst.poll;
|
||||
has_rpc = !inst.rpc_servers.empty();
|
||||
type_k = inst.type_k;
|
||||
type_v = inst.type_v;
|
||||
|
@ -904,13 +936,14 @@ struct test {
|
|||
"cpu_info", "gpu_info",
|
||||
"model_filename", "model_type", "model_size", "model_n_params",
|
||||
"n_batch", "n_ubatch",
|
||||
"n_threads", "type_k", "type_v",
|
||||
"n_threads", "cpu_mask", "cpu_strict", "poll",
|
||||
"type_k", "type_v",
|
||||
"n_gpu_layers", "split_mode",
|
||||
"main_gpu", "no_kv_offload", "flash_attn",
|
||||
"tensor_split", "use_mmap", "embeddings",
|
||||
"n_prompt", "n_gen", "test_time",
|
||||
"avg_ns", "stddev_ns",
|
||||
"avg_ts", "stddev_ts"
|
||||
"avg_ts", "stddev_ts",
|
||||
};
|
||||
return fields;
|
||||
}
|
||||
|
@ -919,7 +952,7 @@ struct test {
|
|||
|
||||
static field_type get_field_type(const std::string & field) {
|
||||
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
|
||||
field == "n_threads" ||
|
||||
field == "n_threads" || field == "poll" ||
|
||||
field == "model_size" || field == "model_n_params" ||
|
||||
field == "n_gpu_layers" || field == "main_gpu" ||
|
||||
field == "n_prompt" || field == "n_gen" ||
|
||||
|
@ -928,6 +961,7 @@ struct test {
|
|||
}
|
||||
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
|
||||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
|
||||
field == "cpu_strict" ||
|
||||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
|
||||
return BOOL;
|
||||
}
|
||||
|
@ -960,7 +994,8 @@ struct test {
|
|||
cpu_info, gpu_info,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
std::to_string(n_batch), std::to_string(n_ubatch),
|
||||
std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
|
||||
ggml_type_name(type_k), ggml_type_name(type_v),
|
||||
std::to_string(n_gpu_layers), split_mode_str(split_mode),
|
||||
std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
|
||||
tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
|
||||
|
@ -1099,7 +1134,7 @@ struct markdown_printer : public printer {
|
|||
return -30;
|
||||
}
|
||||
if (field == "t/s") {
|
||||
return 16;
|
||||
return 20;
|
||||
}
|
||||
if (field == "size" || field == "params") {
|
||||
return 10;
|
||||
|
@ -1181,6 +1216,15 @@ struct markdown_printer : public printer {
|
|||
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
|
||||
fields.emplace_back("n_threads");
|
||||
}
|
||||
if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
|
||||
fields.emplace_back("cpu_mask");
|
||||
}
|
||||
if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
|
||||
fields.emplace_back("cpu_strict");
|
||||
}
|
||||
if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
|
||||
fields.emplace_back("poll");
|
||||
}
|
||||
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
|
||||
fields.emplace_back("n_batch");
|
||||
}
|
||||
|
@ -1434,8 +1478,6 @@ int main(int argc, char ** argv) {
|
|||
llama_model * lmodel = nullptr;
|
||||
const cmd_params_instance * prev_inst = nullptr;
|
||||
|
||||
postprocess_cpu_params(params.cpuparams);
|
||||
|
||||
for (const auto & inst : params_instances) {
|
||||
// keep the same model between tests when possible
|
||||
if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
|
||||
|
@ -1463,12 +1505,13 @@ int main(int argc, char ** argv) {
|
|||
llama_kv_cache_clear(ctx);
|
||||
|
||||
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
|
||||
tpp.strict_cpu = params.cpuparams.strict_cpu;
|
||||
tpp.prio = params.cpuparams.priority;
|
||||
tpp.poll = params.cpuparams.poll;
|
||||
if (params.cpuparams.mask_valid) {
|
||||
std::memcpy(&tpp.cpumask[0], ¶ms.cpuparams.cpumask[0], GGML_MAX_N_THREADS);
|
||||
if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
|
||||
LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
|
||||
exit(1);
|
||||
}
|
||||
tpp.strict_cpu = t.cpu_strict;
|
||||
tpp.poll = t.poll;
|
||||
tpp.prio = params.prio;
|
||||
|
||||
struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
|
||||
if (!threadpool) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue