llama-bench threadpool CLI params

fmz, 2024-05-28 22:31:15 -07:00
commit 65c11d415d, parent 5c9222d88f
4 changed files with 114 additions and 47 deletions
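With this change the benchmark's worker threads can be placed and prioritized from the command line, e.g.:

    llama-bench -m models/7B/ggml-model-q4_0.gguf -mt 8 -C 0xFF --prio 2 --cpu-strict --poll

(a hypothetical invocation, assembled from the options this commit adds below). Two quirks of this revision are visible in the diff: the parser accepts --prio while the usage text advertises --priority, and --cpu-strict / --poll are parsed as value-less switches even though the usage text shows <0|1>.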

common/common.cpp

@@ -218,7 +218,7 @@ void gpt_params_handle_model_default(gpt_params & params) {
     }
 }
 
-static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr) {
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
     int32_t n_set = 0;
 
     if (cpuparams.n_threads < 0) {
@@ -226,7 +226,7 @@ static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role
         if (role_model != nullptr) {
             cpuparams = *role_model;
         } else {
-            cpuparams.n_threads = cpu_get_num_math();
+            cpuparams.n_threads = std::thread::hardware_concurrency();
         }
     }
@@ -235,11 +235,13 @@ static void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role
             n_set++;
         }
     }
 
     if (n_set == 0) {
         // You hit the jackpot!
         memset(&cpuparams.cpumask[0], 1, GGML_N_CORES_MAX);
         n_set = GGML_N_CORES_MAX;
     }
 
     if (n_set < cpuparams.n_threads) {
         // Not enough set bits, may experience performance issues.
         fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
@@ -313,7 +315,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return result;
 }
 
-static bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_CORES_MAX]) {
+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_CORES_MAX]) {
     size_t dash_loc = range.find('-');
     if (dash_loc == std::string::npos) {
         fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
@@ -350,7 +352,7 @@ static bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_N_C
     return true;
 }
 
-static bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) {
+bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) {
     // Discard potential 0x prefix
     size_t start_i = 0;
     if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
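The body of parse_cpu_mask is cut off by the hunk above. For orientation, here is a minimal, self-contained sketch of what hex-mask expansion with this signature can look like; everything past the 0x-prefix handling is an illustrative assumption, not the commit's actual code:

// Sketch only, not part of the commit: expands a hex string such as "0xFF"
// into one bool per core, least-significant bit = core 0.
#include <cstdio>
#include <string>

#define GGML_N_CORES_MAX 16 // value set by the ggml.h hunk of this commit

bool parse_cpu_mask_sketch(const std::string & mask, bool (&boolmask)[GGML_N_CORES_MAX]) {
    // Discard potential 0x prefix, as in the fragment shown above
    size_t start_i = 0;
    if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
        start_i = 2;
    }
    // Walk the hex digits from the least significant nibble upwards,
    // setting four mask bits per digit.
    const size_t num_digits = mask.length() - start_i;
    for (size_t i = 0; i < num_digits; ++i) {
        const char c = mask[mask.length() - 1 - i];
        int v;
        if      (c >= '0' && c <= '9') { v = c - '0'; }
        else if (c >= 'a' && c <= 'f') { v = c - 'a' + 10; }
        else if (c >= 'A' && c <= 'F') { v = c - 'A' + 10; }
        else {
            fprintf(stderr, "Invalid hex character '%c' in CPU mask\n", c);
            return false;
        }
        for (int b = 0; b < 4; ++b) {
            const size_t core = i * 4 + b;
            if (core < GGML_N_CORES_MAX) {
                boolmask[core] = boolmask[core] || (((v >> b) & 1) != 0);
            }
        }
    }
    return true;
}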

common/common.h

@@ -198,6 +198,10 @@ bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
 bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
 void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
 
+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_N_CORES_MAX]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_N_CORES_MAX]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+
 std::string gpt_params_get_system_info(const gpt_params & params);
 
 //
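These declarations expose the CPU-parameter helpers beyond common.cpp, and the nullptr default for role_model now lives on the declaration rather than the definition (first hunk above). A short usage sketch of the resulting behavior, with invented variable names and the cpu_params fields shown in the llama-bench hunk below:

cpu_params main_cpu;                          // n_threads etc. set explicitly elsewhere
cpu_params draft_cpu;                         // n_threads left at -1, i.e. "unset"

postprocess_cpu_params(main_cpu);             // no role model: an unset n_threads falls
                                              // back to std::thread::hardware_concurrency()
postprocess_cpu_params(draft_cpu, &main_cpu); // unset params inherit the role model wholesale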

examples/llama-bench/llama-bench.cpp

@@ -186,11 +186,18 @@ struct cmd_params {
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
     ggml_numa_strategy numa;
+    cpu_params cpuparams;
     int reps;
     bool verbose;
     output_formats output_format;
 };
 
+int32_t n_threads = -1;
+bool cpumask[GGML_N_CORES_MAX] = { false }; // CPU affinity mask.
+bool mask_valid = false;    // Default: any CPU
+int32_t priority = 0;       // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+bool strict_cpu = false;    // Use strict CPU placement
+bool poll = false;          // Use polling (busywait) to wait for work
+
 static const cmd_params cmd_params_defaults = {
     /* model */ {"models/7B/ggml-model-q4_0.gguf"},
     /* n_prompt */ {512},
@@ -210,6 +217,7 @@ static const cmd_params cmd_params_defaults = {
     /* use_mmap */ {true},
     /* embeddings */ {false},
     /* numa */ GGML_NUMA_STRATEGY_DISABLED,
+    /* cpuparams */ {int32_t(std::thread::hardware_concurrency()), {false}, false, 1, false, false},
     /* reps */ 5,
     /* verbose */ false,
     /* output_format */ MARKDOWN
@@ -236,6 +244,11 @@ static void print_usage(int /* argc */, char ** argv) {
     printf("  -fa, --flash-attn <0|1>             (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf("  -mmp, --mmap <0|1>                  (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  --numa <distribute|isolate|numactl> (default: disabled)\n");
+    printf("  -mt, --max-threads <n>              (default: %d)\n", cmd_params_defaults.cpuparams.n_threads);
+    printf("  -C, --cpu-mask <hex>                (default: 0x0)\n");
+    printf("  --cpu-strict <0|1>                  (default: %d)\n", cmd_params_defaults.cpuparams.strict_cpu);
+    printf("  --priority <0|1|2|3>                (default: %d)\n", cmd_params_defaults.cpuparams.priority);
+    printf("  --poll <0|1>                        (default: %d)\n", cmd_params_defaults.cpuparams.poll);
     printf("  -embd, --embeddings <0|1>           (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>    (default: 0)\n");
     printf("  -r, --repetitions <n>               (default: %d)\n", cmd_params_defaults.reps);
@@ -292,28 +305,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
         if (arg == "-h" || arg == "--help") {
             print_usage(argc, argv);
             exit(0);
-        } else if (arg == "-m" || arg == "--model") {
+        }
+        else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<std::string>(argv[i], split_delim);
             params.model.insert(params.model.end(), p.begin(), p.end());
-        } else if (arg == "-p" || arg == "--n-prompt") {
+        }
+        else if (arg == "-p" || arg == "--n-prompt") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
-        } else if (arg == "-n" || arg == "--n-gen") {
+        }
+        else if (arg == "-n" || arg == "--n-gen") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
-        } else if (arg == "-pg") {
+        }
+        else if (arg == "-pg") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -324,21 +341,24 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
-        } else if (arg == "-b" || arg == "--batch-size") {
+        }
+        else if (arg == "-b" || arg == "--batch-size") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-        } else if (arg == "-ub" || arg == "--ubatch-size") {
+        }
+        else if (arg == "-ub" || arg == "--ubatch-size") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
-        } else if (arg == "-ctk" || arg == "--cache-type-k") {
+        }
+        else if (arg == "-ctk" || arg == "--cache-type-k") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -354,7 +374,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 types.push_back(gt);
             }
             params.type_k.insert(params.type_k.end(), types.begin(), types.end());
-        } else if (arg == "-ctv" || arg == "--cache-type-v") {
+        }
+        else if (arg == "-ctv" || arg == "--cache-type-v") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -370,21 +391,24 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 types.push_back(gt);
             }
             params.type_v.insert(params.type_v.end(), types.begin(), types.end());
-        } else if (arg == "-t" || arg == "--threads") {
+        }
+        else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
-        } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
+        }
+        else if (arg == "-ngl" || arg == "--n-gpu-layers") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<int>(argv[i], split_delim);
             params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-        } else if (arg == "-sm" || arg == "--split-mode") {
+        }
+        else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -395,41 +419,76 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 llama_split_mode mode;
                 if (m == "none") {
                     mode = LLAMA_SPLIT_MODE_NONE;
-                } else if (m == "layer") {
+                }
+                else if (m == "layer") {
                     mode = LLAMA_SPLIT_MODE_LAYER;
-                } else if (m == "row") {
+                }
+                else if (m == "row") {
                     mode = LLAMA_SPLIT_MODE_ROW;
-                } else {
+                }
+                else {
                     invalid_param = true;
                     break;
                 }
                 modes.push_back(mode);
             }
             params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
-        } else if (arg == "-mg" || arg == "--main-gpu") {
+        }
+        else if (arg == "-mg" || arg == "--main-gpu") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.main_gpu = split<int>(argv[i], split_delim);
-        } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+        }
+        else if (arg == "-nkvo" || arg == "--no-kv-offload") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             auto p = split<bool>(argv[i], split_delim);
             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
-        } else if (arg == "--numa") {
+        }
+        else if (arg == "--numa") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
-            } else {
+            }
+            else {
                 std::string value(argv[i]);
                 /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
                 else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
                 else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
                 else { invalid_param = true; break; }
             }
+        }
+        else if (arg == "-mt" || arg == "--max-threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.n_threads = std::stoi(argv[i]);
+        }
+        else if (arg == "-C" || arg == "--cpu-mask") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string mask = argv[i];
+            params.cpuparams.mask_valid = true;
+            invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+        }
+        else if (arg == "--prio") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.cpuparams.priority = std::stoul(argv[i]);
+        } else if (arg == "--cpu-strict") {
+            params.cpuparams.strict_cpu = true;
+        } else if (arg == "--poll") {
+            params.cpuparams.poll = true;
         } else if (arg == "-fa" || arg == "--flash-attn") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1303,6 +1362,23 @@ int main(int argc, char ** argv) {
     llama_model * lmodel = nullptr;
     const cmd_params_instance * prev_inst = nullptr;
 
+    postprocess_cpu_params(params.cpuparams);
+
+    struct ggml_threadpool_params tpp;
+    tpp.n_threads = params.cpuparams.n_threads;
+    tpp.mask_specified = params.cpuparams.mask_valid;
+    tpp.strict_cpu = params.cpuparams.strict_cpu;
+    tpp.prio = params.cpuparams.priority;
+    tpp.poll = params.cpuparams.poll;
+    std::memcpy(&tpp.cpumask[0], &params.cpuparams.cpumask[0], GGML_N_CORES_MAX);
+
+    struct ggml_compute_threadpool* threadpool = ggml_create_threadpool(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
     for (const auto & inst : params_instances) {
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1329,21 +1405,6 @@ int main(int argc, char ** argv) {
         llama_kv_cache_clear(ctx);
 
-        struct ggml_threadpool_params tpp;
-        tpp.n_threads = t.n_threads;
-
-        // TODO: expose these via cli opts
-        tpp.mask_specified = false;
-        tpp.strict_cpu = false;
-        tpp.prio = 1;
-        tpp.poll = false;
-
-        struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);
-        if (!threadpool) {
-            LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
-            exit(1);
-        }
-
         llama_set_n_threads(ctx, t.n_threads, t.n_threads);
         llama_attach_threadpool(ctx, threadpool);
@@ -1378,8 +1439,8 @@ int main(int argc, char ** argv) {
         llama_free(ctx);
-        ggml_release_threadpool(threadpool);
     }
 
+    ggml_release_threadpool(threadpool);
+
     llama_free_model(lmodel);
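Taken together, the llama-bench hunks above hoist the threadpool out of the per-test loop: it is now built once from the CLI-level cpu_params, attached to each context, and released after all tests finish. Condensed from the diff (model/context setup elided):

postprocess_cpu_params(params.cpuparams);     // resolve unset thread counts and empty masks

struct ggml_threadpool_params tpp;            // filled field-by-field from params.cpuparams
struct ggml_compute_threadpool * threadpool = ggml_create_threadpool(&tpp);

for (const auto & inst : params_instances) {
    // ... create or reuse lmodel/ctx for this test instance ...
    llama_set_n_threads(ctx, t.n_threads, t.n_threads);
    llama_attach_threadpool(ctx, threadpool); // same pool reused across all tests
    // ... run the benchmark repetitions ...
    llama_free(ctx);
}

ggml_release_threadpool(threadpool);          // moved outside the loop by this commit
llama_free_model(lmodel);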

ggml.h

@@ -274,7 +274,7 @@
 #define GGML_UNREACHABLE() ((void) 0)
 #endif
 
-#define GGML_N_CORES_MAX 512
+#define GGML_N_CORES_MAX 16
 
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
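GGML_N_CORES_MAX sizes every affinity mask this commit touches: the cpumask fields above are bool arrays of that length, and main() memcpys exactly GGML_N_CORES_MAX bytes into the threadpool params. Shrinking it from 512 to 16 therefore cuts each mask to 16 bytes but also caps the number of addressable cores at 16. A compile-time check sketch (struct layout paraphrased from the llama-bench hunk, not diff content; assumes the usual 1-byte bool):

#include <cstdint>

#define GGML_N_CORES_MAX 16

struct cpu_params_sketch {
    int32_t n_threads = -1;
    bool    cpumask[GGML_N_CORES_MAX] = { false }; // one bool per core
};

// std::memcpy in main() copies GGML_N_CORES_MAX bytes; with a 1-byte bool
// that is exactly the whole mask.
static_assert(sizeof(cpu_params_sketch::cpumask) == GGML_N_CORES_MAX,
              "mask size must match the byte count copied into ggml_threadpool_params");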