make --numa a param

zrm 2023-06-17 15:03:14 -04:00
parent 8502d5178e
commit bf83dcb279
4 changed files with 10 additions and 1 deletion

View file

@@ -288,6 +288,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
+        } else if (arg == "--numa") {
+            params.numa = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -421,6 +423,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n");
+    fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
     fprintf(stderr, " -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, " number of layers to store in VRAM\n");
     fprintf(stderr, " --mtest compute maximum memory usage\n");
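
Taken together, the two hunks above teach gpt_params_parse a --numa flag and document it in the usage text. Purely as an illustration of that parsing pattern, here is a self-contained sketch; params_sketch is a stand-in for the real gpt_params so the snippet compiles on its own, everything else mirrors the branch added above:

#include <cstdio>
#include <cstring>

// Stand-in for gpt_params; only the field this commit adds.
struct params_sketch {
    bool numa = false;
};

int main(int argc, char ** argv) {
    params_sketch params;
    for (int i = 1; i < argc; i++) {
        // Mirrors the new "--numa" branch added to gpt_params_parse above.
        if (std::strcmp(argv[i], "--numa") == 0) {
            params.numa = true;
        }
    }
    std::fprintf(stderr, "numa optimizations: %s\n", params.numa ? "on" : "off");
    return 0;
}

Running the sketch with --numa on the command line prints "numa optimizations: on"; without the flag it stays off, matching the new default in gpt_params.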

View file

@@ -70,6 +70,7 @@ struct gpt_params {
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool mem_test = false; // compute maximum memory usage
+    bool numa = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt = false; // print prompt tokens before generation
 };

View file

@@ -5,6 +5,7 @@
 #include "common.h"
 #include "llama.h"
+#include "ggml.h"
 #include "build-info.h"
 #include <cassert>
@@ -97,6 +98,9 @@ int main(int argc, char ** argv) {
     }
     llama_init_backend();
+    if (params.numa) {
+        ggml_numa_init();
+    }
     llama_context * ctx;
     g_ctx = &ctx;
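
With this change NUMA setup becomes opt-in: main() calls ggml_numa_init() only when the new flag is set, and llama_init_backend() stops doing it unconditionally (see the last hunk below). A condensed sketch of the resulting start-up order, assuming the common.h / llama.h / ggml.h headers from this revision are on the include path; everything not shown in the hunk above is elided:

#include "common.h"
#include "llama.h"
#include "ggml.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    llama_init_backend();   // backend init, now without any NUMA work
    if (params.numa) {
        ggml_numa_init();   // only touch NUMA when the user asked for it
    }

    // ... model loading and generation continue as before ...
    return 0;
}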

View file

@@ -851,7 +851,6 @@ bool llama_mlock_supported() {
 void llama_init_backend() {
     ggml_time_init();
-    ggml_numa_init();
     // needed to initialize f16 tables
     {