make --numa a param

zrm 2023-06-17 15:03:14 -04:00
parent 8502d5178e
commit bf83dcb279
4 changed files with 10 additions and 1 deletion


@@ -288,6 +288,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
+        } else if (arg == "--numa") {
+            params.numa = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
@@ -421,6 +423,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    fprintf(stderr, "  --numa                attempt optimizations that help on some NUMA systems\n");
+    fprintf(stderr, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stderr, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");

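Taken together, the two hunks above follow the same pattern as the existing --no-mmap and --mtest options: the parser flips a boolean in gpt_params when the argument is seen, and gpt_print_usage gains a matching help line. A minimal, self-contained sketch of that pattern (illustrative only; the demo_* names are hypothetical and not part of llama.cpp):

    #include <cstdio>
    #include <string>

    struct demo_params {
        bool numa = false; // attempt NUMA optimizations when true
    };

    // Flip booleans in the params struct and report unknown arguments,
    // mirroring how gpt_params_parse handles its flags.
    static bool demo_parse(int argc, char ** argv, demo_params & params) {
        for (int i = 1; i < argc; i++) {
            std::string arg = argv[i];
            if (arg == "--numa") {
                params.numa = true;
            } else {
                fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
                return false;
            }
        }
        return true;
    }

    static void demo_print_usage() {
        fprintf(stderr, "  --numa                attempt optimizations that help on some NUMA systems\n");
    }

    int main(int argc, char ** argv) {
        demo_params params;
        if (!demo_parse(argc, argv, params)) {
            demo_print_usage();
            return 1;
        }
        fprintf(stderr, "numa: %s\n", params.numa ? "enabled" : "disabled");
        return 0;
    }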

@@ -70,6 +70,7 @@ struct gpt_params {
     bool use_mmap        = true;  // use mmap for faster loads
     bool use_mlock       = false; // use mlock to keep model in memory
     bool mem_test        = false; // compute maximum memory usage
+    bool numa            = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt  = false; // print prompt tokens before generation
 };


@@ -5,6 +5,7 @@
 #include "common.h"
 #include "llama.h"
+#include "ggml.h"
 #include "build-info.h"

 #include <cassert>
@@ -97,6 +98,9 @@ int main(int argc, char ** argv) {
     }

     llama_init_backend();
+    if (params.numa) {
+        ggml_numa_init();
+    }

     llama_context * ctx;
     g_ctx = &ctx;

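The hunk above makes NUMA setup explicit in the example: llama_init_backend() still runs unconditionally, but ggml_numa_init() is only called when the user passed --numa. Any other program built on common.h that wants the same behaviour now needs its own guard; a rough sketch of that startup sequence, using only declarations visible in this diff (not a verbatim copy of any file in the repository):

    #include "common.h"
    #include "llama.h"
    #include "ggml.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }

        llama_init_backend();

        // NUMA setup is opt-in: only initialize it when --numa was given.
        if (params.numa) {
            ggml_numa_init();
        }

        // ... load the model and create a llama_context as the example already does ...
        return 0;
    }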

@@ -851,7 +851,6 @@ bool llama_mlock_supported() {

 void llama_init_backend() {
     ggml_time_init();
-    ggml_numa_init();

     // needed to initialize f16 tables
     {
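This removal is the other half of making --numa a parameter: the unconditional ggml_numa_init() call, presumably introduced earlier on this branch, moves out of llama_init_backend(), so backend initialization stays NUMA-agnostic and the behaviour is controlled entirely by the new flag.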