make --numa a param
parent 8502d5178e
commit bf83dcb279
4 changed files with 10 additions and 1 deletion
@@ -288,6 +288,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
+        } else if (arg == "--numa") {
+            params.numa = true;
         } else if (arg == "--verbose-prompt") {
             params.verbose_prompt = true;
         } else if (arg == "-r" || arg == "--reverse-prompt") {
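With this hunk in place, --numa is accepted as a standalone flag (it takes no value) and simply sets params.numa for the rest of the run, e.g. ./main -m <model> --numa -p "<prompt>", where the model path and prompt are placeholders.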
@@ -421,6 +423,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     if (llama_mmap_supported()) {
         fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
+    fprintf(stderr, "  --numa                attempt optimizations that help on some NUMA systems\n");
+    fprintf(stderr, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
+    fprintf(stderr, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
     fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
     fprintf(stderr, "                        number of layers to store in VRAM\n");
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
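For reference, the three added lines would appear in the --help output roughly as follows (column spacing approximate):

  --numa                attempt optimizations that help on some NUMA systems
                        if run without this previously, it is recommended to drop the system page cache before using this
                        see https://github.com/ggerganov/llama.cpp/issues/1437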
@@ -70,6 +70,7 @@ struct gpt_params {
     bool use_mmap = true; // use mmap for faster loads
     bool use_mlock = false; // use mlock to keep model in memory
     bool mem_test = false; // compute maximum memory usage
+    bool numa = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt = false; // print prompt tokens before generation
 };

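A minimal sketch (not part of the commit) of how the new field behaves for code that fills in gpt_params directly instead of going through the argument parser:

#include "common.h"

int main() {
    gpt_params params;   // params.numa is false by default
    params.numa = true;  // same effect as passing --numa on the command line
    return 0;
}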
@@ -5,6 +5,7 @@

 #include "common.h"
 #include "llama.h"
+#include "ggml.h"
 #include "build-info.h"

 #include <cassert>
@@ -97,6 +98,9 @@ int main(int argc, char ** argv) {
     }

     llama_init_backend();
+    if (params.numa) {
+        ggml_numa_init();
+    }

     llama_context * ctx;
     g_ctx = &ctx;
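A minimal sketch (not part of the commit) of the resulting startup order in main(), assuming the surrounding error handling matches the existing file; the rest of main() is omitted:

#include "common.h"
#include "llama.h"
#include "ggml.h"

int main(int argc, char ** argv) {
    gpt_params params;
    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    llama_init_backend();  // backend init no longer touches NUMA state
    if (params.numa) {
        ggml_numa_init();  // opt-in: only runs when --numa was passed
    }

    // ... model loading and generation would continue here ...
    return 0;
}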
@@ -851,7 +851,6 @@ bool llama_mlock_supported() {

 void llama_init_backend() {
     ggml_time_init();
-    ggml_numa_init();

     // needed to initialize f16 tables
     {
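With this removal, llama_init_backend() is limited to timer and f16-table initialization; NUMA setup is now opt-in at the call site, as in the main() hunk above.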