Fixed a number of issues with the move from BOOL to ggml_numa_strategies. Added a note about mirror mode not being implemented yet

This commit is contained in:
root 2024-02-07 19:49:07 +00:00
parent 60b80b0e8a
commit c43808c625
10 changed files with 37 additions and 24 deletions

View file

@ -659,11 +659,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break; break;
} else { } else {
std::string value(argv[i]); std::string value(argv[i]);
/**/ if (value == "interleave" || value == "" ) { params.numa = LLAMA_NUMA_STRATEGY_INTERLEAVE; } /**/ if (value == "interleave" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_INTERLEAVE; }
else if (value == "isolate") { params.numa = LLAMA_NUMA_STRATEGY_ISOLATE; } else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
else if (value == "numactl") { params.numa = LLAMA_NUMA_STRATEGY_NUMACTL; } else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
#ifdef GGUF_NUMA_MIRROR #ifdef GGUF_NUMA_MIRROR
else if (value == "mirror") { params.numa = LLAMA_NUMA_STRATEGY_MIRROR; } else if (value == "mirror") { params.numa = GGML_NUMA_STRATEGY_MIRROR; }
#endif #endif
else { invalid_param = true; break; } else { invalid_param = true; break; }
} }
@ -1012,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n"); printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
printf(" - numactl: use the CPU map provided by numactl\n"); printf(" - numactl: use the CPU map provided by numactl\n");
#ifdef GGML_NUMA_MIRROR #ifdef GGML_NUMA_MIRROR
printf(" - mirror: attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n"); printf(" - mirror: NOT YET IMPLEMENTED - attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
#endif #endif
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n"); printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n"); printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");

View file

@ -76,7 +76,7 @@ struct gpt_params {
float yarn_beta_slow = 1.0f; // YaRN high correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length int32_t yarn_orig_ctx = 0; // YaRN original context length
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
int32_t numa = LLAMA_NUMA_STRATEGY_DISABLED; ggml_numa_strategies numa = GGML_NUMA_STRATEGY_DISABLED;
// // sampling parameters // // sampling parameters
struct llama_sampling_params sparams; struct llama_sampling_params sparams;

View file

@ -1151,7 +1151,7 @@ int main(int argc, char ** argv) {
if (!params.verbose) { if (!params.verbose) {
llama_log_set(llama_null_log_callback, NULL); llama_log_set(llama_null_log_callback, NULL);
} }
bool numa = false; enum ggml_numa_strategies numa = GGML_NUMA_STRATEGY_DISABLED;
llama_backend_init(numa); llama_backend_init(numa);
// initialize printer // initialize printer

View file

@ -274,7 +274,7 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) { Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jint32 numa) {
llama_backend_init(numa); llama_backend_init(numa);
} }

View file

@ -237,7 +237,7 @@ int main(int argc, char ** argv) {
params.imatrix = &imatrix_data; params.imatrix = &imatrix_data;
} }
llama_backend_init(false); llama_backend_init(GGML_NUMA_STRATEGY_DISABLED);
// parse command line arguments // parse command line arguments
const std::string fname_inp = argv[arg_idx]; const std::string fname_inp = argv[arg_idx];

View file

@ -1821,7 +1821,13 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
{ {
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
} }
printf(" --numa attempt optimizations that help on some NUMA systems\n"); printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
printf(" - interleave: (default) spread execution evenly over all nodes\n");
printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
printf(" - numactl: use the CPU map provided by numactl\n");
#ifdef GGML_NUMA_MIRROR
printf(" - mirror: NOT YET IMPLEMENTED - attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
#endif
if (llama_supports_gpu_offload()) { if (llama_supports_gpu_offload()) {
printf(" -ngl N, --n-gpu-layers N\n"); printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n"); printf(" number of layers to store in VRAM\n");
@ -2228,9 +2234,25 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
{ {
params.use_mmap = false; params.use_mmap = false;
} }
else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true;
break;
} else {
std::string value(argv[i]);
/**/ if (value == "interleave" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_INTERLEAVE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
#ifdef GGUF_NUMA_MIRROR
else if (value == "mirror") { params.numa = GGML_NUMA_STRATEGY_MIRROR; }
#endif
else { invalid_param = true; break; }
}
}
else if (arg == "--numa") else if (arg == "--numa")
{ {
params.numa = true; params.numa = GGML_NUMA_STRATEGY_DISABLED;
} }
else if (arg == "--embedding") else if (arg == "--embedding")
{ {

View file

@ -17,7 +17,7 @@ int main(int argc, char ** argv) {
const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids"; const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
llama_backend_init(false); llama_backend_init(GGML_NUMA_STRATEGY_DISABLED);
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_default_params();
model_params.vocab_only = true; model_params.vocab_only = true;

2
ggml.c
View file

@ -25,7 +25,7 @@
#include <signal.h> #include <signal.h>
#ifdef GGML_NUMA_MIRROR #ifdef GGML_NUMA_MIRROR
#include <numanor.h> #include <numa.h>
#endif #endif
#ifdef GGML_USE_METAL #ifdef GGML_USE_METAL

View file

@ -10327,7 +10327,7 @@ bool llama_mlock_supported(void) {
return llama_supports_mlock(); return llama_supports_mlock();
} }
void llama_backend_init(uint32_t numa) { void llama_backend_init(enum ggml_numa_strategies numa) {
ggml_time_init(); ggml_time_init();
// needed to initialize f16 tables // needed to initialize f16 tables

11
llama.h
View file

@ -111,15 +111,6 @@ extern "C" {
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
}; };
enum llama_numa_strategies {
LLAMA_NUMA_STRATEGY_DISABLED = 0,
LLAMA_NUMA_STRATEGY_INTERLEAVE = 1,
LLAMA_NUMA_STRATEGY_ISOLATE = 2,
LLAMA_NUMA_STRATEGY_NUMACTL = 3,
LLAMA_NUMA_STRATEGY_MIRROR = 4,
LLAMA_NUMA_STRATEGY_MAX_VALUE = LLAMA_NUMA_STRATEGY_MIRROR,
};
enum llama_split_mode { enum llama_split_mode {
LLAMA_SPLIT_NONE = 0, // single GPU LLAMA_SPLIT_NONE = 0, // single GPU
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
@ -313,7 +304,7 @@ extern "C" {
// Initialize the llama + ggml backend // Initialize the llama + ggml backend
// If numa is true, use NUMA optimizations // If numa is true, use NUMA optimizations
// Call once at the start of the program // Call once at the start of the program
LLAMA_API void llama_backend_init(uint32_t numa); LLAMA_API void llama_backend_init(enum ggml_numa_strategies numa);
// Call once at the end of the program - currently only used for MPI // Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void); LLAMA_API void llama_backend_free(void);