Fixed a number of issues with the move from BOOL to ggml_numa_strategies. Added a note about mirror mode not being implemented yet
This commit is contained in:
parent
60b80b0e8a
commit
c43808c625
10 changed files with 37 additions and 24 deletions
|
@ -659,11 +659,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
} else {
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "interleave" || value == "" ) { params.numa = LLAMA_NUMA_STRATEGY_INTERLEAVE; }
|
||||
else if (value == "isolate") { params.numa = LLAMA_NUMA_STRATEGY_ISOLATE; }
|
||||
else if (value == "numactl") { params.numa = LLAMA_NUMA_STRATEGY_NUMACTL; }
|
||||
/**/ if (value == "interleave" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_INTERLEAVE; }
|
||||
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
||||
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
||||
#ifdef GGUF_NUMA_MIRROR
|
||||
else if (value == "mirror") { params.numa = LLAMA_NUMA_STRATEGY_MIRROR; }
|
||||
else if (value == "mirror") { params.numa = GGML_NUMA_STRATEGY_MIRROR; }
|
||||
#endif
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
|
@ -1012,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
|
||||
printf(" - numactl: use the CPU map provided my numactl\n");
|
||||
#ifdef GGML_NUMA_MIRROR
|
||||
printf(" - mirror: attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
|
||||
printf(" - mirror: NOT YET IMPLEMENTED - attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
|
||||
#endif
|
||||
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
|
||||
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
||||
|
|
|
@ -76,7 +76,7 @@ struct gpt_params {
|
|||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||
int32_t numa = LLAMA_NUMA_STRATEGY_DISABLED;
|
||||
ggml_numa_strategies numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
|
||||
// // sampling parameters
|
||||
struct llama_sampling_params sparams;
|
||||
|
|
|
@ -1151,7 +1151,7 @@ int main(int argc, char ** argv) {
|
|||
if (!params.verbose) {
|
||||
llama_log_set(llama_null_log_callback, NULL);
|
||||
}
|
||||
bool numa = false;
|
||||
enum ggml_numa_strategies numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
llama_backend_init(numa);
|
||||
|
||||
// initialize printer
|
||||
|
|
|
@ -274,7 +274,7 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
|
|||
|
||||
extern "C"
|
||||
JNIEXPORT void JNICALL
|
||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jboolean numa) {
|
||||
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject, jint32 numa) {
|
||||
llama_backend_init(numa);
|
||||
}
|
||||
|
||||
|
|
|
@ -237,7 +237,7 @@ int main(int argc, char ** argv) {
|
|||
params.imatrix = &imatrix_data;
|
||||
}
|
||||
|
||||
llama_backend_init(false);
|
||||
llama_backend_init(GGML_NUMA_STRATEGY_DISABLED);
|
||||
|
||||
// parse command line arguments
|
||||
const std::string fname_inp = argv[arg_idx];
|
||||
|
|
|
@ -1821,7 +1821,13 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
{
|
||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
||||
printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
|
||||
printf(" - interleave: (default) spread execution evenly over all nodes\n");
|
||||
printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
|
||||
printf(" - numactl: use the CPU map provided my numactl\n");
|
||||
#ifdef GGML_NUMA_MIRROR
|
||||
printf(" - mirror: NOT YET IMPLEMENTED - attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
|
||||
#endif
|
||||
if (llama_supports_gpu_offload()) {
|
||||
printf(" -ngl N, --n-gpu-layers N\n");
|
||||
printf(" number of layers to store in VRAM\n");
|
||||
|
@ -2228,9 +2234,25 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
{
|
||||
params.use_mmap = false;
|
||||
}
|
||||
else if (arg == "--numa") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
} else {
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "interleave" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_INTERLEAVE; }
|
||||
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
|
||||
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
|
||||
#ifdef GGUF_NUMA_MIRROR
|
||||
else if (value == "mirror") { params.numa = GGML_NUMA_STRATEGY_MIRROR; }
|
||||
#endif
|
||||
else { invalid_param = true; break; }
|
||||
}
|
||||
}
|
||||
|
||||
else if (arg == "--numa")
|
||||
{
|
||||
params.numa = true;
|
||||
params.numa = GGML_NUMA_STRATEGY_DISABLED;
|
||||
}
|
||||
else if (arg == "--embedding")
|
||||
{
|
||||
|
|
|
@ -17,7 +17,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
|
||||
|
||||
llama_backend_init(false);
|
||||
llama_backend_init(GGML_NUMA_STRATEGY_DISABLED);
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
model_params.vocab_only = true;
|
||||
|
|
2
ggml.c
2
ggml.c
|
@ -25,7 +25,7 @@
|
|||
#include <signal.h>
|
||||
|
||||
#ifdef GGML_NUMA_MIRROR
|
||||
#include <numanor.h>
|
||||
#include <numa.h>
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
|
|
|
@ -10327,7 +10327,7 @@ bool llama_mlock_supported(void) {
|
|||
return llama_supports_mlock();
|
||||
}
|
||||
|
||||
void llama_backend_init(uint32_t numa) {
|
||||
void llama_backend_init(enum ggml_numa_strategies numa) {
|
||||
ggml_time_init();
|
||||
|
||||
// needed to initialize f16 tables
|
||||
|
|
11
llama.h
11
llama.h
|
@ -111,15 +111,6 @@ extern "C" {
|
|||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||
};
|
||||
|
||||
enum llama_numa_strategies {
|
||||
LLAMA_NUMA_STRATEGY_DISABLED = 0,
|
||||
LLAMA_NUMA_STRATEGY_INTERLEAVE = 1,
|
||||
LLAMA_NUMA_STRATEGY_ISOLATE = 2,
|
||||
LLAMA_NUMA_STRATEGY_NUMACTL = 3,
|
||||
LLAMA_NUMA_STRATEGY_MIRROR = 4,
|
||||
LLAMA_NUMA_STRATEGY_MAX_VALUE = LLAMA_NUMA_STRATEGY_MIRROR,
|
||||
};
|
||||
|
||||
enum llama_split_mode {
|
||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||
|
@ -313,7 +304,7 @@ extern "C" {
|
|||
// Initialize the llama + ggml backend
|
||||
// If numa is true, use NUMA optimizations
|
||||
// Call once at the start of the program
|
||||
LLAMA_API void llama_backend_init(uint32_t numa);
|
||||
LLAMA_API void llama_backend_init(enum ggml_numa_strategies numa);
|
||||
|
||||
// Call once at the end of the program - currently only used for MPI
|
||||
LLAMA_API void llama_backend_free(void);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue