From d919c6da2dfd1b22648967be7bd9d3b0fd1358dd Mon Sep 17 00:00:00 2001 From: root Date: Tue, 6 Feb 2024 08:54:14 +0000 Subject: [PATCH] Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h --- Makefile | 4 +-- common/common.cpp | 27 ++++++++++++++++---- common/common.h | 2 +- ggml.c | 64 +++++++++++++++++++++++++++++++++++++++++++---- ggml.h | 16 ++++++++++-- llama.cpp | 10 ++++---- llama.h | 11 +++++++- 7 files changed, 113 insertions(+), 21 deletions(-) diff --git a/Makefile b/Makefile index ba73f0637..2c051068b 100644 --- a/Makefile +++ b/Makefile @@ -265,8 +265,8 @@ ifndef RISCV ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) # Use all CPU extensions that are available: - MK_CFLAGS += -march=native -mtune=native - HOST_CXXFLAGS += -march=native -mtune=native + MK_CFLAGS += -march=znver4 -mtune=znver4 + HOST_CXXFLAGS += -march=znver4 -mtune=znver4 # Usage AVX-only #MK_CFLAGS += -mfma -mf16c -mavx diff --git a/common/common.cpp b/common/common.cpp index 8c1a60583..c198706cc 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -666,7 +666,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } else if (arg == "--no-mmap") { params.use_mmap = false; } else if (arg == "--numa") { - params.numa = true; + if (++i >= argc) { + invalid_param = true; + break; + } else { + std::string value(argv[i]); + /**/ if (value == "interleave" || value == "" ) { params.numa = LLAMA_NUMA_STRATEGY_INTERLEAVE; } + else if (value == "isolate") { params.numa = LLAMA_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = LLAMA_NUMA_STRATEGY_NUMACTL; } +#ifdef GGML_NUMA_MIRROR + else if (value == "mirror") { params.numa = LLAMA_NUMA_STRATEGY_MIRROR; } +#endif + else { invalid_param = true; break; } + } } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "--no-display-prompt") { @@ -922,7 +934,7 @@ void gpt_print_usage(int /*argc*/, char 
** argv, const gpt_params & params) { printf(" -tb N, --threads-batch N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); printf(" -td N, --threads-draft N"); - printf(" number of threads to use during generation (default: same as --threads)"); + printf(" number of threads to use during generation (default: same as --threads)\n"); printf(" -tbd N, --threads-batch-draft N\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); printf(" -p PROMPT, --prompt PROMPT\n"); @@ -992,7 +1004,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n"); printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks); - printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base"); + printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n"); printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); @@ -1009,7 +1021,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { if (llama_supports_mmap()) { printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } - printf(" --numa attempt optimizations that help on some NUMA systems\n"); + printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n"); + printf(" - 
interleave: (default) spread execution evenly over all nodes\n"); + printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n"); + printf(" - numactl: use the CPU map provided by numactl\n"); +#ifdef GGML_NUMA_MIRROR + printf(" - mirror: attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n"); +#endif printf(" if run without this previously, it is recommended to drop the system page cache before using this\n"); printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n"); if (llama_supports_gpu_offload()) { @@ -1635,7 +1653,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false"); fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false"); - fprintf(stream, "numa: %s # default: false\n", params.numa ? 
"true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); diff --git a/common/common.h b/common/common.h index 62de25d6a..9b20c6f6f 100644 --- a/common/common.h +++ b/common/common.h @@ -76,6 +76,7 @@ struct gpt_params { float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; + int32_t numa = LLAMA_NUMA_STRATEGY_DISABLED; // // sampling parameters struct llama_sampling_params sparams; @@ -134,7 +135,6 @@ struct gpt_params { bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory - bool numa = false; // attempt optimizations that help on some NUMA systems bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation bool infill = false; // use infill mode diff --git a/ggml.c b/ggml.c index b9ec0c981..c9a5f4b88 100644 --- a/ggml.c +++ b/ggml.c @@ -24,6 +24,10 @@ #include #include +#ifdef GGML_NUMA_MIRROR +#include +#endif + #ifdef GGML_USE_METAL #include #endif @@ -1912,9 +1916,12 @@ struct ggml_numa_node { }; struct ggml_numa_nodes { + uint32_t numa_strategy; struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; uint32_t n_nodes; uint32_t total_cpus; // hardware threads on system + uint32_t current_node; // node on which main process is execting + cpu_set_t cpuset; // cpuset from numactl }; // @@ -1948,7 +1955,7 @@ inline static void ggml_critical_section_end(void) { atomic_fetch_sub(&g_state_barrier, 1); } -void ggml_numa_init(void) { +void ggml_numa_init(uint32_t numa_flag) { if (g_state.numa.n_nodes > 0) { fprintf(stderr, "ggml_numa_init: NUMA already initialized\n"); 
@@ -1960,6 +1967,13 @@ void ggml_numa_init(void) { char path[256]; int rv; + // set numa scheme + g_state.numa.numa_strategy = numa_flag; + + GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy); + + g_state.numa.cpuset = ggml_get_numa_affinity(); + // enumerate nodes while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); @@ -1978,11 +1992,17 @@ void ggml_numa_init(void) { GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); - if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) { + // figure out which node we're on + uint current_cpu; + int getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node); + + if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) { g_state.numa.n_nodes = 0; return; } + GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu); + for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { struct ggml_numa_node * node = &g_state.numa.nodes[n]; GGML_PRINT_DEBUG("CPUs on node %u:", n); @@ -2013,6 +2033,15 @@ void ggml_numa_init(void) { #endif } +cpu_set_t ggml_get_numa_affinity(void) { + cpu_set_t cpuset; + pthread_t thread; + thread = pthread_self(); + CPU_ZERO(&cpuset); + int ret = pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset); + return cpuset; +} + bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; } @@ -16587,11 +16616,36 @@ static void set_numa_thread_affinity(int thread_n, int n_threads) { return; } - // run thread on node_num thread_n / (threads per node) - const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes); - struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; + int node_num; size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + switch(g_state.numa.numa_strategy) { + case GGML_NUMA_STRATEGY_INTERLEAVE: + // run thread on node_num 
thread_n / (threads per node) + node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes); + break; + case GGML_NUMA_STRATEGY_ISOLATE: + // run thread on current_node + node_num = g_state.numa.current_node; + break; + case GGML_NUMA_STRATEGY_NUMACTL: + // use the cpuset that numactl gave us + int rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset); + if (rv) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", + strerror(rv)); + } + return; +#ifdef GGML_NUMA_MIRROR + case GGML_NUMA_STRATEGY_MIRROR: + printf("Mirror Mode Enabled"); +#endif + default: + return; + } + + struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); CPU_ZERO_S(setsize, cpus); for (size_t i = 0; i < node->n_cpus; ++i) { diff --git a/ggml.h b/ggml.h index e0a4799f3..44c45d4ef 100644 --- a/ggml.h +++ b/ggml.h @@ -217,6 +217,7 @@ #include #include #include +#include #define GGML_FILE_MAGIC 0x67676d6c // "ggml" #define GGML_FILE_VERSION 1 @@ -647,6 +648,16 @@ extern "C" { void * wdata; }; + // numa strategies + enum ggml_numa_strategies { + GGML_NUMA_STRATEGY_DISABLED = 0, + GGML_NUMA_STRATEGY_INTERLEAVE = 1, + GGML_NUMA_STRATEGY_ISOLATE = 2, + GGML_NUMA_STRATEGY_NUMACTL = 3, + GGML_NUMA_STRATEGY_MIRROR = 4, + GGML_NUMA_STRATEGY_MAX_VALUE = GGML_NUMA_STRATEGY_MIRROR, + }; + // misc GGML_API void ggml_time_init(void); // call this once at the beginning of the program @@ -657,8 +668,9 @@ extern "C" { GGML_API void ggml_print_backtrace(void); - GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems - GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node + GGML_API void ggml_numa_init(uint32_t numa); // call once for better performance on NUMA systems + GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node + GGML_API cpu_set_t ggml_get_numa_affinity(void); // get cpuset 
from numactl GGML_API void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_print_objects(const struct ggml_context * ctx); diff --git a/llama.cpp b/llama.cpp index 65e399adc..4358aae43 100644 --- a/llama.cpp +++ b/llama.cpp @@ -949,7 +949,7 @@ struct llama_mmap { int fd = fileno(file->fp); int flags = MAP_SHARED; // prefetch/readahead impairs performance on NUMA systems - if (numa) { prefetch = 0; } + if (numa > 0) { prefetch = 0; } #ifdef __linux__ // advise the kernel to read the file sequentially (increases readahead) if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) { @@ -970,7 +970,7 @@ struct llama_mmap { strerror(errno)); } } - if (numa) { + if (numa > 0) { // advise the kernel not to use readahead // (because the next page might not belong on the same node) if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) { @@ -10327,7 +10327,7 @@ bool llama_mlock_supported(void) { return llama_supports_mlock(); } -void llama_backend_init(bool numa) { +void llama_backend_init(uint32_t numa) { ggml_time_init(); // needed to initialize f16 tables @@ -10337,8 +10337,8 @@ void llama_backend_init(bool numa) { ggml_free(ctx); } - if (numa) { - ggml_numa_init(); + if (numa > 0) { + ggml_numa_init(numa); } #ifdef GGML_USE_MPI diff --git a/llama.h b/llama.h index cec4158bc..378730b42 100644 --- a/llama.h +++ b/llama.h @@ -111,6 +111,15 @@ extern "C" { LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, }; + enum llama_numa_strategies { + LLAMA_NUMA_STRATEGY_DISABLED = 0, + LLAMA_NUMA_STRATEGY_INTERLEAVE = 1, + LLAMA_NUMA_STRATEGY_ISOLATE = 2, + LLAMA_NUMA_STRATEGY_NUMACTL = 3, + LLAMA_NUMA_STRATEGY_MIRROR = 4, + LLAMA_NUMA_STRATEGY_MAX_VALUE = LLAMA_NUMA_STRATEGY_MIRROR, + }; + enum llama_split_mode { LLAMA_SPLIT_NONE = 0, // single GPU LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs @@ -304,7 +313,7 @@ extern "C" { // Initialize the llama + ggml backend // If numa is true, use NUMA optimizations // Call once at the start of the program 
- LLAMA_API void llama_backend_init(bool numa); + LLAMA_API void llama_backend_init(uint32_t numa); // Call once at the end of the program - currently only used for MPI LLAMA_API void llama_backend_free(void);