Added NUMA options to allow finer-grained control, as well as plumbing for a new mirror mode that will require numa.h
This commit is contained in:
parent
4ffc7a17d4
commit
d919c6da2d
7 changed files with 113 additions and 21 deletions
4
Makefile
4
Makefile
|
@ -265,8 +265,8 @@ ifndef RISCV
|
||||||
|
|
||||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
|
||||||
# Use all CPU extensions that are available:
|
# Use all CPU extensions that are available:
|
||||||
MK_CFLAGS += -march=native -mtune=native
|
MK_CFLAGS += -march=znver4 -mtune=znver4
|
||||||
HOST_CXXFLAGS += -march=native -mtune=native
|
HOST_CXXFLAGS += -march=znver4 -mtune=znver4
|
||||||
|
|
||||||
# Usage AVX-only
|
# Usage AVX-only
|
||||||
#MK_CFLAGS += -mfma -mf16c -mavx
|
#MK_CFLAGS += -mfma -mf16c -mavx
|
||||||
|
|
|
@ -666,7 +666,19 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
} else if (arg == "--no-mmap") {
|
} else if (arg == "--no-mmap") {
|
||||||
params.use_mmap = false;
|
params.use_mmap = false;
|
||||||
} else if (arg == "--numa") {
|
} else if (arg == "--numa") {
|
||||||
params.numa = true;
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
std::string value(argv[i]);
|
||||||
|
/**/ if (value == "interleave" || value == "" ) { params.numa = LLAMA_NUMA_STRATEGY_INTERLEAVE; }
|
||||||
|
else if (value == "isolate") { params.numa = LLAMA_NUMA_STRATEGY_ISOLATE; }
|
||||||
|
else if (value == "numactl") { params.numa = LLAMA_NUMA_STRATEGY_NUMACTL; }
|
||||||
|
#ifdef GGUF_NUMA_MIRROR
|
||||||
|
else if (value == "mirror") { params.numa = LLAMA_NUMA_STRATEGY_MIRROR; }
|
||||||
|
#endif
|
||||||
|
else { invalid_param = true; break; }
|
||||||
|
}
|
||||||
} else if (arg == "--verbose-prompt") {
|
} else if (arg == "--verbose-prompt") {
|
||||||
params.verbose_prompt = true;
|
params.verbose_prompt = true;
|
||||||
} else if (arg == "--no-display-prompt") {
|
} else if (arg == "--no-display-prompt") {
|
||||||
|
@ -922,7 +934,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" -tb N, --threads-batch N\n");
|
printf(" -tb N, --threads-batch N\n");
|
||||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||||
printf(" -td N, --threads-draft N");
|
printf(" -td N, --threads-draft N");
|
||||||
printf(" number of threads to use during generation (default: same as --threads)");
|
printf(" number of threads to use during generation (default: same as --threads)\n");
|
||||||
printf(" -tbd N, --threads-batch-draft N\n");
|
printf(" -tbd N, --threads-batch-draft N\n");
|
||||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
|
printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
|
||||||
printf(" -p PROMPT, --prompt PROMPT\n");
|
printf(" -p PROMPT, --prompt PROMPT\n");
|
||||||
|
@ -992,7 +1004,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
|
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
|
||||||
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
|
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
|
||||||
printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
|
printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
|
||||||
printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base");
|
printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n");
|
||||||
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||||
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
|
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
|
||||||
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||||
|
@ -1009,7 +1021,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
if (llama_supports_mmap()) {
|
if (llama_supports_mmap()) {
|
||||||
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||||
}
|
}
|
||||||
printf(" --numa attempt optimizations that help on some NUMA systems\n");
|
printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
|
||||||
|
printf(" - interleave: (default) spread execution evenly over all nodes\n");
|
||||||
|
printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
|
||||||
|
printf(" - numactl: use the CPU map provided by numactl\n");
|
||||||
|
#ifdef GGML_NUMA_MIRROR
|
||||||
|
printf(" - mirror: attempt to mirror GGUF data buffer on each node's local memory to increase throughput.\n");
|
||||||
|
#endif
|
||||||
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
|
printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
|
||||||
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
|
||||||
if (llama_supports_gpu_offload()) {
|
if (llama_supports_gpu_offload()) {
|
||||||
|
@ -1635,7 +1653,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
||||||
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
|
||||||
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false");
|
||||||
fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
|
fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false");
|
||||||
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
|
|
||||||
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
|
||||||
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
|
||||||
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
|
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
|
||||||
|
|
|
@ -76,6 +76,7 @@ struct gpt_params {
|
||||||
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
float yarn_beta_slow = 1.0f; // YaRN high correction dim
|
||||||
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
int32_t yarn_orig_ctx = 0; // YaRN original context length
|
||||||
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
|
int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED;
|
||||||
|
int32_t numa = LLAMA_NUMA_STRATEGY_DISABLED;
|
||||||
|
|
||||||
// // sampling parameters
|
// // sampling parameters
|
||||||
struct llama_sampling_params sparams;
|
struct llama_sampling_params sparams;
|
||||||
|
@ -134,7 +135,6 @@ struct gpt_params {
|
||||||
bool logits_all = false; // return logits for all tokens in the batch
|
bool logits_all = false; // return logits for all tokens in the batch
|
||||||
bool use_mmap = true; // use mmap for faster loads
|
bool use_mmap = true; // use mmap for faster loads
|
||||||
bool use_mlock = false; // use mlock to keep model in memory
|
bool use_mlock = false; // use mlock to keep model in memory
|
||||||
bool numa = false; // attempt optimizations that help on some NUMA systems
|
|
||||||
bool verbose_prompt = false; // print prompt tokens before generation
|
bool verbose_prompt = false; // print prompt tokens before generation
|
||||||
bool display_prompt = true; // print prompt before generation
|
bool display_prompt = true; // print prompt before generation
|
||||||
bool infill = false; // use infill mode
|
bool infill = false; // use infill mode
|
||||||
|
|
64
ggml.c
64
ggml.c
|
@ -24,6 +24,10 @@
|
||||||
#include <stdarg.h>
|
#include <stdarg.h>
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
|
|
||||||
|
#ifdef GGML_NUMA_MIRROR
|
||||||
|
#include <numanor.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
|
@ -1912,9 +1916,12 @@ struct ggml_numa_node {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_numa_nodes {
|
struct ggml_numa_nodes {
|
||||||
|
uint32_t numa_strategy;
|
||||||
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
|
struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
|
||||||
uint32_t n_nodes;
|
uint32_t n_nodes;
|
||||||
uint32_t total_cpus; // hardware threads on system
|
uint32_t total_cpus; // hardware threads on system
|
||||||
|
uint32_t current_node; // node on which main process is executing
|
||||||
|
cpu_set_t cpuset; // cpuset from numactl
|
||||||
};
|
};
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -1948,7 +1955,7 @@ inline static void ggml_critical_section_end(void) {
|
||||||
atomic_fetch_sub(&g_state_barrier, 1);
|
atomic_fetch_sub(&g_state_barrier, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_numa_init(void) {
|
void ggml_numa_init(uint32_t numa_flag) {
|
||||||
if (g_state.numa.n_nodes > 0) {
|
if (g_state.numa.n_nodes > 0) {
|
||||||
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
|
fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
|
||||||
|
|
||||||
|
@ -1960,6 +1967,13 @@ void ggml_numa_init(void) {
|
||||||
char path[256];
|
char path[256];
|
||||||
int rv;
|
int rv;
|
||||||
|
|
||||||
|
// set numa scheme
|
||||||
|
g_state.numa.numa_strategy = numa_flag;
|
||||||
|
|
||||||
|
GGML_PRINT_DEBUG("numa strategy %u\n",g_state.numa.numa_strategy);
|
||||||
|
|
||||||
|
g_state.numa.cpuset = ggml_get_numa_affinity();
|
||||||
|
|
||||||
// enumerate nodes
|
// enumerate nodes
|
||||||
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
|
while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
|
||||||
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
|
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
|
||||||
|
@ -1978,11 +1992,17 @@ void ggml_numa_init(void) {
|
||||||
|
|
||||||
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
|
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
|
||||||
|
|
||||||
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
|
// figure out which node we're on
|
||||||
|
uint current_cpu;
|
||||||
|
int getcpu_ret = getcpu(&current_cpu, &g_state.numa.current_node);
|
||||||
|
|
||||||
|
if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1 || getcpu_ret != 0) {
|
||||||
g_state.numa.n_nodes = 0;
|
g_state.numa.n_nodes = 0;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
GGML_PRINT_DEBUG("found our process on numa node %u, CPU %u\n", g_state.numa.current_node, current_cpu);
|
||||||
|
|
||||||
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
|
for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
|
||||||
struct ggml_numa_node * node = &g_state.numa.nodes[n];
|
struct ggml_numa_node * node = &g_state.numa.nodes[n];
|
||||||
GGML_PRINT_DEBUG("CPUs on node %u:", n);
|
GGML_PRINT_DEBUG("CPUs on node %u:", n);
|
||||||
|
@ -2013,6 +2033,15 @@ void ggml_numa_init(void) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cpu_set_t ggml_get_numa_affinity(void) {
|
||||||
|
cpu_set_t cpuset;
|
||||||
|
pthread_t thread;
|
||||||
|
thread = pthread_self();
|
||||||
|
CPU_ZERO(&cpuset);
|
||||||
|
int ret = pthread_getaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
|
||||||
|
return cpuset;
|
||||||
|
}
|
||||||
|
|
||||||
bool ggml_is_numa(void) {
|
bool ggml_is_numa(void) {
|
||||||
return g_state.numa.n_nodes > 1;
|
return g_state.numa.n_nodes > 1;
|
||||||
}
|
}
|
||||||
|
@ -16587,11 +16616,36 @@ static void set_numa_thread_affinity(int thread_n, int n_threads) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// run thread on node_num thread_n / (threads per node)
|
int node_num;
|
||||||
const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
|
|
||||||
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
|
|
||||||
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
|
size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus);
|
||||||
|
|
||||||
|
switch(g_state.numa.numa_strategy) {
|
||||||
|
case GGML_NUMA_STRATEGY_INTERLEAVE:
|
||||||
|
// run thread on node_num thread_n / (threads per node)
|
||||||
|
node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes);
|
||||||
|
break;
|
||||||
|
case GGML_NUMA_STRATEGY_ISOLATE:
|
||||||
|
// run thread on current_node
|
||||||
|
node_num = g_state.numa.current_node;
|
||||||
|
break;
|
||||||
|
case GGML_NUMA_STRATEGY_NUMACTL:
|
||||||
|
// use the cpuset that numactl gave us
|
||||||
|
int rv = pthread_setaffinity_np(pthread_self(), setsize, &g_state.numa.cpuset);
|
||||||
|
if (rv) {
|
||||||
|
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
|
||||||
|
strerror(rv));
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
#ifdef GGML_NUMA_MIRROR
|
||||||
|
case GGML_NUMA_STRATEGY_MIRROR:
|
||||||
|
printf("Mirror Mode Enabled");
|
||||||
|
#endif
|
||||||
|
default:
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_numa_node * node = &g_state.numa.nodes[node_num];
|
||||||
|
|
||||||
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
|
cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus);
|
||||||
CPU_ZERO_S(setsize, cpus);
|
CPU_ZERO_S(setsize, cpus);
|
||||||
for (size_t i = 0; i < node->n_cpus; ++i) {
|
for (size_t i = 0; i < node->n_cpus; ++i) {
|
||||||
|
|
14
ggml.h
14
ggml.h
|
@ -217,6 +217,7 @@
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
|
#include <sched.h>
|
||||||
|
|
||||||
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
||||||
#define GGML_FILE_VERSION 1
|
#define GGML_FILE_VERSION 1
|
||||||
|
@ -647,6 +648,16 @@ extern "C" {
|
||||||
void * wdata;
|
void * wdata;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// numa strategies
|
||||||
|
enum ggml_numa_strategies {
|
||||||
|
GGML_NUMA_STRATEGY_DISABLED = 0,
|
||||||
|
GGML_NUMA_STRATEGY_INTERLEAVE = 1,
|
||||||
|
GGML_NUMA_STRATEGY_ISOLATE = 2,
|
||||||
|
GGML_NUMA_STRATEGY_NUMACTL = 3,
|
||||||
|
GGML_NUMA_STRATEGY_MIRROR = 4,
|
||||||
|
GGML_NUMA_STRATEGY_MAX_VALUE = GGML_NUMA_STRATEGY_MIRROR,
|
||||||
|
};
|
||||||
|
|
||||||
// misc
|
// misc
|
||||||
|
|
||||||
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
||||||
|
@ -657,8 +668,9 @@ extern "C" {
|
||||||
|
|
||||||
GGML_API void ggml_print_backtrace(void);
|
GGML_API void ggml_print_backtrace(void);
|
||||||
|
|
||||||
GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
|
GGML_API void ggml_numa_init(uint32_t numa); // call once for better performance on NUMA systems
|
||||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||||
|
GGML_API cpu_set_t ggml_get_numa_affinity(void); // get cpuset from numactl
|
||||||
|
|
||||||
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
||||||
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
||||||
|
|
10
llama.cpp
10
llama.cpp
|
@ -949,7 +949,7 @@ struct llama_mmap {
|
||||||
int fd = fileno(file->fp);
|
int fd = fileno(file->fp);
|
||||||
int flags = MAP_SHARED;
|
int flags = MAP_SHARED;
|
||||||
// prefetch/readahead impairs performance on NUMA systems
|
// prefetch/readahead impairs performance on NUMA systems
|
||||||
if (numa) { prefetch = 0; }
|
if (numa > 0) { prefetch = 0; }
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
// advise the kernel to read the file sequentially (increases readahead)
|
// advise the kernel to read the file sequentially (increases readahead)
|
||||||
if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
|
if (posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL)) {
|
||||||
|
@ -970,7 +970,7 @@ struct llama_mmap {
|
||||||
strerror(errno));
|
strerror(errno));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (numa) {
|
if (numa > 0) {
|
||||||
// advise the kernel not to use readahead
|
// advise the kernel not to use readahead
|
||||||
// (because the next page might not belong on the same node)
|
// (because the next page might not belong on the same node)
|
||||||
if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
|
if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
|
||||||
|
@ -10327,7 +10327,7 @@ bool llama_mlock_supported(void) {
|
||||||
return llama_supports_mlock();
|
return llama_supports_mlock();
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_backend_init(bool numa) {
|
void llama_backend_init(uint32_t numa) {
|
||||||
ggml_time_init();
|
ggml_time_init();
|
||||||
|
|
||||||
// needed to initialize f16 tables
|
// needed to initialize f16 tables
|
||||||
|
@ -10337,8 +10337,8 @@ void llama_backend_init(bool numa) {
|
||||||
ggml_free(ctx);
|
ggml_free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (numa) {
|
if (numa > 0) {
|
||||||
ggml_numa_init();
|
ggml_numa_init(numa);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_MPI
|
#ifdef GGML_USE_MPI
|
||||||
|
|
11
llama.h
11
llama.h
|
@ -111,6 +111,15 @@ extern "C" {
|
||||||
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum llama_numa_strategies {
|
||||||
|
LLAMA_NUMA_STRATEGY_DISABLED = 0,
|
||||||
|
LLAMA_NUMA_STRATEGY_INTERLEAVE = 1,
|
||||||
|
LLAMA_NUMA_STRATEGY_ISOLATE = 2,
|
||||||
|
LLAMA_NUMA_STRATEGY_NUMACTL = 3,
|
||||||
|
LLAMA_NUMA_STRATEGY_MIRROR = 4,
|
||||||
|
LLAMA_NUMA_STRATEGY_MAX_VALUE = LLAMA_NUMA_STRATEGY_MIRROR,
|
||||||
|
};
|
||||||
|
|
||||||
enum llama_split_mode {
|
enum llama_split_mode {
|
||||||
LLAMA_SPLIT_NONE = 0, // single GPU
|
LLAMA_SPLIT_NONE = 0, // single GPU
|
||||||
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
|
||||||
|
@ -304,7 +313,7 @@ extern "C" {
|
||||||
// Initialize the llama + ggml backend
|
// Initialize the llama + ggml backend
|
||||||
// If numa is true, use NUMA optimizations
|
// If numa is non-zero (one of enum llama_numa_strategies), use NUMA optimizations with that strategy
|
||||||
// Call once at the start of the program
|
// Call once at the start of the program
|
||||||
LLAMA_API void llama_backend_init(bool numa);
|
LLAMA_API void llama_backend_init(uint32_t numa);
|
||||||
|
|
||||||
// Call once at the end of the program - currently only used for MPI
|
// Call once at the end of the program - currently only used for MPI
|
||||||
LLAMA_API void llama_backend_free(void);
|
LLAMA_API void llama_backend_free(void);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue