fairydreaming 2025-02-10 03:34:14 -06:00 committed by GitHub
commit 28b91417e5
3 changed files with 94 additions and 1 deletion

@@ -348,6 +348,7 @@ extern "C" {
    // CPU buffer types are always available
    GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
    GGML_API ggml_backend_buffer_type_t ggml_backend_numa_buffer_type(void);

#ifdef __cplusplus
}
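A minimal usage sketch (not part of this commit) of how the new buffer type can be consumed through the existing ggml allocation helpers. The `no_alloc` context pattern and `ggml_backend_alloc_ctx_tensors_from_buft()` are pre-existing ggml/ggml-alloc API; only `ggml_backend_numa_buffer_type()` comes from this change, and the tensor shapes are arbitrary placeholders:

// sketch.cpp - allocate KV-cache-like tensors from the NUMA-aware buffer type
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 2*ggml_tensor_overhead(), // metadata only; tensor data lives in the backend buffer
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    // KV-cache-like tensors; their data will be placed in the NUMA buffer below
    struct ggml_tensor * k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1024*1024);
    struct ggml_tensor * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1024*1024);
    (void) k; (void) v;

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_numa_buffer_type());

    // ... write k/v from the threads that will later read them, so the
    //     first-touch policy places each page on the reading thread's node ...

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return 0;
}

For the placement to matter, the tensors should be first written by the threads that will later read them; see the first-touch sketch after the implementation below.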

@@ -2000,3 +2000,84 @@ ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size)
    GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
    return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
}

// NUMA buffer interface - similar to CPU, but with pages allocated according to a NUMA first-touch policy

#include <sys/mman.h>

static void ggml_backend_numa_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    if (munmap((char *) buffer->context, buffer->size)) {
        GGML_LOG_WARN("warning: munmap failed: %s\n", strerror(errno));
    }
}

static void ggml_backend_numa_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    if (posix_madvise(buffer->context, buffer->size, POSIX_MADV_DONTNEED)) {
        GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_DONTNEED) failed: %s\n",
                strerror(errno));
    }
}

static const struct ggml_backend_buffer_i ggml_backend_numa_buffer_i = {
    /* .free_buffer   = */ ggml_backend_numa_buffer_free_buffer,
    /* .get_base      = */ ggml_backend_cpu_buffer_get_base,
    /* .init_tensor   = */ NULL, // no initialization required
    /* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
    /* .set_tensor    = */ ggml_backend_cpu_buffer_set_tensor,
    /* .get_tensor    = */ ggml_backend_cpu_buffer_get_tensor,
    /* .cpy_tensor    = */ ggml_backend_cpu_buffer_cpy_tensor,
    /* .clear         = */ ggml_backend_numa_buffer_clear,
    /* .reset         = */ NULL,
};

// NUMA buffer type - similar to CPU, but with pages allocated according to a NUMA first-touch policy

static const char * ggml_backend_numa_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "NUMA";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_numa_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    // anonymous mapping: physical pages are allocated lazily on first write,
    // which is what lets the NUMA first-touch policy decide their placement
    int flags = MAP_SHARED | MAP_ANONYMOUS;
    void * data = mmap(NULL, size, PROT_READ|PROT_WRITE, flags, -1, 0);
    if (data == MAP_FAILED) {
        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
    }
    if (posix_madvise(data, size, POSIX_MADV_RANDOM)) {
        GGML_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                strerror(errno));
    }

    return ggml_backend_buffer_init(buft, ggml_backend_numa_buffer_i, data, size);
}

static size_t ggml_backend_numa_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return TENSOR_ALIGNMENT;

    GGML_UNUSED(buft);
}

static bool ggml_backend_numa_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    return true;

    GGML_UNUSED(buft);
}

ggml_backend_buffer_type_t ggml_backend_numa_buffer_type(void) {
    static struct ggml_backend_buffer_type ggml_backend_numa_buffer_type = {
        /* .iface   = */ {
            /* .get_name       = */ ggml_backend_numa_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_numa_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_numa_buffer_type_get_alignment,
            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
            /* .is_host        = */ ggml_backend_numa_buffer_type_is_host,
        },
        /* .device  = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ NULL,
    };

    return &ggml_backend_numa_buffer_type;
}
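For context, an illustrative sketch (not part of the commit; it assumes Linux's default local-allocation NUMA policy and OpenMP) of why the anonymous mmap above gives NUMA-friendly placement: a page is physically allocated only on its first write, on the node of the CPU performing that write, so letting each worker thread do the first write to its own slice of the buffer keeps that slice node-local. The helper name `first_touch_partition` is invented for illustration:

// first_touch.cpp - spread an anonymous mapping across NUMA nodes by first touch
#include <omp.h>
#include <stddef.h>

static void first_touch_partition(char * data, size_t size, size_t page_size) {
    #pragma omp parallel
    {
        const size_t nth = (size_t) omp_get_num_threads();
        const size_t ith = (size_t) omp_get_thread_num();

        // split the buffer into per-thread chunks rounded to page boundaries,
        // so every page has exactly one "first toucher"
        const size_t pages = (size + page_size - 1)/page_size;
        const size_t begin = (pages* ith   /nth)*page_size;
        const size_t end   = (pages*(ith+1)/nth)*page_size;

        for (size_t off = begin; off < end && off < size; off += page_size) {
            data[off] = 0; // first write: the kernel places this page on the local node
        }
    }
}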

@@ -71,6 +71,13 @@ bool llama_kv_cache_init(
    cache.k_l.reserve(n_layer);
    cache.v_l.reserve(n_layer);

    auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
    auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
    bool is_numa = is_numa_fn();
    if (!offload && is_numa) {
        LLAMA_LOG_INFO("%s: NUMA usage detected, using NUMA-aware buffer for KV cache\n", __func__);
    }

    for (int i = 0; i < n_layer; i++) {
        const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
        const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
@@ -81,9 +88,13 @@ bool llama_kv_cache_init(
        if (offload) {
            auto * dev = model.dev_layer(i);
            buft = ggml_backend_dev_buffer_type(dev);
        } else {
            if (is_numa) {
                buft = ggml_backend_numa_buffer_type();
            } else {
                buft = ggml_backend_cpu_buffer_type();
            }
        }

        ggml_context * ctx = ctx_for_buft(buft);
        if (!ctx) {
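
A hedged sketch of the same detection-and-selection logic factored into a standalone helper, for callers outside llama_kv_cache_init. The "ggml_backend_cpu_is_numa" proc-address lookup is exactly what the hunk above relies on; the helper name `pick_host_buffer_type`, the function-pointer typedef, and the extra NULL check on the returned proc address are additions for illustration:

// numa_select.cpp - choose between the plain CPU buffer type and the NUMA-aware one
#include "ggml-backend.h"

typedef bool (*ggml_is_numa_fn_t)(void);

static ggml_backend_buffer_type_t pick_host_buffer_type(void) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);

    auto is_numa_fn = (ggml_is_numa_fn_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
    if (is_numa_fn && is_numa_fn()) {
        return ggml_backend_numa_buffer_type(); // NUMA system: let first touch place the pages
    }
    return ggml_backend_cpu_buffer_type();
}

Guarding against a NULL proc address keeps the helper usable with CPU backends that do not expose the NUMA query.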