From c8bd5d8b65517ac3e2f5a1c25f0fcc7da8c5655a Mon Sep 17 00:00:00 2001
From: slaren
Date: Tue, 19 Dec 2023 23:47:34 +0100
Subject: [PATCH] add ggml_backend_buffer_is_host, used to avoid copies if
 possible when accessing tensor data

---
 ggml-backend-impl.h |  5 ++++-
 ggml-backend.c      | 37 ++++++++++++++++++++++++++++---------
 ggml-backend.h      |  2 ++
 ggml-cuda.cu        | 16 ++++++++--------
 ggml-metal.m        |  7 +++++++
 llama.cpp           | 28 ++++++++++++++++++----------
 6 files changed, 67 insertions(+), 28 deletions(-)
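Notes (kept out of the commit message; git am ignores text in this section):

The new query lets callers touch tensor data in place whenever the backing
buffer lives in host memory, and stage through a bounce buffer only when it
does not. A minimal caller-side sketch of the intended pattern, assuming a
tensor `cur` and a persistent scratch vector `buf_copy` (names illustrative):

    // fill an input tensor, copying only when its buffer is not host-accessible
    float * data;
    if (ggml_backend_buffer_is_host(cur->buffer)) {
        data = (float *) cur->data;        // write in place, no copy
    } else {
        buf_copy.resize(ggml_nbytes(cur));
        data = (float *) buf_copy.data();  // write to a host staging buffer
    }
    // ... populate data ...
    if (data != cur->data) {
        // upload the staged copy into the backend buffer
        ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
    }

This is the same pattern the llama.cpp hunks below apply to the model loader
and to the input tensors in llama_build_graph.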
diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
index f824b2c75..05859935a 100644
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -20,6 +20,9 @@ extern "C" {
         size_t          (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
         size_t          (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
         bool            (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool            (*is_host)         (ggml_backend_buffer_type_t buft);
     };

     struct ggml_backend_buffer_type {
@@ -79,7 +82,7 @@ extern "C" {
         void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
         void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

-        void (*synchronize)     (ggml_backend_t backend);
+        void (*synchronize)(ggml_backend_t backend);

         // compute graph with a plan
         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
diff --git a/ggml-backend.c b/ggml-backend.c
index 9ea9e5c1b..e6f69bb23 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
     return buft->iface.supports_backend(buft, backend);
 }

+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    if (buft->iface.is_host) {
+        return buft->iface.is_host(buft);
+    }
+    return false;
+}
+
 // backend buffer

 ggml_backend_buffer_t ggml_backend_buffer_init(
@@ -98,6 +105,10 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     buffer->iface.clear(buffer, value);
 }

+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
     return buffer->buft;
 }
@@ -464,6 +475,12 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
     GGML_UNUSED(buft);
 }

+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
 ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
@@ -471,6 +488,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
         },
         /* .context = */ NULL,
     };
@@ -479,9 +497,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
 }

 #ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
 #include <hbwmalloc.h>

-// HBM buffer type
 static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     hbw_free(buffer->context);
 }
@@ -503,16 +523,15 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_
     return buffer;
 }

-struct ggml_backend_buffer_type_i cpu_backend_hbm_buffer_type_interface = {
-    /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
-    /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
-    /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
-};
-
 ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
-        /* .iface = */ cpu_backend_hbm_buffer_type_interface,
+        /* .iface = */ {
+            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
         /* .context = */ NULL,
     };

diff --git a/ggml-backend.h b/ggml-backend.h
index b644feb16..a9d2fddd7 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -21,6 +21,7 @@ extern "C" {
     GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
     GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
     GGML_API bool   ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API bool   ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);

     // buffer
     GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
@@ -30,6 +31,7 @@ extern "C" {
     GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
     GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
     GGML_API void   ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool   ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
     GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

     //
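Note for backend implementers: is_host is an optional callback in
ggml_backend_buffer_type_i. Leaving it NULL makes ggml_backend_buft_is_host
return false, which is the safe default for device memory, so existing
buffer types keep working unchanged. A buffer type whose allocations are
directly CPU-addressable opts in by returning true; a hypothetical example
(the type name is illustrative, following the CPU implementation above):

    static bool my_host_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
        return true; // allocations from this type can be read/written by the CPU

        GGML_UNUSED(buft);
    }

As the comment in ggml-backend-impl.h states, the result should be
equivalent to supports_backend(buft, ggml_backend_cpu_init()).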
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index f69a8f5af..23bd073d2 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -9568,6 +9568,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+    /* .is_host          = */ nullptr,
 };

 ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
@@ -9606,16 +9607,15 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     return buffer;
 }

-struct ggml_backend_buffer_type_i ggml_backend_cuda_host_buffer_type_interface = {
-    /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-    /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-};
-
 ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
-        /* .iface = */ ggml_backend_cuda_host_buffer_type_interface,
+        /* .iface = */ {
+            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
         /* .context = */ nullptr,
     };

diff --git a/ggml-metal.m b/ggml-metal.m
index 3bbe64405..0916202a0 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -2521,6 +2521,12 @@ static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_
     UNUSED(buft);
 }

+static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    UNUSED(buft);
+}
+
 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
         /* .iface = */ {
@@ -2528,6 +2534,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
         },
         /* .context = */ NULL,
     };
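The backend wiring above follows directly from the callback's semantics:
CUDA device buffers live in VRAM, so their interface leaves is_host as
nullptr (reported as false); the CUDA host buffer type allocates pinned
system memory and reuses the CPU callbacks, so it reports true; Metal
buffers are allocated in memory the CPU can address, so the Metal type also
returns true. A sanity-check sketch, assuming a GGML_USE_CUBLAS build with
at least one device:

    assert( ggml_backend_buft_is_host(ggml_backend_cpu_buffer_type()));       // host memory
    assert( ggml_backend_buft_is_host(ggml_backend_cuda_host_buffer_type())); // pinned host memory
    assert(!ggml_backend_buft_is_host(ggml_backend_cuda_buffer_type(0)));     // device VRAM, is_host == nullptr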
diff --git a/llama.cpp b/llama.cpp
index 988ce8004..3ffc33b27 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1539,7 +1539,7 @@ static bool llama_kv_cache_init(
                 ggml_cuda_assign_buffers_no_scratch(v);
                 vram_kv_cache += ggml_nbytes(k);
                 vram_kv_cache += ggml_nbytes(v);
-                // HACK: mark tensor as allocated, but crash if we try to use it from the CPU
+                // HACK: mark tensor as allocated
                 k->data = v->data = (void *)(uintptr_t)1;
             }
         }
@@ -2285,9 +2285,15 @@ struct llama_model_loader {
                         ggml_backend_tensor_set(cur, (uint8_t *)mapping->addr + offs, 0, ggml_nbytes(cur));
                     }
                 } else {
-                    // FIXME: use read_buf for device buffers without unified memory
-                    file.seek(offs, SEEK_SET);
-                    file.read_raw(cur->data, ggml_nbytes(cur));
+                    if (ggml_backend_buffer_is_host(cur->buffer)) {
+                        file.seek(offs, SEEK_SET);
+                        file.read_raw(cur->data, ggml_nbytes(cur));
+                    } else {
+                        read_buf.resize(ggml_nbytes(cur));
+                        file.seek(offs, SEEK_SET);
+                        file.read_raw(read_buf.data(), ggml_nbytes(cur));
+                        ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
+                    }
                 }

                 if (use_mmap && lmlock) {
@@ -2298,7 +2304,7 @@
                 case GGML_BACKEND_GPU:
                 case GGML_BACKEND_GPU_SPLIT:
                     {
-                        // HACK: mark tensor as allocated, but crash if we try to use it from the CPU
+                        // HACK: mark tensor as allocated
                         cur->data = (void *)(uintptr_t)1;
                         void * data;
                         if (use_mmap) {
@@ -5773,7 +5779,7 @@ static struct ggml_cgraph * llama_build_graph(
             const int64_t n_tokens = cur->ne[1];

             float * data;
-            if (/*is_sys_mem_buf(cur->buffer)*/false) { // TODO
+            if (ggml_backend_buffer_is_host(cur->buffer)) {
                 data = (float *) cur->data;
             } else {
                 lctx.buf_copy.resize(ggml_nbytes(cur));
@@ -5812,7 +5818,7 @@ static struct ggml_cgraph * llama_build_graph(
             const int64_t n_ctx = cur->ne[0];

             int32_t * data;
-            if (/*is_sys_mem_buf(cur->buffer)*/false) { // TODO
+            if (ggml_backend_buffer_is_host(cur->buffer)) {
                 data = (int32_t *) cur->data;
             } else {
                 lctx.buf_copy.resize(ggml_nbytes(cur));
@@ -9230,13 +9236,15 @@ struct llama_context * llama_new_context_with_model(
         }
 #endif

-        if (ctx->backend == nullptr) {
-            // FIXME: this may fail if the model buffer is not compatible with the CPU backend
+        if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
             ctx->backend = ggml_backend_cpu_init();
+            if (ctx->backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+            }
         }

         if (ctx->backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+            LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
             delete ctx;
             return nullptr;
         }
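A quick way to exercise the new API end to end is a standalone smoke test
(hypothetical, not part of this patch; it assumes only the CPU backend and
the ggml_backend_buft_alloc_buffer helper declared in ggml-backend.h):

    // smoke test: CPU buffers must report as host-resident
    #include "ggml-backend.h"
    #include <cassert>

    int main() {
        ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
        assert(ggml_backend_buft_is_host(buft));   // CPU buffer type opts in

        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024);
        assert(ggml_backend_buffer_is_host(buf));  // delegates to the buffer's type

        ggml_backend_buffer_free(buf);
        return 0;
    }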