From fa7620116eec31ebb41a3b025e50efb49db4d917 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 9 Jan 2024 03:14:16 +0100 Subject: [PATCH] opencl : add ggml-backend buffer type --- ggml-alloc.c | 1 + ggml-backend-impl.h | 14 +- ggml-backend.c | 31 ++-- ggml-backend.h | 8 +- ggml-cuda.cu | 52 ++----- ggml-metal.m | 9 +- ggml-opencl.cpp | 335 ++++++++++++++++++++++++++++++++++++++++++-- ggml-opencl.h | 16 ++- ggml.c | 2 +- ggml.h | 4 +- llama.cpp | 79 +++++------ 11 files changed, 419 insertions(+), 132 deletions(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 5e730aa9a..7836f064e 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -229,6 +229,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) { alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows } else { alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; + ggml_backend_buffer_reset(alloc->buffer); } } diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index ab6d96c68..859e923e2 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -19,7 +19,7 @@ extern "C" { const char * (*get_name) (ggml_backend_buffer_type_t buft); ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend // check if tensor data is in host memory // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) @@ -37,15 +37,15 @@ extern "C" { struct ggml_backend_buffer_i { const char * (*get_name) (ggml_backend_buffer_t buffer); void (*free_buffer) (ggml_backend_buffer_t buffer); - //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras void * (*get_base) (ggml_backend_buffer_t buffer); void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // (optional) copy tensor between different buffer-type, allow for single-copy tranfers - void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras }; struct ggml_backend_buffer { @@ -82,13 +82,13 @@ extern "C" { void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); // (optional) 
asynchroneous tensor copy - void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_from_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to_async) (ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); void (*synchronize)(ggml_backend_t backend); // compute graph with a plan - ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); diff --git a/ggml-backend.c b/ggml-backend.c index aa1ac5b10..25d9c98d5 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -103,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t } size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) { - return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer)); + return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); } size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor); + return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); } void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -115,17 +115,23 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { } bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { - return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer)); + return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer)); } void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { buffer->usage = usage; } -ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) { +ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) { return buffer->buft; } +void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) { + if (buffer->iface.reset) { + buffer->iface.reset(buffer); + } +} + // backend const char * ggml_backend_name(ggml_backend_t backend) { @@ -431,13 +437,13 @@ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, con GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); GGML_UNUSED(buffer); @@ -457,6 +463,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = { /* .cpy_tensor_from = */ 
ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; // for buffers from ptr, free is not called @@ -470,6 +477,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 @@ -554,7 +562,6 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_ return NULL; } - // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name; @@ -610,7 +617,7 @@ struct ggml_backend_plan_cpu { struct ggml_cgraph cgraph; }; -static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); @@ -1371,14 +1378,10 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr sched_reset(sched); } -void ggml_backend_sched_graph_split(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { +void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); sched_split_graph(sched, graph); -} - -void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - ggml_backend_sched_graph_split(sched, graph); sched_alloc_splits(sched); sched_compute_splits(sched); sched_reset(sched); @@ -1410,7 +1413,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); - //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized + //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); diff --git a/ggml-backend.h b/ggml-backend.h index 250a9760c..c4eff546a 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -40,7 +40,8 @@ extern "C" { GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); - GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type (ggml_backend_buffer_t buffer); + GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); // @@ -166,11 +167,6 @@ extern "C" { ggml_backend_sched_t sched, struct 
ggml_cgraph * graph); - // Split without computing - only useful to find the number of splits - GGML_API void ggml_backend_sched_graph_split( - ggml_backend_sched_t sched, - struct ggml_cgraph * graph); - // // Utils // diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 3d9473e3f..61c5f67bc 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9487,7 +9487,7 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t CUDA_CHECK(cudaDeviceSynchronize()); } -static struct ggml_backend_buffer_i ggml_cuda_backend_buffer_interface = { +static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { /* .get_name = */ ggml_backend_cuda_buffer_get_name, /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, @@ -9497,6 +9497,7 @@ static struct ggml_backend_buffer_i ggml_cuda_backend_buffer_interface = { /* .cpy_tensor_from = */ NULL, /* .cpy_tensor_to = */ NULL, /* .clear = */ ggml_backend_cuda_buffer_clear, + /* .reset = */ NULL, }; // cuda buffer type @@ -9528,7 +9529,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(buft_ctx->device, dev_ptr); - return ggml_backend_buffer_init(buft, ggml_cuda_backend_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -9537,7 +9538,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_ty UNUSED(buft); } -static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { +static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { int64_t row_low = 0; int64_t row_high = ggml_nrows(tensor); int64_t nrows_split = row_high - row_low; @@ -9574,7 +9575,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, - /* .is_host = */ nullptr, + /* .is_host = */ NULL, }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { @@ -9583,7 +9584,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { return nullptr; } - static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; + static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; static bool ggml_backend_cuda_buffer_type_initialized = false; @@ -9759,7 +9760,7 @@ static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, u UNUSED(value); } -static struct ggml_backend_buffer_i ggml_cuda_backend_split_buffer_interface = { +static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { /* .get_name = */ ggml_backend_cuda_split_buffer_get_name, /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_split_buffer_get_base, @@ -9769,6 +9770,7 @@ static struct ggml_backend_buffer_i ggml_cuda_backend_split_buffer_interface = { /* .cpy_tensor_from = */ NULL, /* .cpy_tensor_to = */ NULL, /* .clear = */ ggml_backend_cuda_split_buffer_clear, + /* .reset = */ NULL, }; // cuda split buffer type @@ -9786,7 +9788,7 @@ static 
ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); - return ggml_backend_buffer_init(buft, ggml_cuda_backend_split_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); } static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -9795,7 +9797,7 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buf UNUSED(buft); } -static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { +static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; size_t total_size = 0; @@ -9903,7 +9905,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); } - // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; buffer->iface.get_name = ggml_backend_cuda_host_buffer_name; @@ -9975,29 +9976,6 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { UNUSED(backend); } -static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { - GGML_ASSERT(!"not implemented"); - - return nullptr; - - UNUSED(backend); - UNUSED(cgraph); -} - -static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); - - UNUSED(backend); - UNUSED(plan); -} - -static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); - - UNUSED(backend); - UNUSED(plan); -} - static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; @@ -10149,7 +10127,7 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten UNUSED(backend); } -static ggml_backend_i cuda_backend_i = { +static ggml_backend_i ggml_backend_cuda_interface = { /* .get_name = */ ggml_backend_cuda_name, /* .free = */ ggml_backend_cuda_free, /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, @@ -10158,9 +10136,9 @@ static ggml_backend_i cuda_backend_i = { /* .cpy_tensor_from_async = */ NULL, /* .cpy_tensor_to_async = */ NULL, /* .synchronize = */ ggml_backend_cuda_synchronize, - /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, - /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, - /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, }; @@ -10182,7 +10160,7 @@ ggml_backend_t ggml_backend_cuda_init(int device) { }; ggml_backend_t cuda_backend = new ggml_backend { - /* .interface = */ cuda_backend_i, + /* .interface = */ 
ggml_backend_cuda_interface, /* .context = */ ctx }; diff --git a/ggml-metal.m b/ggml-metal.m index 00b377a3e..5e530e280 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2481,13 +2481,13 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c UNUSED(buffer); } -static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); UNUSED(buffer); } -static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { +static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); UNUSED(buffer); @@ -2509,6 +2509,7 @@ static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from, /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to, /* .clear = */ ggml_backend_metal_buffer_clear, + /* .reset = */ NULL, }; // default buffer type @@ -2715,7 +2716,7 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct UNUSED(backend); } -static struct ggml_backend_i metal_backend_i = { +static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type, @@ -2741,7 +2742,7 @@ ggml_backend_t ggml_backend_metal_init(void) { ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); *metal_backend = (struct ggml_backend) { - /* .interface = */ metal_backend_i, + /* .interface = */ ggml_backend_metal_i, /* .context = */ ctx, }; diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 496f9cdca..cfa766eb1 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "ggml-opencl.h" +#include "ggml-backend-impl.h" #include #include @@ -10,7 +11,7 @@ #include #include -#define CL_TARGET_OPENCL_VERSION 110 +#define CL_TARGET_OPENCL_VERSION 120 #include #if defined(_MSC_VER) @@ -929,6 +930,11 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co } void ggml_cl_init(void) { + static bool initialized = false; + if (initialized) { + return; + } + cl_int err; struct cl_device; @@ -1483,8 +1489,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } else { d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); } - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? 
(cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); size_t x_offset = 0; @@ -1501,7 +1507,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + if (src1->backend == GGML_BACKEND_CPU) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + } CL_CHECK(clFinish(queue)); @@ -1522,8 +1530,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + if (dst->backend == GGML_BACKEND_CPU) { + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + } } } } @@ -1532,8 +1542,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr if (src0->backend != GGML_BACKEND_GPU) { ggml_cl_pool_free(d_X, x_size); } - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); + if (src1->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_Y, y_size); + } + if (dst->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_D, d_size); + } } static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { @@ -1598,6 +1612,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); } + // FIXME: convert on device + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // convert src1 to fp16 // TODO: use multiple threads @@ -1643,11 +1659,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host, then convert to float - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); - - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - ggml_fp16_to_fp32_row(tmp, d, d_ne); + if (dst->backend == GGML_BACKEND_CPU) { + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + ggml_fp16_to_fp32_row(tmp, d, d_ne); + } else { + // FIXME: convert dst to fp32 on device + } } } } @@ -1801,7 +1819,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) { const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; @@ -1895,3 +1913,292 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { tensor->extra = dst; GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); } + +// ggml-backend + +// buffer + +struct ggml_backend_opencl_buffer_context { + ~ggml_backend_opencl_buffer_context() { + if (buffer) { + clReleaseMemObject(buffer); + } + for (auto * sub_buffer : sub_buffers) { + clReleaseMemObject(sub_buffer); + } + } + + cl_mem buffer; + std::vector sub_buffers; +}; + +static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000; + +static const char * 
ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) { + return "OpenCL"; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + delete ctx; +} + +static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { + return cl_ptr_base; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (tensor->view_src != NULL && tensor->view_offs == 0) { + tensor->extra = tensor->view_src->extra; + } else { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)}; + cl_int err; + cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + ctx->sub_buffers.push_back(sub_buffer); + tensor->extra = sub_buffer; + } + tensor->backend = GGML_BACKEND_GPU; +} + +static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + cl_mem tensor_buffer = (cl_mem) tensor->extra; + CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + cl_mem tensor_buffer = (cl_mem) tensor->extra; + CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); +} + +static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + clReleaseMemObject(sub_buffer); + } + ctx->sub_buffers.clear(); +} + +static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = { + /* .get_name = */ ggml_backend_opencl_buffer_get_name, + /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer, + /* .get_base = */ ggml_backend_opencl_buffer_get_base, + /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor, + /* .cpy_tensor_from = */ NULL, + /* .cpy_tensor_to = */ NULL, + /* .clear = */ ggml_backend_opencl_buffer_clear, + /* .reset = */ ggml_backend_opencl_buffer_reset, +}; + +// buffer type + +static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) { + return "OpenCL"; + + GGML_UNUSED(buffer_type); +} + +static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) { + ggml_cl_init(); + + cl_int err; + cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err); + if (err != CL_SUCCESS) { + 
fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0); + return nullptr; + } + + ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}}; + + return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size); +} + +static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { + // FIXME: not thread safe, device may not be initialized yet + static cl_uint alignment = -1; + if (alignment == (cl_uint)-1) { + ggml_cl_init(); + clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL); + } + return alignment; + + GGML_UNUSED(buffer_type); +} + +static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) { + //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend + return ggml_backend_is_cpu(backend); + + GGML_UNUSED(buffer_type); +} + +static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = { + /* .get_name = */ ggml_backend_opencl_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, + /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend, + /* .is_host = */ NULL, +}; + + +ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() { + static ggml_backend_buffer_type buffer_type = { + /* .iface = */ ggml_backend_opencl_buffer_type_interface, + /* .context = */ nullptr, + }; + + return &buffer_type; +} + +#if 0 +// host buffer type + +static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "CL_Host"; + + GGML_UNUSED(buft); +} + +static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) { + return "CL_Host"; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_cl_host_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * ptr = ggml_cl_host_malloc(size); + + if (ptr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.get_name = ggml_backend_opencl_host_buffer_name; + buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_opencl_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_opencl_buffer_type_host; +} + +// backend + +static const char * ggml_backend_opencl_name(ggml_backend_t backend) { + return "OpenCL"; + + GGML_UNUSED(backend); +} + +static void 
ggml_backend_opencl_free(ggml_backend_t backend) { + GGML_UNUSED(backend); +} + +static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_opencl_buffer_type(); + + GGML_UNUSED(backend); +} + +static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { + for (int i = 0; i < graph->n_nodes; ++i) { + ggml_tensor * node = graph->nodes[i]; + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0); + break; + case GGML_OP_MUL: + ggml_cl_mul(node->src[0], node->src[1], node); + break; + default: + GGML_ASSERT(false); + } + } + + return true; + + GGML_UNUSED(backend); +} + +static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_can_mul_mat(op->src[0], op->src[1], op); + case GGML_OP_MUL: + // return ggml_can_repeat_rows(op->src[1], op->src[0]); + return true; + default: + return false; + } + + GGML_UNUSED(backend); +} + +static ggml_backend_i opencl_backend_i = { + /* .get_name = */ ggml_backend_opencl_name, + /* .free = */ ggml_backend_opencl_free, + /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_opencl_graph_compute, + /* .supports_op = */ ggml_backend_opencl_supports_op, +}; + +ggml_backend_t ggml_backend_opencl_init() { + ggml_backend_t backend = new ggml_backend { + /* .interface = */ opencl_backend_i, + /* .context = */ nullptr + }; + + return backend; +} + +bool ggml_backend_is_opencl(ggml_backend_t backend) { + return backend && backend->iface.get_name == ggml_backend_opencl_name; +} +#endif diff --git a/ggml-opencl.h b/ggml-opencl.h index 44d05bd64..919b00d63 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #ifdef __cplusplus extern "C" { @@ -9,17 +10,26 @@ extern "C" { GGML_API void ggml_cl_init(void); GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst); GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); -GGML_API void * ggml_cl_host_malloc(size_t size); -GGML_API void ggml_cl_host_free(void * ptr); +// GGML_API void * ggml_cl_host_malloc(size_t size); +// GGML_API void ggml_cl_host_free(void * ptr); GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor); GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); +// backend API + +// GGML_API ggml_backend_t ggml_backend_opencl_init(void); + +// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend); + +GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void); +// 
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void); + #ifdef __cplusplus } #endif diff --git a/ggml.c b/ggml.c index 90814b8e2..7dcc70a58 100644 --- a/ggml.c +++ b/ggml.c @@ -16601,7 +16601,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return GGML_EXIT_SUCCESS; } -struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { +struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; } diff --git a/ggml.h b/ggml.h index 163e2d2e5..98191b1a3 100644 --- a/ggml.h +++ b/ggml.h @@ -1852,8 +1852,8 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data diff --git a/llama.cpp b/llama.cpp index 35de91b61..3af2419d4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1204,8 +1204,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer ggml_backend_buffer_type_t buft = nullptr; #if defined(GGML_USE_CUBLAS) - // in some cases such as the KV cache, there is no benefit to using a host buffer, - // since the data is never copied to the GPU + // host buffers should only be used when data is expected to be copied to/from the GPU if (host_buffer) { buft = ggml_backend_cuda_host_buffer_type(); } @@ -1228,6 +1227,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { buft = ggml_backend_metal_buffer_type(); #elif defined(GGML_USE_CUBLAS) buft = ggml_backend_cuda_buffer_type(gpu); +#elif defined(GGML_USE_CLBLAST) + buft = ggml_backend_opencl_buffer_type(); #endif if (buft == nullptr) { @@ -1695,6 +1696,10 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); +#ifdef GGML_USE_CLBLAST + offload = false; +#endif + // count used buffer types std::map buft_layer_count; if (offload) { @@ -1702,9 +1707,10 @@ static bool llama_kv_cache_init( buft_layer_count[model.buft_layer[i].buft]++; } } else { - buft_layer_count[llama_default_buffer_type_cpu(false)] = n_layer; + buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer; } + // create a context for each buffer type std::map ctx_map; for (auto & it : buft_layer_count) { int n_layers = it.second; @@ -2413,26 +2419,12 @@ struct llama_model_loader { } void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) { - /* - // prefetch only CPU tensors - if (use_mmap) { - size_t size_pref = 0; // prefetch - - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - if (cur->backend == buffer_type_cpu) { - size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur); - size_pref = std::max(size_pref, tensor_end); - } - } - mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, 
ggml_is_numa())); - } - */ // prefetch the whole file - all the data is needed anyway if (use_mmap) { mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa())); } + // compute the total size of all tensors for progress reporting for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); size_data += ggml_nbytes(cur); @@ -3173,16 +3165,15 @@ static bool llm_load_tensors( model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; - size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors; - const int64_t n_layer = hparams.n_layer; const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0); - - // there is very little benefit to offloading the input layers, so always keep it on the CPU + // there is very little benefit to offloading the input layer, so always keep it on the CPU model.buft_input = llama_default_buffer_type_cpu(true); + model.buft_layer.resize(n_layer); - // cpu layers + + // assign cpu layers for (int64_t i = 0; i < i_gpu_start; ++i) { model.buft_layer[i] = llama_default_buffer_type_cpu(true); } @@ -3191,11 +3182,8 @@ static bool llm_load_tensors( if (split_mode == LLAMA_SPLIT_LAYER) { // calculate the split points int device_count = ggml_backend_cuda_get_device_count(); + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); float splits[GGML_CUDA_MAX_DEVICES]; - if (tensor_split != nullptr) { - std::copy(tensor_split, tensor_split + device_count, splits); - } - bool all_zero = tensor_split == nullptr || std::all_of(splits, splits + device_count, [](float x) { return x == 0.0f; }); if (all_zero) { // default split, by free memory for (int i = 0; i < device_count; ++i) { @@ -3204,7 +3192,11 @@ static bool llm_load_tensors( ggml_backend_cuda_get_device_memory(i, &total, &free); splits[i] = free; } + } else { + std::copy(tensor_split, tensor_split + device_count, splits); } + + // sum and normalize the splits to get the split points float split_sum = 0.0f; for (int i = 0; i < device_count; ++i) { split_sum += splits[i]; @@ -3214,15 +3206,15 @@ static bool llm_load_tensors( splits[i] /= split_sum; } - // assign GPU layers according to the splits to the devices + // assign the repeating layers to the devices according to the splits int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); for (int64_t i = i_gpu_start; i < n_layer; ++i) { int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits; model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); } - // output layer + // assign the output layer if (n_gpu_layers > n_layer) { - int layer_gpu = std::upper_bound(splits, splits + device_count, float(n_layer)/act_gpu_layers) - splits; + int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits; model.buft_output = llama_default_buffer_type_offload(layer_gpu); } else { model.buft_output = llama_default_buffer_type_cpu(true); @@ -3234,16 +3226,17 @@ static bool llm_load_tensors( if (split_mode == LLAMA_SPLIT_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); } else { + // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported split_buft = llama_default_buffer_type_offload(main_gpu); } - // repeating layers + // assign the repeating layers for (int64_t i = i_gpu_start; i < n_layer; ++i) { 
model.buft_layer[i] = { split_buft, llama_default_buffer_type_offload(main_gpu) }; } - // output layer + // assign the output layer if (n_gpu_layers > n_layer) { model.buft_output = { split_buft, @@ -3266,6 +3259,7 @@ static bool llm_load_tensors( } // create one context per buffer type + size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors; std::map ctx_map; for (auto & it : buft_layer_count) { struct ggml_init_params params = { @@ -3289,14 +3283,11 @@ static bool llm_load_tensors( const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); const int64_t n_embd_gqa = n_embd_v_gqa; - const int64_t n_layer = hparams.n_layer; const int64_t n_vocab = hparams.n_vocab; const int64_t n_ff = hparams.n_ff; GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - // FIXME: for metal, it may be better to put the input on the GPU context - however it may make no difference in practice, - // and it would increase the metal buffer size ggml_context * ctx_input = ctx_map.at(model.buft_input.buft); ggml_context * ctx_output = ctx_map.at(model.buft_output.buft); ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix); @@ -3617,8 +3608,8 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}); - layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); @@ -3736,8 +3727,7 @@ static bool llm_load_tensors( ml.init_mapping(true, use_mlock ? 
&model.mlock_mmap : nullptr); - // create backend buffers - + // create the backend buffers std::vector> ctx_bufs; for (auto & it : ctx_map) { @@ -3745,8 +3735,9 @@ static bool llm_load_tensors( ggml_context * ctx = it.second; ggml_backend_buffer_t buf = nullptr; - // only the region containing the tensors in the model is mapped to the backend buffer - // this is important for metal: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers + // only the mmap region containing the tensors in the model is mapped to the backend buffer + // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers + // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { size_t first, last; ml.get_mapping_range(&first, &last, ctx); @@ -3771,7 +3762,7 @@ static bool llm_load_tensors( throw std::runtime_error("failed to allocate buffer"); } // indicate that this buffer contains weights - // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are always scheduled to the backend that contains the weight + // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); model.bufs.push_back(buf); ctx_bufs.emplace_back(ctx, buf); @@ -3803,7 +3794,7 @@ static bool llm_load_tensors( } } - // load data + // load tensor data for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; ggml_backend_buffer_t buf = it.second; @@ -3849,7 +3840,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons return -2; } } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); + LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); return -1; }
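
Note on the new reset callback (illustration, not part of the applied diff): ggml_backend_buffer_i gains a reset hook and ggml_tallocr_reset now calls ggml_backend_buffer_reset, so a backend can drop per-tensor state created in init_tensor whenever the allocator is reused. A minimal sketch of such a hook for a hypothetical backend context follows; names prefixed my_ are illustrative, only the interface slot and ggml_backend_buffer_reset come from this patch.

    #include <cstdlib>
    #include <vector>
    #include "ggml-backend-impl.h"

    struct my_backend_buffer_context {
        std::vector<void *> extras; // per-tensor state created by init_tensor
    };

    static void my_backend_buffer_reset(ggml_backend_buffer_t buffer) {
        my_backend_buffer_context * ctx = (my_backend_buffer_context *) buffer->context;
        for (void * extra : ctx->extras) {
            free(extra);            // release whatever init_tensor allocated per tensor
        }
        ctx->extras.clear();        // the next allocator pass starts from a clean slate
    }

    // wired into the buffer interface the same way the OpenCL backend does it:
    //   /* .reset = */ my_backend_buffer_reset,
    // backends with no per-tensor state leave the slot NULL and
    // ggml_backend_buffer_reset() is a no-op for them.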
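
The OpenCL buffer never exposes host memory: get_base returns the fake base pointer cl_ptr_base (0x1000), ggml-alloc places tensors at offsets from that base, and init_tensor turns each offset back into a clCreateSubBuffer region stored in tensor->extra (views with view_offs == 0 simply reuse view_src->extra). The offset math reads as the following sketch, same logic as ggml_backend_opencl_buffer_init_tensor with error handling simplified:

    static cl_mem make_tensor_sub_buffer(cl_mem pool, const ggml_tensor * tensor) {
        // recover the byte offset that ggml-alloc encoded into tensor->data
        size_t offset = (size_t) ((char *) tensor->data - (char *) cl_ptr_base);

        cl_buffer_region region = { offset, ggml_nbytes(tensor) };

        cl_int err;
        cl_mem sub = clCreateSubBuffer(pool, CL_MEM_READ_WRITE,
                                       CL_BUFFER_CREATE_TYPE_REGION, &region, &err);

        // this sub-buffer is what set_tensor/get_tensor and the mul_mat paths use
        return err == CL_SUCCESS ? sub : nullptr;
    }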
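
A hedged usage sketch of the buffer type from the caller's side, using only APIs touched by this patch. Real code lets ggml-alloc choose aligned offsets (as llama.cpp does through its context/buffer maps); the manual placement at the buffer base below is for illustration only.

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-opencl.h"

    void upload_example(ggml_context * ctx, const float * host_data, int64_t n) {
        ggml_backend_buffer_type_t buft = ggml_backend_opencl_buffer_type();

        ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);

        ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, ggml_nbytes(t));

        // ggml-alloc would normally pick an aligned offset; use the base here
        t->buffer = buf;
        t->data   = ggml_backend_buffer_get_base(buf);
        ggml_backend_buffer_init_tensor(buf, t); // creates the cl sub-buffer in t->extra

        // goes through ggml_backend_opencl_buffer_set_tensor -> clEnqueueWriteBuffer
        ggml_backend_tensor_set(t, host_data, 0, ggml_nbytes(t));
    }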
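
For the reworked LLAMA_SPLIT_LAYER assignment in llm_load_tensors, a worked example of how the normalized cumulative splits map layers to devices, including the changed output-layer index. The split values below are hypothetical; the lookup itself mirrors the patched code.

    #include <algorithm>

    // splits[] holds normalized cumulative fractions, e.g. {0.25f, 1.0f} for a 25%/75% split
    static int layer_device(const float * splits, int device_count,
                            int64_t i, int64_t i_gpu_start, int act_gpu_layers) {
        return (int) (std::upper_bound(splits, splits + device_count,
                                       float(i - i_gpu_start) / act_gpu_layers) - splits);
    }

    // with splits = {0.25f, 1.0f}, 32 repeating layers and act_gpu_layers = 33:
    //   layers 0..8  -> device 0, layers 9..31 -> device 1
    //   the output layer now uses float(act_gpu_layers - 1)/act_gpu_layers = 32/33 -> device 1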