From 0808aa5a42c5154a21a411ea04522fcaf6804ce6 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 19 Dec 2023 03:23:00 +0100 Subject: [PATCH] add ggml-metal --- ggml-cuda.cu | 8 --- ggml-metal.h | 3 + ggml-metal.m | 180 +++++++++++++++++++++++++++++++++++++++++++-------- llama.cpp | 33 ++++++---- 4 files changed, 176 insertions(+), 48 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f11fc4e93..dafa9282a 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -9428,8 +9428,6 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g } static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; @@ -9441,8 +9439,6 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg } static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; @@ -9600,8 +9596,6 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); @@ -9611,8 +9605,6 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); diff --git a/ggml-metal.h b/ggml-metal.h index bf52d9cd3..b5e02b668 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); +GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); + GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); + GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); // helper to check if the device supports a specific family diff --git a/ggml-metal.m b/ggml-metal.m index 465679a6b..4944ac3cd 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -607,12 +607,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) { } // temporarily defined here for compatibility between ggml-backend and the old API -struct ggml_backend_metal_buffer_context { - void * data; + +struct ggml_backend_metal_buffer { + void * data; + size_t size; id metal; }; +struct ggml_backend_metal_buffer_context { + void * all_data; + size_t all_size; + bool owned; + + // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap + int n_buffers; + struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS]; +}; + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -622,17 +634,29 @@ static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru const int64_t tsize = ggml_nbytes(t); + ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; + // compatibility with ggml-backend - if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) { - struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context; + if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) { + struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context; - const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data; + // find the view that contains the tensor fully + for (int i = 0; i < buf_ctx->n_buffers; ++i) { + const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data; - GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size); + //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size); + if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) { + *offs = (size_t) ioffs; - *offs = (size_t) ioffs; + //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs); - return buf_ctx->metal; + return buf_ctx->buffers[i].metal; + } + } + + GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name); + + return nil; } // find the view that contains the tensor fully @@ -2361,6 +2385,7 @@ void ggml_metal_graph_compute( // backend interface +// default buffer static id g_backend_device = nil; static int g_backend_device_ref_count = 0; @@ -2388,34 +2413,33 @@ static void ggml_backend_metal_free_device(void) { static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; - return ctx->data; + return ctx->all_data; } static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; - [ctx->metal release]; + for (int i = 0; i < ctx->n_buffers; i++) { + [ctx->buffers[i].metal release]; + } ggml_backend_metal_free_device(); - free(ctx->data); + if (ctx->owned) { + free(ctx->all_data); + } + free(ctx); UNUSED(buffer); } static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - memcpy((char *)tensor->data + offset, data, size); UNUSED(buffer); } static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - memcpy(data, (const char *)tensor->data + offset, size); UNUSED(buffer); @@ -2433,7 +2457,7 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer UNUSED(buffer); } -static struct ggml_backend_buffer_i metal_backend_buffer_i = { +static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, /* .get_base = */ ggml_backend_metal_buffer_get_base, /* .init_tensor = */ NULL, @@ -2443,6 +2467,8 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = { /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to, }; +// default buffer type + static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); @@ -2453,13 +2479,30 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba size_aligned += (size_page - (size_aligned % size_page)); } - ctx->data = ggml_metal_host_malloc(size); - ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data + id device = ggml_backend_metal_get_device(); + + ctx->all_data = ggml_metal_host_malloc(size_aligned); + ctx->all_size = size_aligned; + ctx->owned = true; + ctx->n_buffers = 1; + + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size); + if (ctx->buffers[0].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, "default", size_aligned / 1024.0 / 1024.0); + free(ctx); + ggml_backend_metal_free_device(); + return NULL; + } + + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB\n", __func__, "default", size_aligned / 1024.0 / 1024.0); + + return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); } static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -2470,7 +2513,7 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend); - GGML_UNUSED(buft); + UNUSED(buft); } ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { @@ -2487,6 +2530,91 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { return &ggml_backend_buffer_type_metal; } +// buffer from ptr + +ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) { + struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); + + ctx->all_data = data; + ctx->all_size = size; + ctx->owned = false; + ctx->n_buffers = 0; + + const size_t size_page = sysconf(_SC_PAGESIZE); + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + + id device = ggml_backend_metal_get_device(); + + const char * name = "from_ptr"; + + // the buffer fits into the max buffer size allowed by the device + if (size_aligned <= device.maxBufferLength) { + ctx->buffers[ctx->n_buffers].name = name; + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; + + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + return false; + } + + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + + ++ctx->n_buffers; + } else { + // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into + // one of the views + const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case + const size_t size_step = device.maxBufferLength - size_ovlp; + const size_t size_view = device.maxBufferLength; + + for (size_t i = 0; i < size; i += size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].name = name; + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + + ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + return false; + } + + GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + if (i + size_step < size) { + GGML_METAL_LOG_INFO("\n"); + } + + ++ctx->n_buffers; + } + } + +#if TARGET_OS_OSX + GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", + device.currentAllocatedSize / 1024.0 / 1024.0, + device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + + if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) { + GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); + } else { + GGML_METAL_LOG_INFO("\n"); + } +#else + GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); +#endif + + return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); +} + +// backend + static const char * ggml_backend_metal_name(ggml_backend_t backend) { return "Metal"; @@ -2499,10 +2627,6 @@ static void ggml_backend_metal_free(ggml_backend_t backend) { free(backend); } -static void ggml_backend_metal_synchronize(ggml_backend_t backend) { - UNUSED(backend); -} - static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) { return ggml_backend_metal_buffer_type(); @@ -2529,8 +2653,8 @@ static struct ggml_backend_i metal_backend_i = { /* .get_tensor_async = */ NULL, /* .cpy_tensor_from_async = */ NULL, /* .cpy_tensor_to_async = */ NULL, - /* .synchronize = */ ggml_backend_metal_synchronize, - /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, diff --git a/llama.cpp b/llama.cpp index 0a360c92c..5cb23230a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2881,17 +2881,20 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { } // TODO: metal should be disabled with ngl=0 -> cpu_buffer_type -static ggml_backend_buffer_type_t llama_default_buffer_type() { +static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { #ifdef GGML_USE_METAL - return ggml_backend_metal_buffer_type(); + if (n_gpu_layers > 0) { + return ggml_backend_metal_buffer_type(); + } #elif GGML_USE_CUBLAS - printf("Using " GGML_CUDA_NAME " host buffer type\n"); return ggml_backend_cuda_host_buffer_type(); #elif GGML_USE_CPU_HBM return ggml_backend_cpu_hbm_buffer_type(); -#else - return ggml_backend_cpu_buffer_type(); #endif + + return ggml_backend_cpu_buffer_type(); + + GGML_UNUSED(n_gpu_layers); } static void llm_load_tensors( @@ -3430,7 +3433,7 @@ static void llm_load_tensors( size_t vram_weights = 0; size_t buf_size = 0; - ggml_backend_buffer_type_t buft = llama_default_buffer_type(); + ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers); for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { if (t->backend == GGML_BACKEND_CPU) { @@ -3440,13 +3443,14 @@ static void llm_load_tensors( } } - // create backend buffer bool sys_mem_buf = false; #ifdef GGML_USE_METAL +// todo: disable with 0 gpu layers if (ml.use_mmap) { - model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); + const size_t max_size = ggml_get_max_tensor_size(ctx); + model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size); } else { model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); sys_mem_buf = true; @@ -5940,12 +5944,12 @@ static int llama_decode_internal( if (ggml_backend_is_metal(lctx.backend)) { ggml_backend_metal_set_n_cb(lctx.backend, n_threads); } -#else +#endif + if (ggml_backend_is_cpu(lctx.backend)) { ggml_backend_cpu_set_n_threads(lctx.backend, n_threads); } ggml_backend_graph_compute(lctx.backend, gf); -#endif #if GGML_USE_MPI ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); @@ -8952,6 +8956,9 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_METAL if (model->n_gpu_layers > 0) { ctx->backend = ggml_backend_metal_init(); + if (ctx->backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__); + } } #endif if (ctx->backend == nullptr) { @@ -8959,12 +8966,14 @@ struct llama_context * llama_new_context_with_model( } if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize backend\n", __func__); + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); delete ctx; return nullptr; } - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv, llama_default_buffer_type())) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, + cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv, + llama_default_buffer_type(model->n_gpu_layers))) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr;