From b2c0973b4450ba8bf7ed31ad720cbff12cb0dd2b Mon Sep 17 00:00:00 2001
From: Kilty McGowan
Date: Mon, 12 Jun 2023 02:00:22 -0700
Subject: [PATCH] Workaround Metal maxBufferLength

---
 examples/metal/metal.cpp |  1 -
 ggml-metal.h             |  9 -----
 ggml-metal.m             | 73 ++++++++++++++++++----------------------
 llama.cpp                | 10 ------
 4 files changed, 32 insertions(+), 61 deletions(-)

diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp
index 77aca94a3..4e3476b84 100644
--- a/examples/metal/metal.cpp
+++ b/examples/metal/metal.cpp
@@ -70,7 +70,6 @@ int main(int argc, char ** argv) {
     // debug output
     {
         struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
-        ggml_metal_get_tensor(ctx_metal, logits);
 
         float * ptr = (float *) ggml_get_data(logits);
 
diff --git a/ggml-metal.h b/ggml-metal.h
index a9441a9d4..47d681866 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -13,9 +13,6 @@
 // are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
 // used during the graph evaluation to determine the arguments of the compute kernels.
 //
-// Synchronization between device and host memory (for example for input and output tensors)
-// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
-//
 
 #pragma once
 
@@ -48,12 +45,6 @@ bool ggml_metal_add_buffer(
                    void * data,
                    size_t size);
 
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
 // same as ggml_graph_compute but uses Metal
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
diff --git a/ggml-metal.m b/ggml-metal.m
index 16a362fd7..512f05e7d 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -223,56 +223,47 @@ bool ggml_metal_add_buffer(
         }
 
         size_t page_size = getpagesize();
-        size_t aligned_size = size;
-        if ((aligned_size % page_size) != 0) {
-            aligned_size += (page_size - (aligned_size % page_size));
+        size_t sys_max_buffer_size = 2ul * 1024ul * 1024ul * 1024ul; // ctx->device.maxBufferLength;
+
+        // Make sure total size is page-aligned
+        size_t total_aligned_size = size;
+        if ((total_aligned_size % page_size) != 0) {
+            total_aligned_size += (page_size - (total_aligned_size % page_size));
         }
 
-        ctx->buffers[ctx->n_buffers].name = name;
-        ctx->buffers[ctx->n_buffers].data = data;
-        ctx->buffers[ctx->n_buffers].size = size;
-
-        if (ctx->device.maxBufferLength < aligned_size) {
-            fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
-            return false;
-        }
-        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
-
-        if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
-            return false;
-        } else {
-            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+        // Make sure chunk size is page-aligned
+        size_t max_chunk_size = sys_max_buffer_size / 2;
+        if ((max_chunk_size % page_size) != 0) {
+            max_chunk_size += (page_size - (max_chunk_size % page_size));
         }
 
-        ++ctx->n_buffers;
+        size_t chunk_offset = 0;
+        while (total_aligned_size > 0) {
+            size_t chunk_logical_size = (max_chunk_size > total_aligned_size) ? total_aligned_size : max_chunk_size;
+            size_t sys_buffer_size = (sys_max_buffer_size > total_aligned_size) ? total_aligned_size : sys_max_buffer_size;
+            void *chunk = (uint8_t *) data + chunk_offset;
+            ctx->buffers[ctx->n_buffers].name = name;
+            ctx->buffers[ctx->n_buffers].data = chunk;
+            ctx->buffers[ctx->n_buffers].size = chunk_logical_size;
+            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:chunk length:sys_buffer_size options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name,
+                        sys_buffer_size / 1024.0 / 1024.0);
+                return false;
+            } else {
+                fprintf(stderr, "%s: allocated '%-16s' buffer, sys_size = %8.2f MB, size = %8.2f MB, max: %zu\n", __func__, name,
+                        sys_buffer_size / 1024.0 / 1024.0, chunk_logical_size / 1024.0 / 1024.0, sys_max_buffer_size);
+            }
+            ++ctx->n_buffers;
+            total_aligned_size -= chunk_logical_size;
+            chunk_offset += chunk_logical_size;
+        }
     }
 
     return true;
 }
 
-void ggml_metal_set_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
-
-    size_t offs;
-    id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
-}
-
-void ggml_metal_get_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
-
-    size_t offs;
-    id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
-}
-
 void ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {
diff --git a/llama.cpp b/llama.cpp
index e100e2bc9..769c106ef 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1588,7 +1588,6 @@ static bool llama_eval_internal(
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
-        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1597,15 +1596,6 @@ static bool llama_eval_internal(
         //
         // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
         // But for now, we have focused only on Matrix x Vector Metal multiplication.
-        //
-        // TODO: avoid these syncs via shared memory (ref #1696)
-        //
-        if (lctx.ctx_metal) {
-            // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
-        }
-
         ggml_graph_compute(ctx0, &gf);
     }
 #else
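
Note (commentary, not part of the patch): the core idea above is to map one large host allocation as several overlapping, page-aligned Metal buffers, because newBufferWithBytesNoCopy is capped at the device's maxBufferLength. Each chunk advances by half the limit but maps up to the full limit, so any tensor smaller than half the limit that starts inside a chunk's logical range still lies entirely within that chunk's mapped region. Below is a minimal standalone C sketch of that chunk-splitting arithmetic; the 2 GiB limit (the patch hard-codes it where it would otherwise read ctx->device.maxBufferLength), the 7 GiB model size, and the align_up helper are illustrative stand-ins, not code from the patch.

#include <stdio.h>
#include <unistd.h>

// Round n up to the next multiple of align (illustrative helper).
static size_t align_up(size_t n, size_t align) {
    size_t rem = n % align;
    return rem == 0 ? n : n + (align - rem);
}

int main(void) {
    const size_t page_size      = (size_t) getpagesize();
    const size_t sys_max_buffer = 2ul * 1024ul * 1024ul * 1024ul; // stand-in for ctx->device.maxBufferLength
    const size_t max_chunk_size = align_up(sys_max_buffer / 2, page_size);

    // Hypothetical 7 GiB model mapping, page-aligned like total_aligned_size in the patch.
    size_t remaining = align_up(7ul * 1024ul * 1024ul * 1024ul, page_size);
    size_t offset    = 0;

    while (remaining > 0) {
        // Logical size: the slice of host memory this chunk "owns" (patch: chunk_logical_size).
        const size_t logical = remaining < max_chunk_size ? remaining : max_chunk_size;
        // Mapped size: the Metal buffer extends past the logical slice, up to the
        // device limit (patch: sys_buffer_size), so tensors can straddle chunk boundaries.
        const size_t mapped  = remaining < sys_max_buffer ? remaining : sys_max_buffer;

        printf("chunk at offset %11zu: logical %11zu, mapped %11zu\n", offset, logical, mapped);

        offset    += logical;
        remaining -= logical;
    }
    return 0;
}

The halving is presumably what guarantees the overlap: if chunks advanced by the full device limit, a tensor crossing a chunk boundary would have no single Metal buffer that contains it.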