From b2c0973b4450ba8bf7ed31ad720cbff12cb0dd2b Mon Sep 17 00:00:00 2001
From: Kilty McGowan
Date: Mon, 12 Jun 2023 02:00:22 -0700
Subject: [PATCH] Workaround Metal maxBufferLength

---
 examples/metal/metal.cpp |  1 -
 ggml-metal.h             |  9 -----
 ggml-metal.m             | 73 ++++++++++++++++++----------------------
 llama.cpp                | 10 ------
 4 files changed, 32 insertions(+), 61 deletions(-)

diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp
index 77aca94a3..4e3476b84 100644
--- a/examples/metal/metal.cpp
+++ b/examples/metal/metal.cpp
@@ -70,7 +70,6 @@ int main(int argc, char ** argv) {
     // debug output
     {
         struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
-        ggml_metal_get_tensor(ctx_metal, logits);
 
         float * ptr = (float *) ggml_get_data(logits);
 
diff --git a/ggml-metal.h b/ggml-metal.h
index a9441a9d4..47d681866 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -13,9 +13,6 @@
 // are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
 // used during the graph evaluation to determine the arguments of the compute kernels.
 //
-// Synchronization between device and host memory (for example for input and output tensors)
-// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
-//
 
 #pragma once
 
@@ -48,12 +45,6 @@ bool ggml_metal_add_buffer(
                    void * data,
                    size_t size);
 
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
 // same as ggml_graph_compute but uses Metal
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 
diff --git a/ggml-metal.m b/ggml-metal.m
index 16a362fd7..512f05e7d 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -223,56 +223,47 @@ bool ggml_metal_add_buffer(
         }
 
         size_t page_size = getpagesize();
-        size_t aligned_size = size;
-        if ((aligned_size % page_size) != 0) {
-            aligned_size += (page_size - (aligned_size % page_size));
+        size_t sys_max_buffer_size = 2ul * 1024ul * 1024ul * 1024ul; // ctx->device.maxBufferLength;
+
+        // Make sure total size is page-aligned
+        size_t total_aligned_size = size;
+        if ((total_aligned_size % page_size) != 0) {
+            total_aligned_size += (page_size - (total_aligned_size % page_size));
         }
 
-        ctx->buffers[ctx->n_buffers].name = name;
-        ctx->buffers[ctx->n_buffers].data = data;
-        ctx->buffers[ctx->n_buffers].size = size;
-
-        if (ctx->device.maxBufferLength < aligned_size) {
-            fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength);
-            return false;
-        }
-        ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil];
-
-        if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
-            return false;
-        } else {
-            fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0);
+        // Make sure chunk size is page-aligned
+        size_t max_chunk_size = sys_max_buffer_size / 2;
+        if ((max_chunk_size % page_size) != 0) {
+            max_chunk_size += (page_size - (max_chunk_size % page_size));
         }
 
-        ++ctx->n_buffers;
+        size_t chunk_offset = 0;
+        while (total_aligned_size > 0) {
+            size_t chunk_logical_size = (max_chunk_size > total_aligned_size) ? total_aligned_size : max_chunk_size;
+            size_t sys_buffer_size = (sys_max_buffer_size > total_aligned_size) ? total_aligned_size : sys_max_buffer_size;
+            void *chunk = (uint8_t *) data + chunk_offset;
+            ctx->buffers[ctx->n_buffers].name = name;
+            ctx->buffers[ctx->n_buffers].data = chunk;
+            ctx->buffers[ctx->n_buffers].size = chunk_logical_size;
+            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:chunk length:sys_buffer_size options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name,
+                        sys_buffer_size / 1024.0 / 1024.0);
+                return false;
+            } else {
+                fprintf(stderr, "%s: allocated '%-16s' buffer, sys_size = %8.2f MB, size = %8.2f MB, max: %zu\n", __func__, name,
+                        sys_buffer_size / 1024.0 / 1024.0, chunk_logical_size / 1024.0 / 1024.0, sys_max_buffer_size);
+            }
+            ++ctx->n_buffers;
+            total_aligned_size -= chunk_logical_size;
+            chunk_offset += chunk_logical_size;
+        }
     }
 
     return true;
 }
 
-void ggml_metal_set_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    metal_printf("%s: set input for tensor '%s'\n", __func__, t->name);
-
-    size_t offs;
-    id<MTLBuffer> id_dst = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
-}
-
-void ggml_metal_get_tensor(
-        struct ggml_metal_context * ctx,
-        struct ggml_tensor * t) {
-    metal_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
-
-    size_t offs;
-    id<MTLBuffer> id_src = ggml_metal_get_buffer(ctx, t, &offs);
-
-    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
-}
-
 void ggml_metal_graph_compute(
         struct ggml_metal_context * ctx,
                struct ggml_cgraph * gf) {
diff --git a/llama.cpp b/llama.cpp
index e100e2bc9..769c106ef 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1588,7 +1588,6 @@ static bool llama_eval_internal(
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
-        ggml_metal_get_tensor   (lctx.ctx_metal, cur);
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1597,15 +1596,6 @@ static bool llama_eval_internal(
         //
         // When we implement Matrix x Matrix Metal multiplication, we can avoid this branch.
         // But for now, we have focused only on Matrix x Vector Metal multiplication.
-        //
-        // TODO: avoid these syncs via shared memory (ref #1696)
-        //
-        if (lctx.ctx_metal) {
-            // We need to sync the GPU KV cache with the CPU KV cache
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
-            ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
-        }
-
         ggml_graph_compute(ctx0, &gf);
     }
 #else
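
Note (commentary, not part of the patch): the core idea above is to map one large host allocation as several overlapping, page-aligned Metal buffers, because newBufferWithBytesNoCopy is capped at the device's maxBufferLength. Each chunk advances by half the limit but maps up to the full limit, so any tensor smaller than half the limit that starts inside a chunk's logical range still lies entirely within that chunk's mapped region. Below is a minimal standalone C sketch of that chunk-splitting arithmetic; the 2 GiB limit (the patch hard-codes it where it would otherwise read ctx->device.maxBufferLength), the 7 GiB model size, and the align_up helper are illustrative stand-ins, not code from the patch.

#include <stdio.h>
#include <unistd.h>

// Round n up to the next multiple of align (illustrative helper).
static size_t align_up(size_t n, size_t align) {
    size_t rem = n % align;
    return rem == 0 ? n : n + (align - rem);
}

int main(void) {
    const size_t page_size      = (size_t) getpagesize();
    const size_t sys_max_buffer = 2ul * 1024ul * 1024ul * 1024ul; // stand-in for ctx->device.maxBufferLength
    const size_t max_chunk_size = align_up(sys_max_buffer / 2, page_size);

    // Hypothetical 7 GiB model mapping, page-aligned like total_aligned_size in the patch.
    size_t remaining = align_up(7ul * 1024ul * 1024ul * 1024ul, page_size);
    size_t offset    = 0;

    while (remaining > 0) {
        // Logical size: the slice of host memory this chunk "owns" (patch: chunk_logical_size).
        const size_t logical = remaining < max_chunk_size ? remaining : max_chunk_size;
        // Mapped size: the Metal buffer extends past the logical slice, up to the
        // device limit (patch: sys_buffer_size), so tensors can straddle chunk boundaries.
        const size_t mapped  = remaining < sys_max_buffer ? remaining : sys_max_buffer;

        printf("chunk at offset %11zu: logical %11zu, mapped %11zu\n", offset, logical, mapped);

        offset    += logical;
        remaining -= logical;
    }
    return 0;
}

The halving is presumably what guarantees the overlap: if chunks advanced by the full device limit, a tensor crossing a chunk boundary would have no single Metal buffer that contains it.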