diff --git a/examples/mtl/mtl.cpp b/examples/mtl/mtl.cpp
index 382976667..e527f2856 100644
--- a/examples/mtl/mtl.cpp
+++ b/examples/mtl/mtl.cpp
@@ -51,23 +51,22 @@ int main(int argc, char ** argv) {
     }
 
     // this allocates all Metal resources and memory buffers
-    auto * ctx_mtl = ggml_mtl_init(
-        ggml_get_mem_buffer(ctx_data),
-        ggml_get_mem_size  (ctx_data),
-        ggml_get_mem_buffer(ctx_eval),
-        ggml_get_mem_size  (ctx_eval),
-        NULL, 0, // cache
-        32*n_vocab*sizeof(float));
+    auto * ctx_mtl = ggml_mtl_init();
+
+    ggml_mtl_add_buffer(ctx_mtl, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data));
+    ggml_mtl_add_buffer(ctx_mtl, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval));
 
     // TODO: tmp to match the input used when creating the cgraph
     {
-        const int n_batch = 1;
-        const int n_past  = 512 - n_batch;
+        const std::vector<int> tmp(1, 1); // BOS
 
-        const std::vector<int> tmp(n_batch, 1); // BOS
+        struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "embd");
+        memcpy(input->data, tmp.data(), tmp.size() * sizeof(int));
+
+        ggml_mtl_set_tensor(ctx_mtl, input);
 
         // warmup
-        ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+        ggml_mtl_graph_compute(ctx_mtl, &gf);
 
         const int n_iter = 16;
 
@@ -75,7 +74,7 @@ int main(int argc, char ** argv) {
 
         // the actual inference happens here
         for (int i = 0; i < n_iter; ++i) {
-            ggml_mtl_graph_compute(ctx_mtl, &gf, tmp.data(), tmp.size(), n_past);
+            ggml_mtl_graph_compute(ctx_mtl, &gf);
         }
 
         const int64_t t1 = ggml_time_us();
@@ -83,6 +82,31 @@
         printf("time: %.2f ms, %.2f ms/tok\n", (t1 - t0) / 1000.0, (t1 - t0) / 1000.0 / n_iter);
     }
 
+    // debug output
+    {
+        struct ggml_tensor * logits = gf.nodes[gf.n_nodes - 1];
+        ggml_mtl_get_tensor(ctx_mtl, logits);
+
+        float * ptr = (float *) ggml_get_data(logits);
+
+        printf("logits: ");
+        for (int i = 0; i < 10; i++) {
+            printf("%8.4f ", ptr[i]);
+        }
+        printf("\n");
+        int imax = 0;
+        double sum = 0.0;
+        double vmax = -1e9;
+        for (int i = 0; i < 32000; i++) {
+            sum += (double) ptr[i];
+            if (ptr[i] > vmax) {
+                vmax = ptr[i];
+                imax = i;
+            }
+        }
+        printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
+    }
+
     ggml_mtl_free(ctx_mtl);
 
     ggml_free(ctx_data);
diff --git a/ggml-mtl.h b/ggml-mtl.h
index 15256b27d..cab71e386 100644
--- a/ggml-mtl.h
+++ b/ggml-mtl.h
@@ -2,7 +2,9 @@
 
 #include <stddef.h>
 
-struct ggml_context;
+#define GGML_METAL_MAX_BUFFERS 16
+
+struct ggml_tensor;
 struct ggml_cgraph;
 
 #ifdef __cplusplus
@@ -11,24 +13,30 @@ extern "C" {
 
 struct ggml_mtl_context;
 
-struct ggml_mtl_context * ggml_mtl_init(
-        void * data_buf,
-        size_t data_size,
-        void * eval_buf,
-        size_t eval_size,
-        void * cach_buf,
-        size_t cach_size,
-        size_t outp_size);
+struct ggml_mtl_context * ggml_mtl_init(void);
 
 void ggml_mtl_free(struct ggml_mtl_context * ctx);
 
+void ggml_mtl_add_buffer(
+        struct ggml_mtl_context * ctx,
+        const char * name,
+        void * data,
+        size_t size);
+
+// set data from host memory into the device
+void ggml_mtl_set_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t);
+
+// get data from the device into host memory
+void ggml_mtl_get_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t);
+
 // return 0 on success
 int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
-        struct ggml_cgraph * gf,
-        const int * tokens,
-        int n_tokens,
-        int n_past);
+        struct ggml_cgraph * gf);
 
 #ifdef __cplusplus
 }
diff --git a/ggml-mtl.m b/ggml-mtl.m
index ecbb1a188..8f831afe7 100644
--- a/ggml-mtl.m
+++ b/ggml-mtl.m
@@ -13,26 +13,24 @@
 #endif
 
 //#define mtl_printf(...)
 
-struct ggml_mtl_context {
-    void * data_buf;
-    size_t data_size;
-    void * eval_buf;
-    size_t eval_size;
-    void * cach_buf;
-    size_t cach_size;
-    size_t outp_size;
+struct ggml_mtl_buffer {
+    const char * name;
+    void * data;
+    size_t size;
+
+    id<MTLBuffer> mtl;
+};
+
+struct ggml_mtl_context {
     float * logits;
 
     id<MTLDevice>       device;
     id<MTLCommandQueue> queue;
     id<MTLLibrary>      library;
 
-    id<MTLBuffer> buffer_data;
-    id<MTLBuffer> buffer_eval;
-    id<MTLBuffer> buffer_cach;
-
-    id<MTLBuffer> out;
+    int n_buffers;
+    struct ggml_mtl_buffer buffers[GGML_METAL_MAX_BUFFERS];
 
     // custom kernels
     id<MTLFunction> function_add;
@@ -87,25 +85,11 @@ struct ggml_mtl_context {
 // for now it is easier to work in a separate file
 NSString * const msl_library_source = @"see mtl.metal";
 
-struct ggml_mtl_context * ggml_mtl_init(
-        void * data_buf,
-        size_t data_size,
-        void * eval_buf,
-        size_t eval_size,
-        void * cach_buf,
-        size_t cach_size,
-        size_t outp_size) {
+struct ggml_mtl_context * ggml_mtl_init(void) {
     fprintf(stderr, "%s: allocating\n", __func__);
 
     struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context));
 
-    ctx->data_buf  = data_buf;
-    ctx->data_size = data_size;
-    ctx->eval_buf  = eval_buf;
-    ctx->eval_size = eval_size;
-    ctx->cach_buf  = cach_buf;
-    ctx->cach_size = cach_size;
-
     ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue  = [ctx->device newCommandQueue];
@@ -216,51 +200,6 @@ struct ggml_mtl_context * ggml_mtl_init(
         fprintf(stderr, "%s: loaded kernel_cpy_f32_f32: %p\n", __func__, (void *) ctx->pipeline_cpy_f32_f32);
     }
 
-    // MTLBuffer approach
-
-    // pin ctx_data memory to GPU
-    // use MTLStorageModeShared to allow us to initialize the weights from the CPU
-    // TODO: how to use MTLStorageModeManaged?
-    // TODO: see if we can avoid this copy somehow
-    {
-        const void * mem_buffer = data_buf;
-        const size_t mem_size   = data_size;
-
-        //ctx->buffer_data = [ctx->device newBufferWithBytesNoCopy:mem_buffer length:mem_size options:MTLResourceStorageModeShared deallocator:nil];
-        ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated data buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
-    // pin ctx_eval memory to GPU
-    // this buffer will be used for the intermediate results of the evaluation
-    {
-        const void * mem_buffer = eval_buf;
-        const size_t mem_size   = eval_size;
-
-        ctx->buffer_eval = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated eval buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
-    if (cach_buf) {
-        const void * mem_buffer = cach_buf;
-        const size_t mem_size   = cach_size;
-
-        ctx->buffer_cach = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated cach buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
-    // allocate buffer for result extraction
-    {
-        const size_t mem_size = outp_size;
-
-        ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared];
-
-        fprintf(stderr, "%s: allocated out buffer, size = %8.2f MB\n", __func__, mem_size / 1024.0 / 1024.0);
-    }
-
     return ctx;
 }
@@ -271,81 +210,80 @@ void ggml_mtl_free(struct ggml_mtl_context * ctx) {
 }
 
 // get data / eval buffer + offset
-id<MTLBuffer> ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
-    const int64_t offs_data = (int64_t) t->data - (int64_t) ctx->data_buf;
-    const int64_t offs_eval = (int64_t) t->data - (int64_t) ctx->eval_buf;
-    const int64_t offs_cach = (int64_t) t->data - (int64_t) ctx->cach_buf;
-
+static id<MTLBuffer> ggml_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
     //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
 
-    //const size_t t_size = ggml_nbytes(t);
+    for (int i = 0; i < ctx->n_buffers; ++i) {
+        const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
 
-    id<MTLBuffer> result;
-    size_t t_offs = 0;
+        if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) {
+            *offs = (size_t) ioffs;
 
-    if ( offs_data > 0 &&
-        (offs_eval < 0 || (offs_data < offs_eval)) &&
-        (offs_cach < 0 || (offs_data < offs_cach))
-        ) {
-        result = ctx->buffer_data;
-        t_offs = offs_data;
-        //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
+            //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
+
+            return ctx->buffers[i].mtl;
+        }
     }
 
-    if ( offs_eval > 0 &&
-        (offs_data < 0 || (offs_eval < offs_data)) &&
-        (offs_cach < 0 || (offs_eval < offs_cach))
-        ) {
-        result = ctx->buffer_eval;
-        t_offs = offs_eval;
-        //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
+    fprintf(stderr, "%s: error: buffer is nil\n", __func__);
+    GGML_ASSERT(false);
+
+    return nil;
+}
+
+void ggml_mtl_add_buffer(
+        struct ggml_mtl_context * ctx,
+        const char * name,
+        void * data,
+        size_t size) {
+    if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
+        fprintf(stderr, "%s: too many buffers\n", __func__);
+        return;
     }
 
-    if ( offs_cach > 0 &&
-        (offs_data < 0 || (offs_cach < offs_data)) &&
-        (offs_eval < 0 || (offs_cach < offs_eval))
-        ) {
-        result = ctx->buffer_cach;
-        t_offs = offs_cach;
-        //fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
-    }
+    if (data) {
+        ctx->buffers[ctx->n_buffers].name = name;
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+        ctx->buffers[ctx->n_buffers].mtl  = [ctx->device newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
 
-    if (result == nil || (t_offs > ctx->data_size && t_offs > ctx->eval_size && t_offs > ctx->cach_size)) {
-        fprintf(stderr, "%s: error: buffer is nil\n", __func__);
-        GGML_ASSERT(false);
-    }
+        ++ctx->n_buffers;
 
-    if (offs != 0) {
-        *offs = t_offs;
+        fprintf(stderr, "%s: allocated '%16s' buffer, size = %8.2f MB\n", __func__, name, size / 1024.0 / 1024.0);
     }
+}
 
-    return result;
+void ggml_mtl_set_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t) {
+    mtl_printf("%s: set input for tensor '%s'\n", __func__, t->name);
+
+    size_t offs;
+    id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, t, &offs);
+
+    memcpy((void *) ((uint8_t *) id_dst.contents + offs), t->data, ggml_nbytes(t));
+}
+
+void ggml_mtl_get_tensor(
+        struct ggml_mtl_context * ctx,
+        struct ggml_tensor * t) {
+    mtl_printf("%s: extract results for tensor '%s'\n", __func__, t->name);
+
+    size_t offs;
+    id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, t, &offs);
+
+    memcpy(t->data, (void *) ((uint8_t *) id_src.contents + offs), ggml_nbytes(t));
 }
 
 int ggml_mtl_graph_compute(
         struct ggml_mtl_context * ctx,
-        struct ggml_cgraph * gf,
-        const int * tokens,
-        int n_tokens,
-        int n_past) {
-    mtl_printf("%s: evaluating, n_tokens = %d, n_past = %d\n", __func__, n_tokens, n_past);
-
-    struct ggml_tensor * input = ggml_graph_get_tensor(gf, "embd");
-    memcpy(input->data, tokens, n_tokens * sizeof(int));
+        struct ggml_cgraph * gf) {
+    mtl_printf("%s: evaluating graph\n", __func__);
 
     size_t offs_src0 = 0;
     size_t offs_src1 = 0;
     size_t offs_dst  = 0;
 
-    // copy the input data to the GPU
-    {
-        struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
-
-        id<MTLBuffer> id_dst = ggml_mtl_get_buffer(ctx, embd, &offs_src0);
-
-        memcpy((char *) id_dst.contents + offs_src0, embd->data, ggml_nbytes(embd));
-    }
-
     id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
     id<MTLComputeCommandEncoder> encoder = nil;
@@ -521,6 +459,8 @@ int ggml_mtl_graph_compute(
                             encoder = [command_buffer computeCommandEncoder];
                         }
 
+                        const int n_past = ((int32_t *)(src1->data))[0];
+
                         [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf];
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                         [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -690,6 +630,8 @@ int ggml_mtl_graph_compute(
                         //mtl_printf("rope: %lld x %lld x %lld x %lld\n", ne0, ne1, ne2, ne3);
                         //mtl_printf("rope: n_past = %d, n_dims = %d, mode = %d\n", n_past, n_dims, mode);
 
+                        const int n_past = ((int32_t *)(src1->data))[0];
+
                         [encoder setComputePipelineState:ctx->pipeline_rope];
                         [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                         [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
@@ -769,23 +711,9 @@ int ggml_mtl_graph_compute(
         }
     }
 
-    // extract results from the GPU
-    {
-        mtl_printf("%s: extract results from the GPU\n", __func__);
-
-        if (encoder != nil) {
-            [encoder endEncoding];
-            encoder = nil;
-        }
-
-        struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];
-
-        id<MTLBuffer> id_src = ggml_mtl_get_buffer(ctx, out, &offs_src0);
-        id<MTLBuffer> id_dst = ctx->out;
-
-        id<MTLBlitCommandEncoder> encoder_blit = [command_buffer blitCommandEncoder];
-        [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)];
-        [encoder_blit endEncoding];
+    if (encoder != nil) {
+        [encoder endEncoding];
+        encoder = nil;
     }
 
     [command_buffer commit];
@@ -796,31 +724,5 @@ int ggml_mtl_graph_compute(
         mtl_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
     }
 
-    ctx->logits = ctx->out.contents;
-
-    const float * logits = ctx->logits;
-
-    struct ggml_tensor * t = gf->nodes[gf->n_nodes - 1];
-    memcpy(t->data, logits, ggml_nbytes(t));
-
-#if 1
-    mtl_printf("logits: ");
-    for (int i = 0; i < 100; i++) {
-        mtl_printf("%8.4f ", logits[i]);
-    }
-    mtl_printf("\n");
-    double sum = 0.0;
-    int imax = 0;
-    double vmax = -INFINITY;
-    for (int i = 0; i < 32000; i++) {
-        sum += (double) logits[i];
-        if (logits[i] > vmax) {
-            vmax = logits[i];
-            imax = i;
-        }
-    }
-    mtl_printf("sum: %f, imax = %d, vmax = %f\n", sum, imax, vmax);
-#endif
-
     return 0;
 }
diff --git a/llama.cpp b/llama.cpp
index 5cd39b612..26722e091 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1085,7 +1085,7 @@ static void llama_model_load_internal(
             mmapped_size - vram_total + // weights in VRAM not in memory
             MEM_REQ_SCRATCH0().at(model.type) +
             MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
+            MEM_REQ_EVAL().at (model.type);
 
         // this is the memory required by one llama_state
         const size_t mem_required_state =
@@ -1255,14 +1255,19 @@ static bool llama_eval_internal(
     ggml_set_name(embd, "embd");
     memcpy(embd->data, tokens, N*ggml_element_size(embd));
 
+#ifdef GGML_USE_METAL
+    if (lctx.mtl_ctx) {
+        ggml_mtl_set_tensor(lctx.mtl_ctx, embd);
+    }
+#endif
+
+    struct ggml_tensor * cur;
     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;
 
-        struct ggml_tensor * cur;
-
-        //lctx.use_buf(ctx0, 0);
+        lctx.use_buf(ctx0, 0);
 
         // norm
         {
@@ -1378,7 +1383,7 @@
                     cur);
         }
 
-        //lctx.use_buf(ctx0, 1);
+        lctx.use_buf(ctx0, 1);
 
         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
 
@@ -1416,36 +1421,36 @@
         inpL = cur;
     }
 
-    //lctx.use_buf(ctx0, 0);
+    lctx.use_buf(ctx0, 0);
 
     // used at the end to optionally extract the embeddings
     struct ggml_tensor * embeddings = NULL;
 
     // norm
     {
+        cur = ggml_rms_norm(ctx0, inpL);
 
-        inpL = ggml_rms_norm(ctx0, inpL);
+        // cur = cur*norm(broadcasted)
+        cur = ggml_mul(ctx0, cur, model.norm);
 
-        // inpL = inpL*norm(broadcasted)
-        inpL = ggml_mul(ctx0, inpL, model.norm);
-
-        embeddings = inpL;
+        embeddings = cur;
     }
 
     // lm_head
-    inpL = ggml_mul_mat(ctx0, model.output, inpL);
+    cur = ggml_mul_mat(ctx0, model.output, cur);
 
-    //lctx.use_buf(ctx0, -1);
+    lctx.use_buf(ctx0, -1);
 
     // logits -> probs
-    //inpL = ggml_soft_max_inplace(ctx0, inpL);
+    //cur = ggml_soft_max_inplace(ctx0, cur);
 
     // run the computation
-    ggml_build_forward_expand(&gf, inpL);
+    ggml_build_forward_expand(&gf, cur);
 
 #ifdef GGML_USE_METAL
     if (lctx.mtl_ctx) {
-        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf, tokens, n_tokens, n_past);
+        ggml_mtl_graph_compute(lctx.mtl_ctx, &gf);
+        ggml_mtl_get_tensor(lctx.mtl_ctx, cur);
     } else {
         ggml_graph_compute(ctx0, &gf);
     }
@@ -1498,7 +1503,7 @@ static bool llama_eval_internal(
         ggml_free(ctx_vocab);
     }
 
-    float * logits = (float *) ggml_get_data(inpL);
+    float * logits = (float *) ggml_get_data(cur);
 
     printf("logits: ");
     for (int i = 0; i < 10; i++) {
@@ -1530,7 +1535,7 @@ static bool llama_eval_internal(
     //}
 
     //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+    //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N);
 
     // update kv token count
     lctx.model.kv_self.n = n_past + N;
@@ -1541,11 +1546,11 @@ static bool llama_eval_internal(
 
         if (lctx.logits_all) {
            logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur), sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+            memcpy(logits_out.data(), (float *) ggml_get_data(cur) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
         }
     }
 
@@ -2374,7 +2379,12 @@ struct llama_context * llama_init_from_file(
         ctx->embedding.resize(hparams.n_embd);
     }
 
+#ifdef GGML_USE_METAL
+    // when using Metal, we don't need the extra buffer for intermediate dequantization
+    ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)/100);
+#else
     ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+#endif
 
     ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
     ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
 
@@ -2383,14 +2393,21 @@ struct llama_context * llama_init_from_file(
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         // this allocates all Metal resources and memory buffers
-        ctx->mtl_ctx = ggml_mtl_init(
-            ggml_get_mem_buffer(ctx->model.ctx),
-            ggml_get_mem_size  (ctx->model.ctx),
-            ctx->buf_compute.addr,
-            ctx->buf_compute.size,
-            ctx->model.kv_self.buf.addr,
-            ctx->model.kv_self.buf.size,
-            32*ctx->model.hparams.n_vocab*sizeof(float));
+        if (params.use_mmap) {
+            ctx->mtl_ctx = ggml_mtl_init();
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ctx->model.mapping->addr, ctx->model.mapping->size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
+        } else {
+            ctx->mtl_ctx = ggml_mtl_init();
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "data", ggml_get_mem_buffer(ctx->model.ctx), ggml_get_mem_size(ctx->model.ctx));
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "eval", ctx->buf_compute.addr, ctx->buf_compute.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "kv",   ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size);
+            ggml_mtl_add_buffer(ctx->mtl_ctx, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size);
+        }
     }
 #endif