From 652c849643a81d0fee3f178b90f093f71d1f49f5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 18 Jul 2023 18:51:02 +0300
Subject: [PATCH] ggml : add is_ram_shared to ggml_backend

Metal can share the RAM memory and can utilize mmap without a temp buffer
---
 ggml-backend.c |  5 ++--
 ggml-backend.h | 14 ++++++++++-
 ggml-cuda.cu   |  5 ++--
 llama.cpp      | 65 +++++++++++++++++++++++++++++++++++++-------------
 4 files changed, 68 insertions(+), 21 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 85a6cac05..bd97a5b49 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -255,8 +255,9 @@ struct ggml_backend ggml_backend_cpu_init(void) {
     ctx->work_size = 0;
 
     struct ggml_backend cpu_backend = {
-        /* .interface = */ &cpu_backend_interface,
-        /* .context   = */ ctx
+        /* .interface     = */ &cpu_backend_interface,
+        /* .context       = */ ctx,
+        /* .is_ram_shared = */ true,
     };
     return cpu_backend;
 }
diff --git a/ggml-backend.h b/ggml-backend.h
index 44b9f785f..635555719 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -61,7 +61,10 @@ extern "C" {
 
     struct ggml_backend {
         struct ggml_backend_interface * interface;
+
         ggml_backend_context_t context;
+
+        bool is_ram_shared;
     };
 
     // backend helper functions
@@ -78,7 +81,16 @@ extern "C" {
     static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface->graph_compute(backend->context, cgraph); }
 
     // buffer and tensor allocation
-    GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors); // GG: probably return ptr
+    // TODO:
+    //   - return "struct ggml_buffer *"
+    //   - fix namings:
+    //     - ggml_backend_alloc_buffer -> ggml_backend_buffer_alloc
+    //     - ggml_backend_free_buffer -> ggml_backend_buffer_free
+    //     - ggml_backend_reset_buffer -> ggml_backend_buffer_reset
+    //     - ggml_backend_alloc_tensor -> ggml_backend_tensor_alloc
+    //     - ggml_backend_tensor_cpy -> ggml_backend_tensor_copy
+    //
+    GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors);
     GGML_API void ggml_backend_free_buffer(struct ggml_buffer * buffer);
     static inline void ggml_backend_reset_buffer(struct ggml_buffer * buffer) { buffer->backend->interface->reset_buffer(buffer->backend->context, buffer->backend_buffer); }
     static inline void ggml_backend_alloc_tensor(struct ggml_buffer * buffer, struct ggml_tensor * tensor) { buffer->backend->interface->alloc_tensor(buffer->backend->context, buffer->backend_buffer, tensor); }
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 343eda0b2..a2d7c545b 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -1834,8 +1834,9 @@ ggml_backend ggml_backend_cuda_init(void) {
     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context;
 
     ggml_backend cuda_backend = {
-        /* .interface = */ &cuda_backend_interface,
-        /* .context   = */ ctx
+        /* .interface     = */ &cuda_backend_interface,
+        /* .context       = */ ctx,
+        /* .is_ram_shared = */ false,
     };
     return cuda_backend;
 }
diff --git a/llama.cpp b/llama.cpp
index e4a566df0..c234cdf3f 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -225,6 +225,7 @@ struct llama_model {
     llama_vocab vocab;
 
     // backends
+    // TODO: change to pointers
     ggml_backend backend_cpu;
     ggml_buffer buf_cpu;
     ggml_context * ctx_cpu = NULL;
@@ -298,6 +299,7 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     ggml_buffer buf_compute_cpu = {};
+
 #ifdef GGML_USE_CUDA
     ggml_buffer buf_compute_cuda = {};
 #endif
@@ -612,7 +614,7 @@ struct llama_model_loader {
         }
     }
 
-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
         size_t lock_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
@@ -634,11 +636,11 @@ struct llama_model_loader {
             }
 
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
 
-            bool is_cpu = lt.ggml_tensor->backend == &model->backend_cpu;
+            const bool is_ram_shared = lt.ggml_tensor->backend->is_ram_shared;
             // select buffer to load data into
             if (!use_mmap) {
-                if (is_cpu) {
+                if (is_ram_shared) {
                     lt.data = (uint8_t *) lt.ggml_tensor->data;
                 } else {
                     // read to temporary buffer
@@ -649,7 +651,7 @@ struct llama_model_loader {
 
             load_data_for(lt);
 
-            if (is_cpu) {
+            if (is_ram_shared) {
                 if (use_mmap) {
                     lt.ggml_tensor->data = lt.data;
                     // TODO: this assumes that the data to lock is contiguous, which may not always be the case
@@ -671,7 +673,7 @@ struct llama_model_loader {
         }
     }
 
-    void load_data_for(llama_load_tensor & lt) {
+    void load_data_for(llama_load_tensor & lt) const {
         if (use_mmap) {
             lt.data = (uint8_t *) mapping->addr + lt.file_off;
         } else {
@@ -957,6 +959,7 @@ static void llama_model_load_internal(
 
     ggml_backend * backend_cpu = &model.backend_cpu;
     ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection
+
 #ifdef GGML_USE_CUDA
     if (n_gpu_layers > 0) {
         model.backend_cuda = ggml_backend_cuda_init();
@@ -965,13 +968,14 @@ static void llama_model_load_internal(
 #endif
 
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-        model.backend_metal = ggml_backend_cpu_init();
+        model.backend_metal = ggml_backend_metal_init();
         backend_gpu = &model.backend_metal;
     }
 #endif
 
     // assign splits to the backends
     const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers);
+
     model.backend_inp = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu;
     model.backend_out = n_gpu_layers > 0 ? backend_gpu : backend_cpu;
@@ -1011,7 +1015,7 @@ static void llama_model_load_internal(
     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
     for (const auto & it : ctx_sizes) {
         fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first == backend_cpu && ml->use_mmap) {
+        if (it.first->is_ram_shared && ml->use_mmap) {
             fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
         }
         fprintf(stderr, "\n");
@@ -1135,12 +1139,10 @@ static void llama_model_load_internal(
             ctx_sum += it.second;
         }
 
-        const size_t mem_required =
-            ctx_sum + MEM_REQ_EVAL().at(model.type);
+        const size_t mem_required = ctx_sum + MEM_REQ_EVAL().at(model.type);
 
         // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
+        const size_t mem_required_state = scale*MEM_REQ_KV_SELF().at(model.type);
 
         fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
@@ -1162,6 +1164,7 @@ static void llama_model_load_internal(
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = ggml_time_us() - model.t_start_us;
+
 }
 
 static bool llama_model_load(
@@ -1226,6 +1229,7 @@ static ggml_graph_splits llama_build_graph(
 
     // initialize contexts for every backend
    struct ggml_context * ctx_cpu = nullptr;
+
     if (lctx.buf_compute_cpu.mem_size > 0) {
         struct ggml_init_params params = ggml_init_params_default();
         params.buffer = &lctx.buf_compute_cpu;
@@ -1235,6 +1239,7 @@ static ggml_graph_splits llama_build_graph(
 
 #ifdef GGML_USE_CUDA
     struct ggml_context * ctx_cuda = nullptr;
+
     if (lctx.buf_compute_cuda.mem_size > 0) {
         struct ggml_init_params params = ggml_init_params_default();
         params.buffer = &lctx.buf_compute_cuda;
@@ -1243,30 +1248,54 @@ static ggml_graph_splits llama_build_graph(
     }
 #endif
 
+#ifdef GGML_USE_METAL
+    struct ggml_context * ctx_metal = nullptr;
+
+    if (lctx.buf_compute_metal.mem_size > 0) {
+        struct ggml_init_params params = ggml_init_params_default();
+        params.buffer = &lctx.buf_compute_metal;
+        params.compute_type = compute_type;
+        ctx_metal = ggml_init(params);
+    }
+#endif
+
     // TODO: clean this
     struct ggml_context * ctx_i = nullptr;
-    struct ggml_context * ctx_ls[80] = {nullptr};
     struct ggml_context * ctx_o = nullptr;
     struct ggml_context * ctx_kv = nullptr;
+    struct ggml_context * ctx_ls[80] = {nullptr};
 
     if (lctx.model.backend_inp == &lctx.model.backend_cpu) ctx_i = ctx_cpu;
     if (lctx.model.backend_out == &lctx.model.backend_cpu) ctx_o = ctx_cpu;
+
 #ifdef GGML_USE_CUDA
     if (lctx.model.backend_inp == &lctx.model.backend_cuda) ctx_i = ctx_cuda;
     if (lctx.model.backend_out == &lctx.model.backend_cuda) ctx_o = ctx_cuda;
 #endif
+#ifdef GGML_USE_METAL
+    if (lctx.model.backend_inp == &lctx.model.backend_metal) ctx_i = ctx_metal;
+    if (lctx.model.backend_out == &lctx.model.backend_metal) ctx_o = ctx_metal;
+#endif
 
     for (int il = 0; il < n_layer; il++) {
-        if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu;
+        if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu;
+
 #ifdef GGML_USE_CUDA
         if (lctx.model.backend_layers[il] == &lctx.model.backend_cuda) ctx_ls[il] = ctx_cuda;
+#endif
+#ifdef GGML_USE_METAL
+        if (lctx.model.backend_layers[il] == &lctx.model.backend_metal) ctx_ls[il] = ctx_metal;
 #endif
     }
 
-    if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu;
+    if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu;
+
 #ifdef GGML_USE_CUDA
     if (lctx.backend_kv == &lctx.model.backend_cuda) ctx_kv = ctx_cuda;
 #endif
+#ifdef GGML_USE_METAL
+    if (lctx.backend_kv == &lctx.model.backend_metal) ctx_kv = ctx_metal;
+#endif
 
     struct ggml_tensor * inpL;
 
@@ -1522,7 +1551,7 @@ static ggml_graph_splits llama_build_graph(
     //}
 
 #ifdef LLAMA_1L_GRAPH_DUMP
-    if (N==1 && n_past == 0) {
+    if (N == 1 && n_past == 0) {
         ggml_graph_dump_dot(gf, NULL, "llama.dot");
         printf("graph for N=%i, n_past=%i dumped to llama.dot\n", N, n_past);
         exit(0);
@@ -1547,6 +1576,11 @@ static ggml_graph_splits llama_build_graph(
         ggml_free(ctx_cuda);
     }
 #endif
+#ifdef GGML_USE_METAL
+    if (ctx_metal != nullptr) {
+        ggml_free(ctx_metal);
+    }
+#endif
 
     return splits;
 }
@@ -2651,7 +2685,6 @@ struct llama_context * llama_new_context_with_model(
 
     ctx->rng = std::mt19937(params.seed);
     ctx->logits_all = params.logits_all;
 
-    // TODO: choose backend depending on n_layers/low_vram
 #ifdef GGML_USE_CUDA
     if ((uint32_t)params.n_gpu_layers >= model->hparams.n_layer/2) {
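
Reviewer note (not part of the patch): the central change above is that the loader no longer asks "is this tensor on the CPU backend?" but "does this tensor's backend share RAM with the host?", which is what lets a Metal backend point tensors straight at the mmap'ed model file instead of staging through a temporary buffer. The following is a minimal standalone sketch of that decision, assuming simplified stand-in types; backend_t, tensor_t and read_tensor_from_file are illustrative names, not the actual llama.cpp / ggml API.

// Illustrative sketch only: the loading branch generalized by backend->is_ram_shared.
#include <cstddef>
#include <cstdint>
#include <vector>

struct backend_t {
    bool is_ram_shared; // true: backend operates directly on host RAM (CPU, Metal); false: discrete device (CUDA)
};

struct tensor_t {
    backend_t * backend;
    uint8_t   * data;     // destination for the tensor weights
    size_t      size;     // number of bytes
    size_t      file_off; // offset of the weights in the model file
};

// Placeholder for reading t.size bytes at t.file_off from the model file into dst.
static void read_tensor_from_file(const tensor_t & t, uint8_t * dst) {
    (void) t; (void) dst; // real code would pread()/fread() here
}

// Decide where the bytes of one tensor come from.
static void load_tensor(tensor_t & t, bool use_mmap, uint8_t * mmap_addr) {
    if (t.backend->is_ram_shared) {
        if (use_mmap) {
            // RAM-shared backend: point directly into the mapping,
            // no copy and no temporary buffer.
            t.data = mmap_addr + t.file_off;
        } else {
            // Read straight into the tensor's own host buffer.
            read_tensor_from_file(t, t.data);
        }
    } else {
        // Non-shared backend: stage through a host buffer first,
        // then hand it to the backend to upload to device memory.
        std::vector<uint8_t> staging(t.size);
        read_tensor_from_file(t, staging.data());
        // ... backend-specific upload of staging.data() would go here ...
    }
}

This mirrors the is_ram_shared branch added to load_all_data above, with the CPU backend initializing the flag to true and the CUDA backend to false.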