From 1103bdb57476b65404221f87b37ed2f91ffd4492 Mon Sep 17 00:00:00 2001
From: Lorenzo Toniazzi
Date: Tue, 2 Jul 2024 21:59:54 +0100
Subject: [PATCH] Fixed buffer allocation

---
 examples/main/main.cpp | 46 ------------------------------------------
 llama.cpp              | 30 +++++++++++++++++++++++++-----
 2 files changed, 25 insertions(+), 51 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index bdcf6f998..5e9e4001d 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -139,52 +139,6 @@ void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t b
 int main(int argc, char ** argv) {
-
-    // The library allows the user to define a certain function using the available tensor operations. This function
-    // definition is represented internally via a computation graph. Each tensor operation in the function definition
-    // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
-    // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
-    // using one of the available optimization algorithms.
-    //
-    // For example, here we define the function: f(x) = a*x^2 + b
-
-    // memory allocation happens here
-    // Create context allogating memory
-    struct ggml_init_params _params = {
-        .mem_size   = 16*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = true,
-    };
-    struct ggml_context * _ctx = ggml_init(_params);
-
-    struct ggml_tensor * x = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
-
-    // ggml_set_param(_ctx, x); // x is an input variable
-
-    // struct ggml_tensor * a = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
-    // struct ggml_tensor * b = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
-    // struct ggml_tensor * x2 = ggml_mul(_ctx, x, x);
-    // struct ggml_tensor * f = ggml_add(_ctx, ggml_mul(_ctx, a, x2), b);
-
-    // struct ggml_cgraph * gf = ggml_new_graph(_ctx);
-
-    // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_cpu_buffer_type());
-    // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
-    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
-    if (buf == nullptr) {
-        throw std::runtime_error("unable to allocate backend buffer");
-    }
-    else {
-        size_t buffer_size = ggml_backend_buft_get_max_size(ggml_backend_metal_buffer_type());
-
-        // Verify tensor allocations
-        verify_tensor_allocation(_ctx, buf, buffer_size);
-    }
-    ggml_used_mem(_ctx);
-    //
-
-
     gpt_params params;
     g_params = &params;
diff --git a/llama.cpp b/llama.cpp
index 744e4f8c3..cd4b43e94 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -150,6 +150,11 @@ struct lora_data {
     struct lora_info info;
     std::vector<uint8_t> data;
     struct ggml_context * ctx;
+    // the backend to perform the computation (CPU, CUDA, METAL)
+    ggml_backend_t backend = NULL;
+
+    // the backend buffer to storage the tensors data of a and b
+    ggml_backend_buffer_t buffer;
 
     uint32_t lora_r;
     uint32_t lora_alpha;
@@ -253,9 +258,17 @@ static struct lora_data * load_lora(struct lora_info * info) {
     struct lora_data * result = new struct lora_data;
     result->info = *info;
     result->ctx = NULL;
+    result->backend = NULL;
+    result->buffer = NULL;
     result->lora_r = 1;
     result->lora_alpha = 1;
 
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    result->backend = ggml_backend_metal_init();
+    if (!result->backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+
     struct llama_file_lora file(info->filename.c_str(), "rb");
     if (file.fp == NULL) {
         fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
@@ -307,9 +320,10 @@ static struct lora_data * load_lora(struct lora_info * info) {
         tensors_offset.push_back(offset);
         file.seek(nbytes, SEEK_CUR);
     }
+    result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend);
 
-    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type());
-    if (!buf) {
+    // ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type());
+    if (!result->buffer) {
         LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__);
     }
     // read tensor data
@@ -321,9 +335,15 @@ static struct lora_data * load_lora(struct lora_info * info) {
         size_t nbytes = ggml_nbytes(tensor);
         size_t nbytes_pad = ggml_nbytes_pad(tensor);
         file.seek(offset, SEEK_SET);
-        tensor->data = result->data.data() + data_offset;
-        file.read_raw(tensor->data, nbytes);
-        data_offset += nbytes_pad;
+
+        std::vector<uint8_t> read_buf;
+        read_buf.resize(ggml_nbytes(tensor));
+        file.read_raw(read_buf.data(), ggml_nbytes(tensor));
+        ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
+        // tensor_tmp->data = result->data.data() + data_offset;
+        // file.read_raw(tensor_tmp->data, nbytes);
+        // data_offset += nbytes_pad;
+        // ggml_backend_tensor_set(tensor, tensor_tmp->data, 0, ggml_nbytes(tensor));
     }
     return result;
 }
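
Note (not part of the commit): the allocation pattern the llama.cpp hunks move to — create the ggml context with no_alloc = true, allocate the context tensors in a backend buffer with ggml_backend_alloc_ctx_tensors(), then copy the file bytes in with ggml_backend_tensor_set() instead of pointing tensor->data at host memory — can be sketched in isolation. The sketch below is illustrative only; the GGML_USE_METAL guard, the CPU fallback, and the zero-filled staging buffer are assumptions of the sketch, not something the patch itself does.

    // minimal sketch: upload tensor data into a backend buffer
    #include <cstdio>
    #include <cstring>
    #include <vector>

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #ifdef GGML_USE_METAL
    #include "ggml-metal.h"
    #endif

    int main() {
        // no_alloc: the context holds only tensor metadata; the data lives in a backend buffer
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);

        // pick a backend: Metal when compiled in, CPU otherwise (fallback is an assumption of this sketch)
        ggml_backend_t backend = NULL;
    #ifdef GGML_USE_METAL
        backend = ggml_backend_metal_init();
    #endif
        if (!backend) {
            backend = ggml_backend_cpu_init();
        }

        // allocate all tensors of the context in one backend buffer
        ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
        if (!buffer) {
            fprintf(stderr, "failed to allocate backend buffer\n");
            return 1;
        }

        // stage the bytes on the host, then copy them into the backend tensor;
        // writing through tensor->data directly is not valid for non-CPU buffers
        std::vector<uint8_t> read_buf(ggml_nbytes(a));
        memset(read_buf.data(), 0, read_buf.size());
        ggml_backend_tensor_set(a, read_buf.data(), 0, ggml_nbytes(a));

        ggml_backend_buffer_free(buffer);
        ggml_backend_free(backend);
        ggml_free(ctx);
        return 0;
    }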