From 1103bdb57476b65404221f87b37ed2f91ffd4492 Mon Sep 17 00:00:00 2001
From: Lorenzo Toniazzi
Date: Tue, 2 Jul 2024 21:59:54 +0100
Subject: [PATCH] Fixed buffer allocation

---
 examples/main/main.cpp | 46 ------------------------------------------
 llama.cpp              | 30 +++++++++++++++++++++++++-----
 2 files changed, 25 insertions(+), 51 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index bdcf6f998..5e9e4001d 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -139,52 +139,6 @@ void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t b
 int main(int argc, char ** argv) {
-
-    // The library allows the user to define a certain function using the available tensor operations. This function
-    // definition is represented internally via a computation graph. Each tensor operation in the function definition
-    // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
-    // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
-    // using one of the available optimization algorithms.
-    //
-    // For example, here we define the function: f(x) = a*x^2 + b
-
-    // memory allocation happens here
-    // Create context allogating memory
-    struct ggml_init_params _params = {
-        .mem_size   = 16*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc   = true,
-    };
-    struct ggml_context * _ctx = ggml_init(_params);
-
-    struct ggml_tensor * x = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
-
-    // ggml_set_param(_ctx, x); // x is an input variable
-
-    // struct ggml_tensor * a = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
-    // struct ggml_tensor * b = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
-    // struct ggml_tensor * x2 = ggml_mul(_ctx, x, x);
-    // struct ggml_tensor * f = ggml_add(_ctx, ggml_mul(_ctx, a, x2), b);
-
-    // struct ggml_cgraph * gf = ggml_new_graph(_ctx);
-
-    // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_cpu_buffer_type());
-    // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
-    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
-    if (buf == nullptr) {
-        throw std::runtime_error("unable to allocate backend buffer");
-    }
-    else {
-        size_t buffer_size = ggml_backend_buft_get_max_size(ggml_backend_metal_buffer_type());
-
-        // Verify tensor allocations
-        verify_tensor_allocation(_ctx, buf, buffer_size);
-    }
-    ggml_used_mem(_ctx);
-    //
-
-
     gpt_params params;
     g_params = &params;
diff --git a/llama.cpp b/llama.cpp
index 744e4f8c3..cd4b43e94 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -150,6 +150,11 @@ struct lora_data {
     struct lora_info info;
     std::vector<uint8_t> data;
     struct ggml_context * ctx;
+    // the backend to perform the computation (CPU, CUDA, METAL)
+    ggml_backend_t backend = NULL;
+
+    // the backend buffer to storage the tensors data of a and b
+    ggml_backend_buffer_t buffer;
 
     uint32_t lora_r;
     uint32_t lora_alpha;
@@ -253,9 +258,17 @@ static struct lora_data * load_lora(struct lora_info * info) {
     struct lora_data * result = new struct lora_data;
     result->info = *info;
     result->ctx = NULL;
+    result->backend = NULL;
+    result->buffer = NULL;
     result->lora_r = 1;
     result->lora_alpha = 1;
 
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    result->backend = ggml_backend_metal_init();
+    if (!result->backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+
     struct llama_file_lora file(info->filename.c_str(), "rb");
     if (file.fp == NULL) {
         fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
@@ -307,9 +320,10 @@ static struct lora_data * load_lora(struct lora_info * info) {
         tensors_offset.push_back(offset);
         file.seek(nbytes, SEEK_CUR);
     }
+    result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend);
 
-    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type());
-    if (!buf) {
+    // ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type());
+    if (!result->buffer) {
         LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__);
     }
     // read tensor data
@@ -321,9 +335,15 @@ static struct lora_data * load_lora(struct lora_info * info) {
         size_t nbytes = ggml_nbytes(tensor);
         size_t nbytes_pad = ggml_nbytes_pad(tensor);
         file.seek(offset, SEEK_SET);
-        tensor->data = result->data.data() + data_offset;
-        file.read_raw(tensor->data, nbytes);
-        data_offset += nbytes_pad;
+
+        std::vector<uint8_t> read_buf;
+        read_buf.resize(ggml_nbytes(tensor));
+        file.read_raw(read_buf.data(), ggml_nbytes(tensor));
+        ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
+        // tensor_tmp->data = result->data.data() + data_offset;
+        // file.read_raw(tensor_tmp->data, nbytes);
+        // data_offset += nbytes_pad;
+        // ggml_backend_tensor_set(tensor, tensor_tmp->data, 0, ggml_nbytes(tensor));
     }
     return result;
 }
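
Note (not part of the commit): the allocation pattern the llama.cpp hunks move to — create the ggml context with no_alloc = true, allocate the context tensors in a backend buffer with ggml_backend_alloc_ctx_tensors(), then copy the file bytes in with ggml_backend_tensor_set() instead of pointing tensor->data at host memory — can be sketched in isolation. The sketch below is illustrative only; the GGML_USE_METAL guard, the CPU fallback, and the zero-filled staging buffer are assumptions of the sketch, not something the patch itself does.

    // minimal sketch: upload tensor data into a backend buffer
    #include <cstdio>
    #include <cstring>
    #include <vector>

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"
    #ifdef GGML_USE_METAL
    #include "ggml-metal.h"
    #endif

    int main() {
        // no_alloc: the context holds only tensor metadata; the data lives in a backend buffer
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);

        // pick a backend: Metal when compiled in, CPU otherwise (fallback is an assumption of this sketch)
        ggml_backend_t backend = NULL;
    #ifdef GGML_USE_METAL
        backend = ggml_backend_metal_init();
    #endif
        if (!backend) {
            backend = ggml_backend_cpu_init();
        }

        // allocate all tensors of the context in one backend buffer
        ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
        if (!buffer) {
            fprintf(stderr, "failed to allocate backend buffer\n");
            return 1;
        }

        // stage the bytes on the host, then copy them into the backend tensor;
        // writing through tensor->data directly is not valid for non-CPU buffers
        std::vector<uint8_t> read_buf(ggml_nbytes(a));
        memset(read_buf.data(), 0, read_buf.size());
        ggml_backend_tensor_set(a, read_buf.data(), 0, ggml_nbytes(a));

        ggml_backend_buffer_free(buffer);
        ggml_backend_free(backend);
        ggml_free(ctx);
        return 0;
    }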