Fixed buffer allocation

Lorenzo Toniazzi 2024-07-02 21:59:54 +01:00
parent 028d3f7c89
commit 1103bdb574
2 changed files with 25 additions and 51 deletions

@@ -139,52 +139,6 @@ void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t b
int main(int argc, char ** argv) {
// The library allows the user to define a certain function using the available tensor operations. This function
// definition is represented internally via a computation graph. Each tensor operation in the function definition
// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
// using one of the available optimization algorithms.
//
// For example, here we define the function: f(x) = a*x^2 + b
// memory allocation happens here
// Create the ggml context; with no_alloc = true the tensor data is allocated later in a backend buffer
struct ggml_init_params _params = {
.mem_size = 16*1024*1024,
.mem_buffer = NULL,
.no_alloc = true,
};
struct ggml_context * _ctx = ggml_init(_params);
struct ggml_tensor * x = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
// ggml_set_param(_ctx, x); // x is an input variable
// struct ggml_tensor * a = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
// struct ggml_tensor * b = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
// struct ggml_tensor * x2 = ggml_mul(_ctx, x, x);
// struct ggml_tensor * f = ggml_add(_ctx, ggml_mul(_ctx, a, x2), b);
// struct ggml_cgraph * gf = ggml_new_graph(_ctx);
// // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_cpu_buffer_type());
// // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
if (buf == nullptr) {
throw std::runtime_error("unable to allocate backend buffer");
}
else {
size_t buffer_size = ggml_backend_buft_get_max_size(ggml_backend_metal_buffer_type());
// Verify tensor allocations
verify_tensor_allocation(_ctx, buf, buffer_size);
}
ggml_used_mem(_ctx);
//
gpt_params params;
g_params = &params;
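
For context, the block deleted above was exercising the ggml backend-buffer pattern: create a context with no_alloc = true, define the tensors, then place all of their data in a single backend buffer. A minimal standalone sketch of that pattern (not part of this commit; it uses the CPU buffer type so it does not require Metal, and the repo-specific verify_tensor_allocation helper is omitted):

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cstdio>

int backend_alloc_example(void) {
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = true,   // only tensor metadata lives in this context
    };
    struct ggml_context * ctx = ggml_init(params);

    // f(x) = a*x^2 + b, as in the comment above
    struct ggml_tensor * x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
    struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
    (void) f;

    // allocate the data of every tensor in ctx in one backend buffer
    ggml_backend_buffer_t buf =
        ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    if (buf == nullptr) {
        fprintf(stderr, "unable to allocate backend buffer\n");
        ggml_free(ctx);
        return 1;
    }

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return 0;
}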

@@ -150,6 +150,11 @@ struct lora_data {
struct lora_info info;
std::vector<uint8_t> data;
struct ggml_context * ctx;
// the backend to perform the computation (CPU, CUDA, METAL)
ggml_backend_t backend = NULL;
// the backend buffer to store the tensor data of a and b
ggml_backend_buffer_t buffer;
uint32_t lora_r;
uint32_t lora_alpha;
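
The two new members pair a backend handle with the buffer that will own the tensor data, and both need to be released together when the lora_data is destroyed. A sketch of that lifecycle (not part of the commit; the fall-back to the CPU backend is a common llama.cpp pattern rather than something this change adds, and alloc_lora_backend / free_lora_backend are hypothetical helper names):

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-metal.h"
#include <cstdio>

// Pick a backend, then allocate every tensor of ctx in one buffer it owns.
static ggml_backend_buffer_t alloc_lora_backend(struct ggml_context * ctx, ggml_backend_t * out_backend) {
    ggml_backend_t backend = ggml_backend_metal_init();
    if (backend == NULL) {
        fprintf(stderr, "Metal unavailable, falling back to CPU\n");
        backend = ggml_backend_cpu_init();
    }
    *out_backend = backend;
    return ggml_backend_alloc_ctx_tensors(ctx, backend);
}

// Teardown: free the buffer before the backend that allocated it.
static void free_lora_backend(ggml_backend_buffer_t buffer, ggml_backend_t backend) {
    ggml_backend_buffer_free(buffer);
    ggml_backend_free(backend);
}
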
@@ -253,9 +258,17 @@ static struct lora_data * load_lora(struct lora_info * info) {
struct lora_data * result = new struct lora_data;
result->info = *info;
result->ctx = NULL;
result->backend = NULL;
result->buffer = NULL;
result->lora_r = 1;
result->lora_alpha = 1;
fprintf(stderr, "%s: using Metal backend\n", __func__);
result->backend = ggml_backend_metal_init();
if (!result->backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
}
struct llama_file_lora file(info->filename.c_str(), "rb");
if (file.fp == NULL) {
fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
@@ -307,9 +320,10 @@ static struct lora_data * load_lora(struct lora_info * info) {
tensors_offset.push_back(offset);
file.seek(nbytes, SEEK_CUR);
}
result->buffer = ggml_backend_alloc_ctx_tensors(result->ctx, result->backend);
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type());
if (!buf) {
// ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type());
if (!result->buffer) {
LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__);
}
// read tensor data
@@ -321,9 +335,15 @@ static struct lora_data * load_lora(struct lora_info * info) {
size_t nbytes = ggml_nbytes(tensor);
size_t nbytes_pad = ggml_nbytes_pad(tensor);
file.seek(offset, SEEK_SET);
tensor->data = result->data.data() + data_offset;
file.read_raw(tensor->data, nbytes);
data_offset += nbytes_pad;
std::vector<char> read_buf;
read_buf.resize(ggml_nbytes(tensor));
file.read_raw(read_buf.data(), ggml_nbytes(tensor));
ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
// tensor_tmp->data = result->data.data() + data_offset;
// file.read_raw(tensor_tmp->data, nbytes);
// data_offset += nbytes_pad;
// ggml_backend_tensor_set(tensor, tensor_tmp->data, 0, ggml_nbytes(tensor));
}
return result;
}
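
The tensor-loading change in the last hunk switches from pointing tensor->data at host memory to staging the bytes in a temporary buffer and copying them into the backend buffer, which also works when the buffer is not host-addressable (e.g. Metal or CUDA). A minimal sketch of that pattern (not part of the commit; load_tensor_from_file is a hypothetical helper standing in for the llama_file_lora reader used above):

#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>
#include <vector>

// Read ggml_nbytes(tensor) bytes at `offset` from `fp` and copy them into the
// backend buffer backing `tensor`.
static bool load_tensor_from_file(FILE * fp, long offset, struct ggml_tensor * tensor) {
    const size_t nbytes = ggml_nbytes(tensor);

    std::vector<char> read_buf(nbytes);           // host-side staging buffer
    if (fseek(fp, offset, SEEK_SET) != 0) {
        return false;
    }
    if (fread(read_buf.data(), 1, nbytes, fp) != nbytes) {
        return false;
    }

    // let the backend copy the staged bytes into the tensor's buffer
    ggml_backend_tensor_set(tensor, read_buf.data(), 0, nbytes);
    return true;
}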