diff --git a/BRANCH_SETUP.md b/BRANCH_SETUP.md
index d9f7405b5..0b6cdac74 100644
--- a/BRANCH_SETUP.md
+++ b/BRANCH_SETUP.md
@@ -36,6 +36,8 @@ Run main with base model and lora adapter to hot-swap
    -n 128
 ```
 
+Working, but with `ggml_metal_get_buffer: error: tensor 'blk.16.attn_v.weight.loraB' buffer is nil`
+
 With `ngl > 0` the code breaks. Probably because the Lora tensors try to interact with the base tensors (as in `lora_mul_mat`), but the lora tensors are not moved to the gpu buffer of the base tensors.
 
 # Logic
@@ -47,4 +49,254 @@ With `ngl > 0` the code breaks. Probably because the Lora tensors try to interac
 
 - Only one Lora adapter can be passed.
 - Applying only adapter to Q, K, V matrices to keep the code contained (finetuning trained lora tensors for all linear layers)
-- GPU not supported
\ No newline at end of file
+- GPU not supported
+
+
+
+
+# Tutorial
+
+```cpp
+#include "llama.h"
+
+#include "unicode.h"
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+#ifdef GGML_USE_RPC
+#  include "ggml-rpc.h"
+#endif
+
+#ifdef GGML_USE_CUDA
+#  include "ggml-cuda.h"
+#elif defined(GGML_USE_VULKAN)
+#  include "ggml-vulkan.h"
+#elif defined(GGML_USE_SYCL)
+#  include "ggml-sycl.h"
+#elif defined(GGML_USE_KOMPUTE)
+#  include "ggml-kompute.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#  include "ggml-metal.h"
+#endif
+
+// TODO: replace with ggml API call
+#define QK_K 256
+
+#ifdef __has_include
+    #if __has_include(<unistd.h>)
+        #include <unistd.h>
+        #if defined(_POSIX_MAPPED_FILES)
+            #include <sys/mman.h>
+            #include <fcntl.h>
+        #endif
+        #if defined(_POSIX_MEMLOCK_RANGE)
+            #include <sys/resource.h>
+        #endif
+    #endif
+#endif
+
+#if defined(_WIN32)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #ifndef PATH_MAX
+        #define PATH_MAX MAX_PATH
+    #endif
+    #include <io.h>
+#endif
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cctype>
+#include <cfloat>
+#include <cinttypes>
+#include <climits>
+#include <cmath>
+#include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <forward_list>
+#include <fstream>
+#include <functional>
+#include <future>
+#include <initializer_list>
+#include <locale>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <numeric>
+#include <queue>
+#include <random>
+#include <regex>
+#include <set>
+#include <sstream>
+#include <thread>
+#include <type_traits>
+#include <unordered_map>
+#include "ggml-metal.h"
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...)
+#endif
+
+#define LLAMA_MAX_NODES   8192
+#define LLAMA_MAX_EXPERTS 160
+
+
+int main() {
+    struct ggml_init_params params = {
+        .mem_size   = 16*1024*1024,
+        .mem_buffer = NULL,
+        .no_alloc   = true,
+    };
+
+    // The library allows the user to define a certain function using the available tensor operations. This function
+    // definition is represented internally via a computation graph. Each tensor operation in the function definition
+    // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
+    // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
+    // using one of the available optimization algorithms.
+    //
+    // For example, here we define the function: f(x) = a*x^2 + b
+
+    // memory allocation happens here
+    // Create the context, allocating memory
+    struct ggml_context * ctx = ggml_init(params);
+
+    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+
+    ggml_set_param(ctx, x); // x is an input variable
+
+    struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+    struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+    struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
+    struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+
+    // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
+    // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
+    if (buf == nullptr) {
+        throw std::runtime_error("unable to allocate backend buffer");
+    }
+    ggml_used_mem(ctx);
+
+    // llama_default_buffer_type_offload(model, layer_gpu); used in llama.cpp
+    // How do we check which buffer the context was allocated in?
+    // Can we look at single tensors? Option: check how it is initialized in the base model.
+
+    // Try this
+    // You can simplify all of this for testing: if you are using CPU only, just run with -ngl 0
+    // and allocate everything in a CPU buffer by using
+    // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
+    // or run with -ngl 99 and use a Metal buffer type instead with
+    // ggml_backend_metal_buffer_type()
+    // It will still run if you allocate the tensors in the wrong buffer type as long as you use ggml-backend
+    // to allocate the tensors, it will just be slower.
+
+    // Notice that the function definition above does not involve any actual computation. The computation is performed only
+    // when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
+
+
+    ggml_build_forward_expand(gf, f);
+
+    // set the input variable and parameter values
+    ggml_set_f32(x, 2.0f);
+    ggml_set_f32(a, 3.0f);
+    ggml_set_f32(b, 4.0f);
+
+    ggml_graph_compute_with_ctx(ctx, gf, 1);
+
+    printf("f = %f\n", ggml_get_f32_1d(f, 0));
+
+    // The actual computation is performed in the ggml_graph_compute() function.
+    //
+    // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
+    // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
+    // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
+    // buffer and, after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
+    // actually needed.
+    //
+    // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
+    // differentiation and optimization algorithms.
+    //
+    // The described approach allows defining the function graph once and then computing its forward or backward graphs
+    // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
+    // the user can avoid the memory allocation overhead at runtime.
+    //
+    // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
+    // citizens, but in theory the library can be extended to support FP8 and integer data types.
+    //
+    // Each tensor operation produces a new tensor.
+    // Initially the library was envisioned to support only the use of unary
+    // and binary operations. Most of the available operations fall into one of these two categories. With time, it became
+    // clear that the library needs to support more complex operations. The way to support these operations is not clear
+    // yet, but a few examples are demonstrated in the following operations:
+    //
+    //   - ggml_permute()
+    //   - ggml_conv_1d_1s()
+    //   - ggml_conv_1d_2s()
+    //
+    // For each tensor operator, the library implements a forward and backward computation function. The forward function
+    // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
+    // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
+    // calculus class, or watch the following video:
+    //
+    //   What is Automatic Differentiation?
+    //   https://www.youtube.com/watch?v=wG_nF1awSSY
+
+    // ## Tensor data (struct ggml_tensor)
+    //
+    // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
+    // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
+    // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
+
+    struct ggml_tensor * c = ggml_add(ctx, a, b);
+
+    assert(c->src[0] == a);
+    assert(c->src[1] == b);
+
+    // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
+    // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
+    // storing tensors that are not contiguous in memory, which is useful for operations such as transposition and
+    // permutation. All tensor operations have to take the stride into account and not assume that the tensor is
+    // contiguous in memory.
+
+    // The data of the tensor is accessed via the "data" pointer. For example:
+
+    const int nx = 2;
+    const int ny = 3;
+
+    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
+
+    for (int y = 0; y < ny; y++) {
+        for (int x = 0; x < nx; x++) {
+            *(float *) ((char *) A->data + y*A->nb[1] + x*A->nb[0]) = x + y;
+        }
+    }
+
+    //
+    // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d(), that can be used.
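+
+    // Illustrative addition (not from the original ggml docs): assuming the 2x3 tensor A above is
+    // a freshly created, contiguous F32 tensor, the same values can be written and read back
+    // through the 1d helpers, using a flat row-major index i = y*nx + x.
+    for (int y = 0; y < ny; y++) {
+        for (int x = 0; x < nx; x++) {
+            ggml_set_f32_1d(A, y*nx + x, (float)(x + y));
+        }
+    }
+    printf("A[1,2] = %f\n", ggml_get_f32_1d(A, 2*nx + 1)); // element at x = 1, y = 2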
+    //
+
+}
+```
\ No newline at end of file
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index b97b7b793..bdcf6f998 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -117,7 +117,74 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
     LOG_TEE("%s", text);
 }
 
+#include "ggml-metal.h"
+
+bool is_pointer_in_buffer_range(void *ptr, void *buffer_start, size_t buffer_size) {
+    return (ptr >= (char*)buffer_start) && (ptr < ((char*)buffer_start + buffer_size));
+}
+
+
+void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t buffer, size_t buffer_size) {
+    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+        if (t->data != NULL) {
+            if (!is_pointer_in_buffer_range(t->data, buffer, buffer_size)) {
+                fprintf(stderr, "Tensor %s is not within the allocated buffer range.\n", t->name);
+            } else {
+                printf("Tensor %s is correctly allocated in the buffer.\n", t->name);
+            }
+        }
+    }
+}
+
 int main(int argc, char ** argv) {
+
+
+    // The library allows the user to define a certain function using the available tensor operations. This function
+    // definition is represented internally via a computation graph. Each tensor operation in the function definition
+    // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
+    // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
+    // using one of the available optimization algorithms.
+    //
+    // For example, here we define the function: f(x) = a*x^2 + b
+
+    // memory allocation happens here
+    // Create the context, allocating memory
+    struct ggml_init_params _params = {
+        .mem_size   = 16*1024*1024,
+        .mem_buffer = NULL,
+        .no_alloc   = true,
+    };
+    struct ggml_context * _ctx = ggml_init(_params);
+
+    struct ggml_tensor * x = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
+
+    // ggml_set_param(_ctx, x); // x is an input variable
+
+    // struct ggml_tensor * a  = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
+    // struct ggml_tensor * b  = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
+    // struct ggml_tensor * x2 = ggml_mul(_ctx, x, x);
+    // struct ggml_tensor * f  = ggml_add(_ctx, ggml_mul(_ctx, a, x2), b);
+
+    // struct ggml_cgraph * gf = ggml_new_graph(_ctx);
+
+    // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_cpu_buffer_type());
+    // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
+    if (buf == nullptr) {
+        throw std::runtime_error("unable to allocate backend buffer");
+    }
+    else {
+        size_t buffer_size = ggml_backend_buft_get_max_size(ggml_backend_metal_buffer_type());
+
+        // Verify tensor allocations
+        verify_tensor_allocation(_ctx, buf, buffer_size);
+    }
+    ggml_used_mem(_ctx);
+    //
+
+
     gpt_params params;
     g_params = &params;
 
diff --git a/llama.cpp b/llama.cpp
index 15e83b0c4..744e4f8c3 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -307,6 +307,11 @@ static struct lora_data * load_lora(struct lora_info * info) {
         tensors_offset.push_back(offset);
         file.seek(nbytes, SEEK_CUR);
     }
+
+    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type());
+    if (!buf) {
+        LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__);
+    }
     // read tensor data
     result->data.resize(total_nbytes_pad);
     size_t data_offset = 0;
@@ -3922,7 +3927,7 @@ struct llama_model_loader {
 
         std::vector<no_init<uint8_t>> read_buf;
         std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;
-
+        // Allocate tensors data to buffer
         for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
             const auto * weight = get_weight(ggml_get_name(cur));
             if (weight == nullptr) {
@@ -3951,7 +3956,7 @@ struct llama_model_loader {
                         return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
                     }));
                 }
-
+                // TODO LORA allocation of base tensors
                 GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                 if (buf_mmap && cur->data == nullptr) {
                     ggml_backend_tensor_alloc(buf_mmap, cur, data);
@@ -5392,7 +5397,7 @@ static bool llm_load_tensors(
     auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
 
     model.layers.resize(n_layer);
-
+    // main players: model, ml, ctx_input/output, tn (gets the tensor name?)
     const auto tn = LLM_TN(model.arch);
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
@@ -6666,7 +6671,7 @@ static bool llm_load_tensors(
 #endif
         }
     }
-#ifdef GGML_USE_METAL
+#ifdef GGML_USE_METAL // LORA: use Metal on the base tensors
         else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 const size_t max_size = ggml_get_max_tensor_size(ctx);
@@ -16341,12 +16346,92 @@ struct llama_context * llama_new_context_with_model(
 
         // Assign data
         ctx->llora_data = *loras[0];
-        // build the map?
-        ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
-        std::vector<std::string> keys;
-        for (const auto& pair : ctx->lora_weights_map) {
-            keys.push_back(pair.first);
+
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type());
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__);
         }
+        // Looks like this worked; need to check whether the tensors now have the new buffer (not sure below).
+        // Also, do we need to set the tensors? It is not clear where the data is; it looks like it is loaded after the
+        // tensor creation in the context, but loaded where? Because if the data is already present, it has to be set with
+        // ggml_backend_tensor_set instead of ggml_backend_tensor_alloc.
+
+        // TODO: it looks like load_lora already gives me a context; understand whether
+        // I am using it.
+        // If that context is set to the right buffer with ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type()),
+        // the tensors should already have been created in the context.
+        // Understand where the weights are loaded instead.
+        // Load the weights/data in the context.
+        // Maybe check how the finetuning example manages the lora weights.
+
+
+
+        // build the map?
+        // TODO LORA: the ctx->lora_weights_map layers seem not to have a buffer type, but they should, as in the simple example.
+        ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
+        // std::vector<std::string> keys;
+        // for (const auto& pair : ctx->lora_weights_map) {
+        //     keys.push_back(pair.first);
+
+        //     ggml_tensor * tensorA = pair.second.loraA;
+        //     ggml_tensor * tensorB = pair.second.loraB;
+
+        //     ggml_tensor * tensorA_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorA->type, 4, tensorA->ne);
+        //     ggml_tensor * tensorB_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorB->type, 4, tensorB->ne);
+
+        // }
+
+        // for (struct ggml_tensor * cur = ggml_get_first_tensor((ctx->llora_data).ctx); cur != NULL; cur = ggml_get_next_tensor((ctx->llora_data).ctx, cur)) {
+        //     const auto * name = ggml_get_name(cur);
+        //     // ggml_backend_tensor_set(tensorA, tensorA->data, 0, ggml_nbytes(tensorA));
+        //     // ggml_backend_tensor_set(tensorB, tensorB->data, 0, ggml_nbytes(tensorB));
+
+        // }
+
+        // for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+        //     const auto * weight = get_weight(ggml_get_name(cur));
+        //     if (weight == nullptr) {
+        //         // this can happen with split experts models
+        //         continue;
+        //     }
+
+        //     if (progress_callback) {
+        //         if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+        //             return false;
+        //         }
+        //     }
+
+        //     size_t n_size = ggml_nbytes(cur);
+
+        //     if (use_mmap) {
+        //         const auto & mapping = mappings.at(weight->idx);
+        //         ggml_backend_buffer_t buf_mmap = nullptr;
+        //         if (bufs_mmap.count(weight->idx)) {
+        //             buf_mmap = bufs_mmap.at(weight->idx);
+        //         }
+        //         uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
+
+        //         if (check_tensors) {
+        //             validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
+        //                 return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
+        //             }));
+        //         }
+        //         // TODO LORA allocation of base tensors
+        //         GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
+        //         if (buf_mmap && cur->data == nullptr) {
+        //             ggml_backend_tensor_alloc(buf_mmap, cur, data);
+        //             if (lmlocks) {
+        //                 const auto & lmlock = lmlocks->at(weight->idx);
+        //                 lmlock->grow_to(weight->offs + n_size);
+        //             }
+
+        //             auto & mmap_used = mmaps_used[weight->idx];
+        //             mmap_used.first  = std::min(mmap_used.first,  weight->offs);
+        //             mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
+        //         } else {
+        //             ggml_backend_tensor_set(cur, data, 0, n_size);
+
+
+
+        } /// LORA
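
A minimal sketch (not part of the diff above) of the allocate-then-upload flow the TODO comments are circling: keep the lora tensors in a `no_alloc` ggml context, allocate them into the target backend with `ggml_backend_alloc_ctx_tensors_from_buft`, then copy the host-side weights in with `ggml_backend_tensor_set` (`ggml_backend_tensor_alloc` is for placing a tensor at a specific address inside an existing buffer, e.g. the mmap-backed path). The helper name `upload_lora_tensors` and the `host_data` map are illustrative assumptions, not existing llama.cpp code; only the ggml-backend calls are real API.

```cpp
#include <cstdint>
#include <map>
#include <string>
#include <vector>

#include "ggml.h"
#include "ggml-backend.h"

// Sketch: copy lora weights that were already read into host memory into the backend buffer
// backing the tensors of `lora_ctx` (e.g. a Metal buffer when running with -ngl > 0).
static bool upload_lora_tensors(struct ggml_context * lora_ctx,
                                ggml_backend_buffer_type_t buft,
                                const std::map<std::string, std::vector<uint8_t>> & host_data) {
    // 1. give every tensor in the context backing storage in the chosen backend buffer type
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, buft);
    if (buf == nullptr) {
        return false; // out of memory or unsupported buffer type
    }

    // 2. the tensors now have backend storage, so the data is copied in with
    //    ggml_backend_tensor_set rather than ggml_backend_tensor_alloc
    for (struct ggml_tensor * t = ggml_get_first_tensor(lora_ctx); t != NULL; t = ggml_get_next_tensor(lora_ctx, t)) {
        const auto it = host_data.find(ggml_get_name(t));
        if (it == host_data.end() || it->second.size() != ggml_nbytes(t)) {
            return false; // missing or mismatched weight data
        }
        ggml_backend_tensor_set(t, it->second.data(), 0, ggml_nbytes(t));
    }
    return true;
}
```

If the lora tensors end up in the same backend as the base weights, `lora_mul_mat` can combine them on the GPU, which is exactly what the `buffer is nil` error above says is not happening yet.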