From 6597a72c1d09544861803ebb5d1fd6265066fe0d Mon Sep 17 00:00:00 2001
From: Lorenzo Toniazzi
Date: Sun, 7 Jul 2024 22:09:30 +0100
Subject: [PATCH] Remove files

---
 _BRANCH_SETUP.md       | 348 -----------------------------------------
 data/hot-lora.txt      |   2 +-
 examples/main/main.cpp |   2 -
 llama.cpp              |  21 +--
 4 files changed, 5 insertions(+), 368 deletions(-)
 delete mode 100644 _BRANCH_SETUP.md

diff --git a/_BRANCH_SETUP.md b/_BRANCH_SETUP.md
deleted file mode 100644
index 7cdb8ae6a..000000000
--- a/_BRANCH_SETUP.md
+++ /dev/null
@@ -1,348 +0,0 @@
-# Setup this branch
-
-## Create a LoRA adapter bin file
-
-1. `mkdir models/open-llama` and download [Open-llama (all files)](https://huggingface.co/openlm-research/open_llama_3b_v2/tree/main) into the folder `./models/open-llama`
-
-2. `mkdir data && touch data/hot-lora.txt` and write a couple of words in it.
-
-3. Run:
-    ```bash
-    # Convert base model to gguf
-    python3 convert-hf-to-gguf.py models/open-llama/
-    # Quantize base model
-    ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q8_0.gguf Q8_0
-    # Obtain LoRA adapter
-    ./finetune --model-base models/open-llama/ggml-model-q8_0.gguf \
-    --checkpoint-in models/open-llama/chk-lora-ggml-model-q8_0-hot-lora-LATEST.gguf \
-    --checkpoint-out models/open-llama/chk-lora-ggml-model-q8_0-hot-lora-ITERATION.gguf \
-    --lora-out models/open-llama/lora-ggml-model-q8_0-hot-lora-ITERATION.bin \
-    --train-data "data/hot-lora.txt" \
-    --save-every 1 \
-    --threads 1 \
-    --adam-iter 1 \
-    --batch 1 \
-    --ctx 16 \
-    --use-checkpointing
-    ```
-
-## Run main with adapter
-
-Run main with the base model and a LoRA adapter to hot-swap:
-```bash
-./main -m ./models/open-llama/ggml-model-f16.gguf \
---hot-lora models/open-llama/lora-ggml-model-q8_0-hot-lora-LATEST.bin \
--ngl 99 \
--n 128
-```
-Run main with the base model only (no adapter is loaded):
-```bash
-./main -m ./models/open-llama/ggml-model-f16.gguf \
--ngl 99 \
--n 128
-```
-
-# Logic
-
-
-# Current status
-
-- Only one LoRA adapter can be passed.
-- Applying the adapter only to the Q, K and V matrices, to keep the code contained (the finetune example trains LoRA tensors for all linear layers)
-- GPU not supported
-
-
-# Tutorial
-
-```cpp
-#include "llama.h"
-
-#include "unicode.h"
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_RPC
-#  include "ggml-rpc.h"
-#endif
-
-#ifdef GGML_USE_CUDA
-#  include "ggml-cuda.h"
-#elif defined(GGML_USE_VULKAN)
-#  include "ggml-vulkan.h"
-#elif defined(GGML_USE_SYCL)
-#  include "ggml-sycl.h"
-#elif defined(GGML_USE_KOMPUTE)
-#  include "ggml-kompute.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#  include "ggml-metal.h"
-#endif
-
-// TODO: replace with ggml API call
-#define QK_K 256
-
-#ifdef __has_include
-    #if __has_include(<unistd.h>)
-        #include <unistd.h>
-        #if defined(_POSIX_MAPPED_FILES)
-            #include <sys/mman.h>
-            #include <fcntl.h>
-        #endif
-        #if defined(_POSIX_MEMLOCK_RANGE)
-            #include <sys/resource.h>
-        #endif
-    #endif
-#endif
-
-#if defined(_WIN32)
-    #define WIN32_LEAN_AND_MEAN
-    #ifndef NOMINMAX
-        #define NOMINMAX
-    #endif
-    #include <windows.h>
-    #ifndef PATH_MAX
-        #define PATH_MAX MAX_PATH
-    #endif
-    #include <io.h>
-#endif
-
-// ... (standard C/C++ library includes elided; the angle-bracketed header names are not recoverable) ...
-#include "ggml-metal.h"
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#ifdef __GNUC__
-#ifdef __MINGW32__
-#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-#else
-#define LLAMA_ATTRIBUTE_FORMAT(...)
-#endif
-
-#define LLAMA_MAX_NODES   8192
-#define LLAMA_MAX_EXPERTS 160
-
-
-int main() {
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ 16*1024*1024,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true,
-    };
-
-    // The library allows the user to define a certain function using the available tensor operations. This function
-    // definition is represented internally via a computation graph. Each tensor operation in the function definition
-    // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
-    // function's value and/or its gradient with respect to the input variables. Optionally, the function can be
-    // optimized using one of the available optimization algorithms.
-    //
-    // For example, here we define the function: f(x) = a*x^2 + b
-
-    // memory allocation happens here
-    // Create context, allocating memory
-    struct ggml_context * ctx = ggml_init(params);
-
-    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-
-    ggml_set_param(ctx, x); // x is an input variable
-
-    struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-    struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-    struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
-    struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
-
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-
-    // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
-    // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
-    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
-    if (buf == nullptr) {
-        throw std::runtime_error("unable to allocate backend buffer");
-    }
-    ggml_used_mem(ctx);
-
-    // llama_default_buffer_type_offload(model, layer_gpu); used in llama.cpp
-    // How to check in which buffer the context was allocated?
-    // Option: look at single tensors, and check how they are initialized in the base model.
-
-    // Try this:
-    // You can simplify all of this for testing: if you are using CPU only, just run with -ngl 0
-    // and allocate everything in a CPU buffer by using
-    //     ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
-    // or run with -ngl 99 and use a Metal buffer type instead with
-    //     ggml_backend_metal_buffer_type()
-    // It will still run if you allocate the tensors in the wrong buffer type, as long as you use ggml-backend
-    // to allocate the tensors; it will just be slower.
-
-    // Notice that the function definition above does not involve any actual computation. The computation is performed only
-    // when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
-
-    ggml_build_forward_expand(gf, f);
-
-    // set the input variable and parameter values
-    ggml_set_f32(x, 2.0f);
-    ggml_set_f32(a, 3.0f);
-    ggml_set_f32(b, 4.0f);
-
-    ggml_graph_compute_with_ctx(ctx, gf, 1);
-
-    printf("f = %f\n", ggml_get_f32_1d(f, 0));
-
-    // The actual computation is performed in the ggml_graph_compute() function.
-    //
-    // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
-    // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
-    // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
-    // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
-    // actually needed.
-    //
-    // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
-    // differentiation and optimization algorithms.
-    //
-    // The described approach allows to define the function graph once and then compute its forward or backward graphs
-    // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
-    // the user can avoid the memory allocation overhead at runtime.
-    //
-    // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
-    // citizens, but in theory the library can be extended to support FP8 and integer data types.
-    //
-    // Each tensor operation produces a new tensor.
-    // Initially the library was envisioned to support only the use of unary and binary operations. Most of the
-    // available operations fall into one of these two categories. With time, it became clear that the library needs
-    // to support more complex operations. The way to support these operations is not clear yet, but a few examples
-    // are demonstrated in the following operations:
-    //
-    //   - ggml_permute()
-    //   - ggml_conv_1d_1s()
-    //   - ggml_conv_1d_2s()
-    //
-    // For each tensor operator, the library implements a forward and backward computation function. The forward function
-    // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
-    // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
-    // calculus class, or watch the following video:
-    //
-    //   What is Automatic Differentiation?
-    //   https://www.youtube.com/watch?v=wG_nF1awSSY
-
-    // ## Tensor data (struct ggml_tensor)
-    //
-    // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
-    // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
-    // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
-
-    struct ggml_tensor * c = ggml_add(ctx, a, b);
-
-    assert(c->src[0] == a);
-    assert(c->src[1] == b);
-
-    // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
-    // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
-    // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
-    // permutation. All tensor operations have to take the stride into account and not assume that the tensor is
-    // contiguous in memory.
-
-    // The data of the tensor is accessed via the "data" pointer. For example:
-
-    const int nx = 2;
-    const int ny = 3;
-
-    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
-
-    for (int y = 0; y < ny; y++) {
-        for (int x = 0; x < nx; x++) {
-            *(float *) ((char *) A->data + y*A->nb[1] + x*A->nb[0]) = x + y;
-        }
-    }
-
-    //
-    // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
-    //
-
-}
-```
-
-
-  ```bash
-  # Convert base model to gguf
-  python3 convert-hf-to-gguf.py models/open-llama/ && \
-  # Quantize base model
-  ./quantize ./models/open-llama/ggml-model-f16.gguf ./models/open-llama/ggml-model-q4.gguf Q4_K && \
-  # Obtain Lora adapter
-  ./finetune --model-base models/open-llama/ggml-model-q4.gguf \
-  --checkpoint-in models/open-llama/chk-lora-ggml-model-q4-hot-lora-LATEST.gguf \
-  --checkpoint-out models/open-llama/chk-lora-ggml-model-q4-hot-lora-ITERATION.gguf \
-  --lora-out models/open-llama/lora-ggml-model-q4-hot-lora-ITERATION.bin \
-  --train-data "data/hot-lora.txt" \
-  --save-every 1 \
-  --threads 1 \
-  --adam-iter 1 \
-  --batch 1 \
-  --ctx 16 \
-  --use-checkpointing
-  ```
-
-
-## 1. Run main with adapter
-
-- Run main with the base model and a LoRA adapter to hot-swap:
-    ```bash
-    ./main -m ./models/open-llama/ggml-model-q4.gguf \
-    --hot-lora models/open-llama/lora-ggml-model-q4-hot-lora-LATEST.bin \
-    -ngl 99 \
-    -n 128
-    ```
-
-- If the flag `--hot-lora` is not passed, the adapter is ignored:
-    ```bash
-    ./main -m ./models/open-llama/ggml-model-q4.gguf \
-    -ngl 99 \
-    -n 128
-    ```
-
-Build for debug:
-```bash
- make clean && make -j 8 LLAMA_DEBUG=1
-```
\ No newline at end of file
diff --git a/data/hot-lora.txt b/data/hot-lora.txt
index c43186710..e88891d2f 100644
--- a/data/hot-lora.txt
+++ b/data/hot-lora.txt
@@ -1,2 +1,2 @@
 
- how are you?
+test data to train adapter
\ No newline at end of file
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index ba76a496b..b97b7b793 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -117,9 +117,7 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
     LOG_TEE("%s", text);
 }
 
-
 int main(int argc, char ** argv) {
-
     gpt_params params;
     g_params = &params;
 
diff --git a/llama.cpp b/llama.cpp
index adfbb4828..8a5a71c77 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2544,7 +2544,6 @@ struct llama_context {
     }
 
     llama_cparams cparams;
-    bool lora_loaded = false;
     std::map lora_weights_map; // only one LoRA adapter at the moment
     lora_data llora_data;
    float lora_scale = 1.0f;
@@ -16309,7 +16308,7 @@ void llama_free_model(struct llama_model * model) {
 }
 
 
-static std::map get_lora_weights_map_cpp(struct ggml_context* ctx) {
+static std::map get_lora_weights_map(struct ggml_context* ctx) {
     struct lora_tensor_pair* pair = build_lora_weights_map(ctx);
 
     std::map map;
@@ -16370,7 +16369,7 @@ struct llama_context * llama_new_context_with_model(
         lora.scale = 1.0f; // redundant as already inside lora_context, but should be here for multiple loras?
         lora_params->lora.push_back(lora);
 
-        // load all loras
+        // load all loras (only 1 supported here)
         std::vector loras;
         for (size_t i = 0; i < lora_params->lora.size(); ++i) {
             struct lora_data * llora_data = load_lora(&lora_params->lora[i]);
@@ -16381,22 +16380,10 @@ struct llama_context * llama_new_context_with_model(
         if (loras.size() == 0) {
             fprintf(stderr, "warning: no lora adapters will be applied.\n");
         }
-        // Assign data
+        // Assign data and get mapping (index 0, as only 1 LoRA is supported now)
         ctx->llora_data = *loras[0];
-        ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
-        // std::vector keys;
-        // for (const auto& pair : ctx->lora_weights_map) {
-        //     keys.push_back(pair.first);
-
-        //     ggml_tensor * tensorA = pair.second.loraA;
-        //     ggml_tensor * tensorB = pair.second.loraB;
-
-        //     ggml_tensor * tensorA_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorA->type, 4, tensorA->ne);
-        //     ggml_tensor * tensorB_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorB->type, 4, tensorB->ne);
-
-        // }
+        ctx->lora_weights_map = get_lora_weights_map((ctx->llora_data).ctx);
     }
-    /// LORA load end
 
     const auto & hparams = model->hparams;
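
For context on how the pieces touched above fit together at inference time: `get_lora_weights_map` fills `lora_weights_map` with the adapter's A/B factor tensors, and the hot-swap idea is to add the low-rank correction `scale * (B * (A * x))` on top of the base matrix multiplication when the compute graph is built. The sketch below is illustrative only and is not part of this patch; the helper name `apply_hot_lora` and its parameter layout are assumptions, while `ggml_mul_mat`, `ggml_scale` and `ggml_add` are standard ggml graph operations.

```cpp
#include "ggml.h"

// Illustrative sketch (not from this patch): build `base + scale * (B * (A * x))` in the graph,
// where loraA/loraB are the factor tensors stored for one weight in lora_weights_map and
// scale corresponds to lora_scale. The helper name is hypothetical.
static struct ggml_tensor * apply_hot_lora(
        struct ggml_context * ctx,
        struct ggml_tensor  * base,   // base result, e.g. ggml_mul_mat(ctx, model.layers[il].wq, cur)
        struct ggml_tensor  * x,      // the same input activations the base matmul consumed
        struct ggml_tensor  * loraA,  // low-rank factor A of the adapter for this weight
        struct ggml_tensor  * loraB,  // low-rank factor B of the adapter for this weight
        float                 scale)  // adapter scaling factor
{
    // project the input through the low-rank factors: first (A * x), then B * (A * x)
    struct ggml_tensor * ax  = ggml_mul_mat(ctx, loraA, x);
    struct ggml_tensor * bax = ggml_mul_mat(ctx, loraB, ax);

    // add the scaled low-rank correction to the base result
    return ggml_add(ctx, base, ggml_scale(ctx, bax, scale));
}
```

Under the current status described in the deleted notes, such a correction would be added right after the Q, K and V matrix multiplications in the graph build, with `loraA`/`loraB` looked up by tensor name in `ctx->lora_weights_map` and the scale taken from `ctx->lora_scale`.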