Metal running (still buffer issues)

parent 5c4ba81933
commit 028d3f7c89

3 changed files with 414 additions and 10 deletions
BRANCH_SETUP.md (254 changed lines)

@@ -36,6 +36,8 @@ Run main with base model and lora adapter to hot-swap

```
-n 128
```
Working, but: `ggml_metal_get_buffer: error: tensor 'blk.16.attn_v.weight.loraB' buffer is nil`

With `ngl > 0` the code breaks, most likely because the LoRA tensors interact with the base tensors (as in `lora_mul_mat`) while they are never moved into the GPU buffer that holds the base tensors.
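The fix this points at is to allocate the adapter tensors with the same backend buffer type that the offloaded base weights use, before any graph mixes the two. A minimal sketch of the idea, assuming a hypothetical `lora_ctx` that holds the loraA/loraB tensors and a `use_metal` flag derived from `ngl` (this is an illustration, not this branch's code):

```cpp
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-metal.h"   // assumes a build with GGML_USE_METAL

#include <cstdio>

// Hypothetical helper: place every tensor of the LoRA context in the same kind of
// buffer as the base model, so mixed base/LoRA mat-muls see the same backend.
static ggml_backend_buffer_t alloc_lora_tensors(struct ggml_context * lora_ctx, bool use_metal) {
    ggml_backend_buffer_type_t buft = use_metal ? ggml_backend_metal_buffer_type()
                                                : ggml_backend_cpu_buffer_type();

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, buft);
    if (buf == NULL) {
        fprintf(stderr, "failed to allocate LoRA tensors in a %s buffer\n", ggml_backend_buft_name(buft));
    }
    return buf;
}
```

After this, the adapter weights still have to be copied into the buffer (e.g. with `ggml_backend_tensor_set`), since a `no_alloc` context has no host data of its own.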
# Logic

@@ -47,4 +49,254 @@ With `ngl > 0` the code breaks. Probably because the Lora tensors try to interac
- Only one LoRA adapter can be passed.
- The adapter is applied only to the Q, K, V matrices to keep the code contained (the fine-tuning trained LoRA tensors for all linear layers); see the sketch after this list.
- GPU not supported.
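For reference, the per-projection composition this aims at is the usual LoRA update y = W·x + scaling·(B·(A·x)) with scaling = alpha/r. A minimal sketch in ggml graph terms, with hypothetical names (`w` for a base Q/K/V weight, `loraA`/`loraB` for the adapter pair); this illustrates the shape of `lora_mul_mat`, it is not the branch's exact implementation:

```cpp
// Hypothetical sketch of a LoRA-augmented projection built with ggml ops.
// cur: input activations, w: base weight, loraA/loraB: adapter pair, scaling = alpha / r.
static struct ggml_tensor * lora_mul_mat_sketch(struct ggml_context * ctx,
                                                struct ggml_tensor  * w,
                                                struct ggml_tensor  * loraA,
                                                struct ggml_tensor  * loraB,
                                                struct ggml_tensor  * cur,
                                                float                 scaling) {
    struct ggml_tensor * base  = ggml_mul_mat(ctx, w,     cur);   // W * x
    struct ggml_tensor * down  = ggml_mul_mat(ctx, loraA, cur);   // A * x  (rank r)
    struct ggml_tensor * up    = ggml_mul_mat(ctx, loraB, down);  // B * (A * x)
    struct ggml_tensor * delta = ggml_scale(ctx, up, scaling);    // scaling * (B * A * x)
    return ggml_add(ctx, base, delta);                            // W*x + scaling * B*A*x
}
```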
# Tutorial
```cpp
#include "llama.h"

#include "unicode.h"

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#ifdef GGML_USE_RPC
#  include "ggml-rpc.h"
#endif

#ifdef GGML_USE_CUDA
#  include "ggml-cuda.h"
#elif defined(GGML_USE_VULKAN)
#  include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
#  include "ggml-sycl.h"
#elif defined(GGML_USE_KOMPUTE)
#  include "ggml-kompute.h"
#endif

#ifdef GGML_USE_METAL
#  include "ggml-metal.h"
#endif

// TODO: replace with ggml API call
#define QK_K 256

#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#include <fcntl.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
#endif
#endif
#endif

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif

#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <cfloat>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <forward_list>
#include <fstream>
#include <functional>
#include <future>
#include <initializer_list>
#include <locale>
#include <map>
#include <memory>
#include <mutex>
#include <numeric>
#include <queue>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <stdexcept>
#include <thread>
#include <type_traits>
#include <unordered_map>

#include "ggml-metal.h"

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif

#define LLAMA_MAX_NODES   8192
#define LLAMA_MAX_EXPERTS 160

int main() {
    // no_alloc = true: only the tensor metadata lives in this context,
    // the data is allocated later through ggml-backend.
    struct ggml_init_params params = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = true,
    };

    // The library allows the user to define a certain function using the available tensor operations. This function
    // definition is represented internally via a computation graph. Each tensor operation in the function definition
    // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
    // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
    // using one of the available optimization algorithms.
    //
    // For example, here we define the function: f(x) = a*x^2 + b

    // memory allocation happens here
    // Create the context, allocating the metadata pool
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);

    ggml_set_param(ctx, x); // x is an input variable

    struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
    struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);

    // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
    if (buf == nullptr) {
        throw std::runtime_error("unable to allocate backend buffer");
    }
    ggml_used_mem(ctx);

    // llama_default_buffer_type_offload(model, layer_gpu); used in llama.cpp
    // How to check which buffer the context is allocated in? Can we look at single tensors?
    // Option: check how it is initialized for the base model. (See the sketch after this block.)

    // Try this:
    // You can simplify all of this for testing. If you are using CPU only, just run with -ngl 0
    // and allocate everything in a CPU buffer by using
    //   ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    // or run with -ngl 99 and use a Metal buffer type instead with
    //   ggml_backend_metal_buffer_type()
    // It will still run if you allocate the tensors in the wrong buffer type as long as you use ggml-backend
    // to allocate the tensors, it will just be slower.

    // Notice that the function definition above does not involve any actual computation. The computation is performed only
    // when the user explicitly requests it. For example, to compute the function's value at x = 2.0:

    ggml_build_forward_expand(gf, f);

    // set the input variable and parameter values
    ggml_set_f32(x, 2.0f);
    ggml_set_f32(a, 3.0f);
    ggml_set_f32(b, 4.0f);

    ggml_graph_compute_with_ctx(ctx, gf, 1);

    printf("f = %f\n", ggml_get_f32_1d(f, 0));

    // The actual computation is performed in the ggml_graph_compute() function.
    //
    // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
    // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
    // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
    // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
    // actually needed.
    //
    // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
    // differentiation and optimization algorithms.
    //
    // The described approach allows to define the function graph once and then compute its forward or backward graphs
    // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
    // the user can avoid the memory allocation overhead at runtime.
    //
    // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
    // citizens, but in theory the library can be extended to support FP8 and integer data types.
    //
    // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
    // and binary operations. Most of the available operations fall into one of these two categories. With time, it became
    // clear that the library needs to support more complex operations. The way to support these operations is not clear
    // yet, but a few examples are demonstrated in the following operations:
    //
    //   - ggml_permute()
    //   - ggml_conv_1d_1s()
    //   - ggml_conv_1d_2s()
    //
    // For each tensor operator, the library implements a forward and backward computation function. The forward function
    // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
    // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
    // calculus class, or watch the following video:
    //
    //   What is Automatic Differentiation?
    //   https://www.youtube.com/watch?v=wG_nF1awSSY

    // ## Tensor data (struct ggml_tensor)
    //
    // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
    // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
    // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:

    struct ggml_tensor * c = ggml_add(ctx, a, b);

    assert(c->src[0] == a);
    assert(c->src[1] == b);

    // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
    // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
    // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
    // permutation. All tensor operations have to take the stride into account and not assume that the tensor is
    // contiguous in memory.

    // The data of the tensor is accessed via the "data" pointer. For example:

    const int nx = 2;
    const int ny = 3;

    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            *(float *) ((char *) A->data + y*A->nb[1] + x*A->nb[0]) = x + y;
        }
    }

    //
    // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
    //
}
```
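For the values set above, the program should print f = 3*2^2 + 4 = 16. To answer the question in the comments about which buffer the context ended up in, one option is to walk the context after allocation and look at each tensor's `buffer` field. A minimal sketch (the helper name is made up; `ggml_backend_buffer_name`, `ggml_get_first_tensor` and `ggml_get_next_tensor` are existing ggml APIs):

```cpp
#include "ggml.h"
#include "ggml-backend.h"

#include <cstdio>

// Hypothetical helper: report where every tensor of a context was allocated.
// After ggml_backend_alloc_ctx_tensors_from_buft() each tensor should have a
// non-NULL buffer; a NULL buffer is the condition behind errors such as
// "tensor '...' buffer is nil" on Metal.
static void print_tensor_buffers(struct ggml_context * ctx) {
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        if (t->buffer == NULL) {
            printf("%-40s -> no buffer (not allocated)\n", t->name);
        } else {
            printf("%-40s -> %s\n", t->name, ggml_backend_buffer_name(t->buffer));
        }
    }
}
```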
@@ -117,7 +117,74 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
    LOG_TEE("%s", text);
}

#include "ggml-metal.h"

// true if ptr lies inside [buffer_start, buffer_start + buffer_size)
bool is_pointer_in_buffer_range(void *ptr, void *buffer_start, size_t buffer_size) {
    return (ptr >= (char*)buffer_start) && (ptr < ((char*)buffer_start + buffer_size));
}

// Walk all tensors of a context and check that their data pointers fall inside
// the given backend buffer.
void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t buffer, size_t buffer_size) {
    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        if (t->data != NULL) {
            // compare against the buffer's base address, not the buffer handle itself
            if (!is_pointer_in_buffer_range(t->data, ggml_backend_buffer_get_base(buffer), buffer_size)) {
                fprintf(stderr, "Tensor %s is not within the allocated buffer range.\n", t->name);
            } else {
                printf("Tensor %s is correctly allocated in the buffer.\n", t->name);
            }
        }
    }
}
int main(int argc, char ** argv) {

    // The library allows the user to define a certain function using the available tensor operations. This function
    // definition is represented internally via a computation graph. Each tensor operation in the function definition
    // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
    // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
    // using one of the available optimization algorithms.
    //
    // For example, here we define the function: f(x) = a*x^2 + b

    // memory allocation happens here
    // Create the context, allocating the metadata pool
    struct ggml_init_params _params = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
        .no_alloc   = true,
    };
    struct ggml_context * _ctx = ggml_init(_params);

    struct ggml_tensor * x = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);

    // ggml_set_param(_ctx, x); // x is an input variable

    // struct ggml_tensor * a  = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
    // struct ggml_tensor * b  = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
    // struct ggml_tensor * x2 = ggml_mul(_ctx, x, x);
    // struct ggml_tensor * f  = ggml_add(_ctx, ggml_mul(_ctx, a, x2), b);

    // struct ggml_cgraph * gf = ggml_new_graph(_ctx);

    // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_cpu_buffer_type());
    // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
    if (buf == nullptr) {
        throw std::runtime_error("unable to allocate backend buffer");
    } else {
        // size of the buffer that was actually allocated for this context
        size_t buffer_size = ggml_backend_buffer_get_size(buf);

        // Verify tensor allocations
        verify_tensor_allocation(_ctx, buf, buffer_size);
    }
    ggml_used_mem(_ctx);

    gpt_params params;
    g_params = &params;
llama.cpp (103 changed lines)
@@ -307,6 +307,11 @@ static struct lora_data * load_lora(struct lora_info * info) {
        tensors_offset.push_back(offset);
        file.seek(nbytes, SEEK_CUR);
    }

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(result->ctx, ggml_backend_metal_buffer_type());
    if (!buf) {
        LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__);
    }
    // read tensor data
    result->data.resize(total_nbytes_pad);
    size_t data_offset = 0;
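With the adapter tensors now allocated through ggml-backend, the bytes read into `result->data` below are presumably no longer what the tensors point at; they have to be copied into the backend buffer explicitly. A minimal sketch of that copy step, with hypothetical names (`lora_tensor` for one tensor created in `result->ctx`, `file_data`/`nbytes` for the bytes read at its offset):

```cpp
// Hedged sketch: upload adapter weights read from disk into the backend buffer
// (Metal or CPU) that now backs the tensor. ggml_backend_tensor_set() works for
// any backend buffer, so the same call covers the -ngl 0 path as well.
static void upload_lora_tensor(struct ggml_tensor * lora_tensor, const void * file_data, size_t nbytes) {
    GGML_ASSERT(nbytes == ggml_nbytes(lora_tensor));
    ggml_backend_tensor_set(lora_tensor, file_data, 0, nbytes);
}
```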
@@ -3922,7 +3927,7 @@ struct llama_model_loader {

        std::vector<no_init<uint8_t>> read_buf;
        std::vector<std::future<std::pair<ggml_tensor *, bool>>> validation_result;

        // Allocate tensor data to the buffer
        for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
            const auto * weight = get_weight(ggml_get_name(cur));
            if (weight == nullptr) {

@@ -3951,7 +3956,7 @@ struct llama_model_loader {
                        return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
                    }));
                }

                // TODO LORA allocation of base tensors
                GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
                if (buf_mmap && cur->data == nullptr) {
                    ggml_backend_tensor_alloc(buf_mmap, cur, data);

@@ -5392,7 +5397,7 @@ static bool llm_load_tensors(
    auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };

    model.layers.resize(n_layer);

    // main players: model, ml, ctx_input/output, tn (builds the tensor name)
    const auto tn = LLM_TN(model.arch);
    switch (model.arch) {
        case LLM_ARCH_LLAMA:

@@ -6666,7 +6671,7 @@ static bool llm_load_tensors(
#endif
        }
    }
#ifdef GGML_USE_METAL // LORA: use Metal on the base tensors
    else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) {
        for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
            const size_t max_size = ggml_get_max_tensor_size(ctx);
@@ -16341,12 +16346,92 @@ struct llama_context * llama_new_context_with_model(
        // Assign data
        ctx->llora_data = *loras[0];

        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type());
        if (!buf) {
            LLAMA_LOG_ERROR("%s: failed to allocate buffer for lora tensors\n", __func__);
        }

        // Looks like this worked, but we still need to check that the tensors got the new buffer
        // (not sure below; see the sketch at the end of this page).
        // Also, do we need to set the tensors? It is not clear where the data is: it looks like it is loaded
        // after the tensor creation in the context, but loaded where? Because if the data is already present,
        // it has to be set with ggml_backend_tensor_set instead of ggml_backend_tensor_alloc.

        // TODO: it looks like load_lora already gives me a context; understand whether I am actually using it.
        // If that context is set to the right buffer with
        //   ggml_backend_alloc_ctx_tensors_from_buft((ctx->llora_data).ctx, ggml_backend_metal_buffer_type());
        // the tensors should already have been created in the context.
        // Understand where the weights are loaded instead, and load the weights/data into the context.
        // Maybe check how the finetuning example manages the lora weights.

        // build the map? TODO LORA: the ctx->lora_weights_map layers seem to have no buffer type, but they should, as in the simple example
        ctx->lora_weights_map = get_lora_weights_map_cpp((ctx->llora_data).ctx);
        // std::vector<std::string> keys;
        // for (const auto& pair : ctx->lora_weights_map) {
        //     keys.push_back(pair.first);
        //
        //     ggml_tensor * tensorA = pair.second.loraA;
        //     ggml_tensor * tensorB = pair.second.loraB;
        //
        //     ggml_tensor * tensorA_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorA->type, 4, tensorA->ne);
        //     ggml_tensor * tensorB_ctx = ggml_new_tensor((ctx->llora_data).ctx, tensorB->type, 4, tensorB->ne);
        // }

        // for (struct ggml_tensor * cur = ggml_get_first_tensor((ctx->llora_data).ctx); cur != NULL; cur = ggml_get_next_tensor((ctx->llora_data).ctx, cur)) {
        //     const auto * name = ggml_get_name(cur);
        //     // ggml_backend_tensor_set(tensorA, tensorA->data, 0, ggml_nbytes(tensorA));
        //     // ggml_backend_tensor_set(tensorB, tensorB->data, 0, ggml_nbytes(tensorB));
        // }

        // for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
        //     const auto * weight = get_weight(ggml_get_name(cur));
        //     if (weight == nullptr) {
        //         // this can happen with split experts models
        //         continue;
        //     }
        //
        //     if (progress_callback) {
        //         if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
        //             return false;
        //         }
        //     }
        //
        //     size_t n_size = ggml_nbytes(cur);
        //
        //     if (use_mmap) {
        //         const auto & mapping = mappings.at(weight->idx);
        //         ggml_backend_buffer_t buf_mmap = nullptr;
        //         if (bufs_mmap.count(weight->idx)) {
        //             buf_mmap = bufs_mmap.at(weight->idx);
        //         }
        //         uint8_t * data = (uint8_t *) mapping->addr + weight->offs;
        //
        //         if (check_tensors) {
        //             validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] {
        //                 return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size));
        //             }));
        //         }
        //         // TODO LORA allocation of base tensors
        //         GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated
        //         if (buf_mmap && cur->data == nullptr) {
        //             ggml_backend_tensor_alloc(buf_mmap, cur, data);
        //             if (lmlocks) {
        //                 const auto & lmlock = lmlocks->at(weight->idx);
        //                 lmlock->grow_to(weight->offs + n_size);
        //             }
        //
        //             auto & mmap_used = mmaps_used[weight->idx];
        //             mmap_used.first  = std::min(mmap_used.first,  weight->offs);
        //             mmap_used.second = std::max(mmap_used.second, weight->offs + n_size);
        //         } else {
        //             ggml_backend_tensor_set(cur, data, 0, n_size);

}

/// LORA
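A minimal sketch of the check referred to above, assuming only what the diff shows about `lora_weights_map` (string keys mapping to a pair of `loraA`/`loraB` tensors); it reports which adapter tensors still have no backend buffer, which is exactly the condition behind `ggml_metal_get_buffer: error: tensor 'blk.16.attn_v.weight.loraB' buffer is nil`:

```cpp
// Hedged sketch: after allocating (ctx->llora_data).ctx through ggml-backend, walk
// the lora weight map and report tensors that still have no buffer. Field names
// follow the commented-out code above (pair.second.loraA / pair.second.loraB);
// the map type itself is an assumption.
for (const auto & pair : ctx->lora_weights_map) {
    for (struct ggml_tensor * t : { pair.second.loraA, pair.second.loraB }) {
        if (t == NULL || t->buffer == NULL) {
            LLAMA_LOG_ERROR("%s: lora tensor %s has no backend buffer\n", __func__, t ? t->name : "(null)");
        } else {
            LLAMA_LOG_INFO("%s: lora tensor %s -> %s\n", __func__, t->name, ggml_backend_buffer_name(t->buffer));
        }
    }
}
```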