llama : rewrite lora with ggml-backend and compute on CPU
ggml-ci
This commit is contained in:
parent
4813e17548
commit
11583c1462
7 changed files with 256 additions and 145 deletions
|
@ -948,11 +948,7 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
|
|||
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
|
||||
// TODO: merge passes
|
||||
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
// reset state
|
||||
size_t hash_size = sched->hash_set.size;
|
||||
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
||||
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
||||
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
||||
// reset splits
|
||||
sched->n_splits = 0;
|
||||
|
||||
struct ggml_init_params params = {
|
||||
|
@ -961,11 +957,13 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|||
/* .no_alloc = */ true
|
||||
};
|
||||
|
||||
if (sched->ctx != NULL) {
|
||||
ggml_free(sched->ctx);
|
||||
}
|
||||
ggml_free(sched->ctx);
|
||||
|
||||
sched->ctx = ggml_init(params);
|
||||
if (sched->ctx == NULL) {
|
||||
fprintf(stderr, "%s: failed to initialize context\n", __func__);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// pass 1: assign backends to ops with allocated inputs
|
||||
for (int i = 0; i < graph->n_leafs; i++) {
|
||||
|
@ -1309,13 +1307,23 @@ static void sched_reset(ggml_backend_sched_t sched) {
|
|||
for (int i = 0; i < sched->n_backends; i++) {
|
||||
ggml_tallocr_reset(sched->tallocs[i]);
|
||||
}
|
||||
// reset state for the next run
|
||||
size_t hash_size = sched->hash_set.size;
|
||||
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
|
||||
memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
|
||||
memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
|
||||
}
|
||||
|
||||
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
|
||||
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size) {
|
||||
GGML_ASSERT(n_backends > 0);
|
||||
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
|
||||
|
||||
struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
|
||||
memset(sched, 0, sizeof(struct ggml_backend_sched));
|
||||
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
|
||||
|
||||
// initialize hash table
|
||||
sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
||||
sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
|
||||
sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
|
||||
|
||||
sched->n_backends = n_backends;
|
||||
for (int i = 0; i < n_backends; i++) {
|
||||
|
@ -1340,6 +1348,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|||
ggml_tallocr_free(sched->tallocs[i]);
|
||||
}
|
||||
ggml_gallocr_free(sched->galloc);
|
||||
ggml_free(sched->ctx);
|
||||
free(sched->hash_set.keys);
|
||||
free(sched->node_talloc);
|
||||
free(sched->node_copies);
|
||||
|
@ -1347,12 +1356,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
|
|||
}
|
||||
|
||||
void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
|
||||
// initialize hash tables
|
||||
size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
|
||||
sched->hash_set.size = hash_size;
|
||||
sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
|
||||
sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
|
||||
sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
|
||||
GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
|
||||
|
||||
sched_split_graph(sched, measure_graph);
|
||||
sched_alloc_splits(sched);
|
||||
|
@ -1368,7 +1372,8 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
|
|||
}
|
||||
|
||||
void ggml_backend_sched_graph_split(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
|
||||
GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
||||
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
|
||||
|
||||
sched_split_graph(sched, graph);
|
||||
}
|
||||
|
||||
|
@ -1385,17 +1390,17 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
|||
|
||||
ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
||||
int backend_index = sched_backend_prio(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
return sched->tallocs[backend_index];
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
|
||||
int backend_index = sched_backend_prio(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
|
||||
}
|
||||
|
||||
void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
|
||||
// FIXME: node_allocr is cleared when splitting the graph, so all user assignments are lost
|
||||
// to avoid this, we need to clear node_allocr after compute rather than before split
|
||||
int backend_index = sched_backend_prio(sched, backend);
|
||||
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
|
||||
node_allocr(node) = sched->tallocs[backend_index];
|
||||
|
|
|
@ -149,7 +149,7 @@ extern "C" {
|
|||
typedef struct ggml_backend_sched * ggml_backend_sched_t;
|
||||
|
||||
// Initialize a backend scheduler
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size);
|
||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
||||
// Initialize backend buffers from a measure graph
|
||||
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
|
||||
|
|
44
ggml-cuda.cu
44
ggml-cuda.cu
|
@ -9720,16 +9720,56 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
|
|||
}
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||
// split tensors must always be set in their entirety at once
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
|
||||
|
||||
const int64_t ne0 = tensor->ne[0];
|
||||
const size_t nb1 = tensor->nb[1];
|
||||
ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
|
||||
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
int64_t row_low, row_high;
|
||||
get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
|
||||
|
||||
int64_t nrows_split = row_high - row_low;
|
||||
if (nrows_split == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const size_t offset_split = row_low*nb1;
|
||||
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
||||
const size_t original_size = size;
|
||||
|
||||
// pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
|
||||
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
|
||||
}
|
||||
|
||||
char * buf_host = (char *)data + offset_split;
|
||||
//CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||
UNUSED(buffer);
|
||||
UNUSED(value);
|
||||
}
|
||||
|
||||
static struct ggml_backend_buffer_i ggml_cuda_backend_split_buffer_interface = {
|
||||
/* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
|
||||
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
|
||||
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
|
||||
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
|
||||
/* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
|
||||
/* .get_tensor = */ NULL,
|
||||
/* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
|
||||
/* .cpy_tensor_from = */ NULL,
|
||||
/* .cpy_tensor_to = */ NULL,
|
||||
/* .clear = */ NULL,
|
||||
/* .clear = */ ggml_backend_cuda_split_buffer_clear,
|
||||
};
|
||||
|
||||
// cuda split buffer type
|
||||
|
|
|
@ -228,6 +228,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
|||
#define GGML_HASHTABLE_FULL ((size_t)-1)
|
||||
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
|
||||
|
||||
struct ggml_hash_set ggml_hash_set_new(size_t size);
|
||||
|
||||
bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
|
||||
|
||||
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
||||
|
|
19
ggml.c
19
ggml.c
|
@ -4343,6 +4343,23 @@ struct ggml_tensor * ggml_cpy_inplace(
|
|||
return ggml_cpy_impl(ctx, a, b, true);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_cast(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_type type) {
|
||||
bool is_node = false;
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne);
|
||||
ggml_format_name(result, "%s (copy)", a->name);
|
||||
|
||||
result->op = GGML_OP_CPY;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = result;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// ggml_cont
|
||||
|
||||
static struct ggml_tensor * ggml_cont_impl(
|
||||
|
@ -14835,7 +14852,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso
|
|||
return i;
|
||||
}
|
||||
|
||||
static struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
||||
struct ggml_hash_set ggml_hash_set_new(size_t size) {
|
||||
size = ggml_hash_size(size);
|
||||
struct ggml_hash_set result;
|
||||
result.size = size;
|
||||
|
|
5
ggml.h
5
ggml.h
|
@ -1165,6 +1165,11 @@ extern "C" {
|
|||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_cast(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
enum ggml_type type);
|
||||
|
||||
// make contiguous
|
||||
GGML_API struct ggml_tensor * ggml_cont(
|
||||
struct ggml_context * ctx,
|
||||
|
|
284
llama.cpp
284
llama.cpp
|
@ -2451,8 +2451,11 @@ struct llama_model_loader {
|
|||
const size_t offs = file_offset(ggml_get_name(cur));
|
||||
|
||||
if (use_mmap && mapping) {
|
||||
GGML_ASSERT(cur->data == nullptr);
|
||||
cur->data = (uint8_t *)mapping->addr + offs;
|
||||
if (cur->data == nullptr) {
|
||||
cur->data = (uint8_t *)mapping->addr + offs;
|
||||
} else {
|
||||
memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
|
||||
}
|
||||
} else {
|
||||
GGML_ASSERT(cur->data != nullptr);
|
||||
file.seek(offs, SEEK_SET);
|
||||
|
@ -8769,48 +8772,23 @@ static int llama_apply_lora_from_file_internal(
|
|||
|
||||
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
||||
|
||||
// create a name -> tensor map of the model to accelerate lookups
|
||||
// find the max tensor size to estimate the required temporary buffer size
|
||||
size_t max_tensor_size = 0;
|
||||
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
||||
for (const auto & kv : model.tensors_by_name) {
|
||||
model_tensors.insert(kv);
|
||||
size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
|
||||
max_tensor_size = std::max(max_tensor_size, f32_size);
|
||||
}
|
||||
|
||||
// create a temporary ggml context to store the lora tensors
|
||||
// TODO: use ggml-alloc
|
||||
size_t lora_ctx_size = max_tensor_size * 3;
|
||||
LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
|
||||
std::vector<uint8_t> lora_buf(lora_ctx_size);
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = lora_buf.size();
|
||||
params.mem_buffer = lora_buf.data();
|
||||
params.no_alloc = false;
|
||||
|
||||
using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
|
||||
|
||||
unique_context lora_ctx(nullptr, ggml_free);
|
||||
lora_ctx.reset(ggml_init(params));
|
||||
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
||||
|
||||
// load base model
|
||||
std::unique_ptr<llama_model_loader> ml;
|
||||
|
||||
if (path_base_model) {
|
||||
if (path_base_model) {
|
||||
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
||||
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
||||
ml->init_mapping(false); // no prefetching
|
||||
ml->init_mapping(/*prefetch*/ false); // no prefetching
|
||||
}
|
||||
|
||||
// read tensors and apply
|
||||
bool warned = false;
|
||||
int n_tensors = 0;
|
||||
|
||||
std::vector<uint8_t> work_buffer;
|
||||
struct tensor_meta {
|
||||
std::string name;
|
||||
ggml_type type;
|
||||
int32_t ne[2];
|
||||
size_t offset;
|
||||
};
|
||||
std::map<std::string, tensor_meta> tensor_meta_map;
|
||||
|
||||
// load all tensor meta
|
||||
while (true) {
|
||||
if (fin.tell() == fin.size) {
|
||||
// eof
|
||||
|
@ -8823,7 +8801,7 @@ static int llama_apply_lora_from_file_internal(
|
|||
|
||||
fin.read_raw(&n_dims, sizeof(n_dims));
|
||||
fin.read_raw(&name_len, sizeof(name_len));
|
||||
fin.read_raw(&ftype, sizeof(ftype));
|
||||
fin.read_raw(&ftype, sizeof(ftype));
|
||||
|
||||
if (n_dims != 1 && n_dims != 2) {
|
||||
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
||||
|
@ -8837,31 +8815,23 @@ static int llama_apply_lora_from_file_internal(
|
|||
|
||||
std::string name;
|
||||
{
|
||||
GGML_ASSERT(name_len <= 1024);
|
||||
char buf[1024];
|
||||
GGML_ASSERT(name_len < GGML_MAX_NAME);
|
||||
char buf[GGML_MAX_NAME];
|
||||
fin.read_raw(buf, name_len);
|
||||
name = std::string(buf, name_len);
|
||||
}
|
||||
|
||||
// check for lora suffix and get the type of tensor
|
||||
const std::string lora_suffix = ".lora";
|
||||
size_t pos = name.rfind(lora_suffix);
|
||||
if (pos == std::string::npos) {
|
||||
// check for lora suffix
|
||||
std::string lora_suffix;
|
||||
if (name.length() > 6) {
|
||||
lora_suffix = name.substr(name.length() - 6);
|
||||
}
|
||||
if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
|
||||
LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::string lora_type = name.substr(pos + lora_suffix.length());
|
||||
std::string base_name = name;
|
||||
base_name.erase(pos);
|
||||
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
|
||||
|
||||
if (model_tensors.find(base_name) == model_tensors.end()) {
|
||||
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
||||
return 1;
|
||||
}
|
||||
|
||||
// create ggml tensor
|
||||
// tensor type
|
||||
ggml_type wtype;
|
||||
switch (ftype) {
|
||||
case 0: wtype = GGML_TYPE_F32; break;
|
||||
|
@ -8873,105 +8843,177 @@ static int llama_apply_lora_from_file_internal(
|
|||
return false;
|
||||
}
|
||||
}
|
||||
ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
|
||||
ggml_set_name(lora_tensor, name.c_str());
|
||||
|
||||
// load tensor data
|
||||
// data offset
|
||||
size_t offset = fin.tell();
|
||||
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
||||
offset = (offset + 31) & -32;
|
||||
fin.seek(offset, SEEK_SET);
|
||||
fin.read_raw(lora_tensor->data, tensor_data_size);
|
||||
|
||||
lora_tensors[name] = lora_tensor;
|
||||
// skip tensor data
|
||||
fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
|
||||
|
||||
// check if we have both A and B tensors and apply
|
||||
if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
|
||||
lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
|
||||
tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
|
||||
}
|
||||
|
||||
ggml_tensor * dest_t = model_tensors[base_name];
|
||||
bool warned = false;
|
||||
int n_tensors = 0;
|
||||
|
||||
// FIXME: ggml-backend
|
||||
// apply
|
||||
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
||||
if (backend_cpu == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
|
||||
|
||||
ggml_tensor * base_t;
|
||||
if (ml) {
|
||||
struct gguf_context * ctx_gguf = ml->ctx_gguf;
|
||||
std::vector<no_init<uint8_t>> read_buf;
|
||||
for (const auto & it : model.tensors_by_name) {
|
||||
const std::string & base_name = it.first;
|
||||
ggml_tensor * model_t = it.second;
|
||||
|
||||
// load from base model
|
||||
if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
|
||||
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
||||
return 1;
|
||||
}
|
||||
if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
|
||||
tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
base_t = ml->get_tensor_meta(base_name.c_str());
|
||||
ml->load_data_for(base_t);
|
||||
} else {
|
||||
base_t = dest_t;
|
||||
}
|
||||
tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
|
||||
tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
|
||||
|
||||
if (ggml_is_quantized(base_t->type)) {
|
||||
if (!warned) {
|
||||
LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
||||
"use a f16 or f32 base model with --lora-base\n", __func__);
|
||||
warned = true;
|
||||
}
|
||||
}
|
||||
ggml_init_params lora_init_params = {
|
||||
/* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
||||
/* .mem_buffer */ nullptr,
|
||||
/* .no_alloc */ true,
|
||||
};
|
||||
ggml_context * lora_ctx = ggml_init(lora_init_params);
|
||||
if (lora_ctx == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
|
||||
ggml_backend_free(backend_cpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
|
||||
GGML_ASSERT(loraA->type == GGML_TYPE_F32);
|
||||
ggml_set_name(loraA, "loraA");
|
||||
// create tensors
|
||||
ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
|
||||
ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
|
||||
ggml_set_name(loraA, metaA.name.c_str());
|
||||
ggml_set_name(loraB, metaB.name.c_str());
|
||||
|
||||
ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
|
||||
GGML_ASSERT(loraB->type == GGML_TYPE_F32);
|
||||
ggml_set_name(loraB, "loraB");
|
||||
|
||||
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
||||
LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
||||
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
||||
ggml_tensor * base_t;
|
||||
if (ml) {
|
||||
if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
|
||||
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
||||
return 1;
|
||||
}
|
||||
base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
|
||||
} else {
|
||||
base_t = ggml_dup_tensor(lora_ctx, model_t);
|
||||
}
|
||||
ggml_set_name(base_t, base_name.c_str());
|
||||
|
||||
// allocate in backend buffer
|
||||
ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
||||
if (lora_buf == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// load tensor data
|
||||
auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
|
||||
read_buf.resize(ggml_nbytes(tensor));
|
||||
fin.seek(tensor_meta.offset, SEEK_SET);
|
||||
fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
|
||||
ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
|
||||
};
|
||||
load_tensor(metaA, loraA);
|
||||
load_tensor(metaB, loraB);
|
||||
|
||||
// load base model tensor data
|
||||
if (ml) {
|
||||
ml->load_data_for(base_t);
|
||||
} else {
|
||||
ggml_backend_tensor_copy(model_t, base_t);
|
||||
}
|
||||
|
||||
if (ggml_is_quantized(base_t->type) && !warned) {
|
||||
LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
||||
"use a f16 or f32 base model with --lora-base\n", __func__);
|
||||
warned = true;
|
||||
}
|
||||
|
||||
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
||||
LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
||||
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
||||
ggml_free(lora_ctx);
|
||||
ggml_backend_buffer_free(lora_buf);
|
||||
ggml_backend_free(backend_cpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto build_lora_graph = [&]() {
|
||||
// w = w + BA*s
|
||||
ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
|
||||
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
||||
ggml_set_name(BA, "BA");
|
||||
|
||||
if (scaling != 1.0f) {
|
||||
BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
|
||||
BA = ggml_scale(lora_ctx, BA, scaling);
|
||||
ggml_set_name(BA, "BA_scaled");
|
||||
}
|
||||
|
||||
ggml_tensor * r;
|
||||
if (base_t == dest_t) {
|
||||
r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
|
||||
ggml_set_name(r, "r_add_inplace");
|
||||
}
|
||||
else {
|
||||
r = ggml_add(lora_ctx.get(), base_t, BA);
|
||||
ggml_set_name(r, "r_add");
|
||||
r = ggml_add_inplace(lora_ctx, base_t, BA);
|
||||
ggml_set_name(r, "r_add");
|
||||
|
||||
r = ggml_cpy(lora_ctx.get(), r, dest_t);
|
||||
ggml_set_name(r, "r_cpy");
|
||||
if (base_t->type != model_t->type) {
|
||||
// convert the result to the model type
|
||||
r = ggml_cast(lora_ctx, r, model_t->type);
|
||||
ggml_set_name(r, "r_cast");
|
||||
}
|
||||
|
||||
struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
|
||||
ggml_build_forward_expand(gf, r);
|
||||
return r;
|
||||
};
|
||||
|
||||
ggml_graph_compute_helper(work_buffer, gf, n_threads);
|
||||
ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
||||
ggml_tensor * r = build_lora_graph();
|
||||
ggml_build_forward_expand(gf, r);
|
||||
|
||||
// the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
|
||||
GGML_ASSERT(lora_tensors.size() == 2);
|
||||
ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
||||
if (graph_buf == nullptr) {
|
||||
LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
|
||||
ggml_free(lora_ctx);
|
||||
ggml_backend_buffer_free(lora_buf);
|
||||
ggml_backend_free(backend_cpu);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// we won't need these tensors again, reset the context to save memory
|
||||
lora_ctx.reset(ggml_init(params));
|
||||
lora_tensors.clear();
|
||||
ggml_backend_graph_compute(backend_cpu, gf);
|
||||
|
||||
n_tensors++;
|
||||
if (n_tensors % 4 == 0) {
|
||||
LLAMA_LOG_INFO(".");
|
||||
}
|
||||
ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
|
||||
|
||||
#if 0
|
||||
// TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
|
||||
//ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
|
||||
|
||||
// sched compute
|
||||
ggml_build_forward_expand(gf, build_graph());
|
||||
ggml_backend_sched_init_measure(sched, gf);
|
||||
|
||||
// create the graph again, since the previous one was destroyed by the measure
|
||||
ggml_graph_clear(gf);
|
||||
ggml_build_forward_expand(gf, build_graph());
|
||||
ggml_backend_sched_graph_compute(sched, gf);
|
||||
ggml_backend_sched_free(sched);
|
||||
#endif
|
||||
|
||||
ggml_backend_buffer_free(lora_buf);
|
||||
ggml_backend_buffer_free(graph_buf);
|
||||
ggml_free(lora_ctx);
|
||||
|
||||
n_tensors++;
|
||||
if (n_tensors % 4 == 0) {
|
||||
LLAMA_LOG_INFO(".");
|
||||
}
|
||||
}
|
||||
|
||||
ggml_backend_free(backend_cpu);
|
||||
|
||||
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
||||
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
||||
|
||||
|
@ -9269,7 +9311,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
// buffer used to store the computation graph and the tensor meta data
|
||||
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
||||
|
||||
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), ctx->backends.size());
|
||||
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), ctx->backends.size(), LLAMA_MAX_NODES);
|
||||
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
||||
|
||||
// build worst-case graph
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue