llama : refactor llama_model_loader (WIP)

wip : remove ggml_ctx from llama_model_loader

wip : merge gguf_file_loader in llama_model_loader
Author: Georgi Gerganov
Date:   2023-08-16 00:02:25 +03:00
Commit: 5339b859ec (parent 23248d7d32)
3 changed files with 215 additions and 256 deletions
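A rough usage sketch of the refactored loader, pieced together from the diff below: the gguf/ggml metadata contexts now live directly in llama_model_loader, and tensors are created in a caller-provided ggml context instead of an internal ggml_ctx member. The surrounding variables (fname, ctx, n_embd, n_vocab, progress_callback, ...) are placeholders for illustration, not code from this commit.

    // sketch only -- names taken from the patch below; calling code is simplified
    llama_model_loader ml(fname, /*use_mmap =*/ true);

    // per-tensor metadata is available via ml.ctx_gguf / ml.ctx_meta
    size_t ctx_size;
    size_t mmapped_size;
    ml.calc_sizes(ctx_size, mmapped_size);

    // tensors are created in the caller's ggml context
    struct ggml_tensor * tok_embd = ml.create_tensor(ctx, TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU);

    ml.done_getting_tensors(); // throws if n_created != n_tensors
    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, /*lmlock =*/ NULL);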

ggml.c (8 changes)

@@ -19065,14 +19065,6 @@ enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.arr.type;
 }
 
-int32_t gguf_get_arr_i32(struct gguf_context * ctx, int key_id, int i) {
-    return ((int32_t *) ctx->kv[key_id].value.arr.data)[i];
-}
-
-float gguf_get_arr_f32(struct gguf_context * ctx, int key_id, int i) {
-    return ((float *) ctx->kv[key_id].value.arr.data)[i];
-}
-
 const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.arr.data;
 }

ggml.h (1 change)

@@ -1499,7 +1499,6 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * tensor);
 
     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);

llama.cpp

@@ -993,227 +993,189 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     return buf;
 }
 
-struct llama_load_tensor {
-    std::string name;
-    enum ggml_type type = GGML_TYPE_F32;
-    std::vector<uint32_t> ne;
-    size_t file_off;
-    size_t size;
-    struct ggml_tensor * ggml_tensor = NULL;
-    uint8_t * data;
-};
-
-struct llama_load_tensors_map {
-    // tensors is kept in a separate vector to preserve file order
-    std::vector<llama_load_tensor> tensors;
-    std::unordered_map<std::string, size_t> name_to_idx;
-};
-
-struct llama_file_loader {
+struct llama_model_loader {
+    int n_tensors = 0;
+    int n_created = 0;
+
+    bool use_mmap = false;
+
     llama_file file;
-    gguf_context * ctx_gguf;
     llama_file_version file_version;
-
-    struct ggml_context * ctx_data = NULL;
-
-    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map) : file(fname, "rb") {
-        fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
-
-        struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_data,
-        };
-
-        ctx_gguf = gguf_init_from_file(fname, params);
-        file_version = (enum llama_file_version) gguf_get_version(ctx_gguf);
-
-        read_tensor_metadata(tensors_map);
-    }
-
-    void read_tensor_metadata(llama_load_tensors_map & tensors_map) const {
-        const int n_tensors = gguf_get_n_tensors(ctx_gguf);
-
-        for (int i = 0; i < n_tensors; ++i) {
-            llama_load_tensor tensor;
-            const char * name = gguf_get_tensor_name(ctx_gguf, i);
-
-            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-
-            const uint32_t n_dims = cur->n_dims;
-            tensor.type = cur->type;
-            tensor.ne.resize(n_dims);
-
-            for (uint32_t j = 0; j < n_dims; ++j) {
-                tensor.ne[j] = cur->ne[j];
-            }
-
-            if (n_dims < 1 || n_dims > 2) {
-                throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name, n_dims));
-            }
-
-            switch (tensor.type) {
-                case GGML_TYPE_F32:
-                case GGML_TYPE_F16:
-                case GGML_TYPE_Q4_0:
-                case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q5_0:
-                case GGML_TYPE_Q5_1:
-                case GGML_TYPE_Q8_0:
-                case GGML_TYPE_Q2_K:
-                case GGML_TYPE_Q3_K:
-                case GGML_TYPE_Q4_K:
-                case GGML_TYPE_Q5_K:
-                case GGML_TYPE_Q6_K:
-                    break;
-                default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
-                }
-            }
-
-            tensor.file_off = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i);
-
-            tensor.name = name;
-            tensor.size = ggml_nbytes(cur);
-            tensor.ggml_tensor = cur;
-
-            tensors_map.tensors.push_back(tensor);
-            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
-        }
-    }
-};
-
-struct llama_model_loader {
-    std::unique_ptr<llama_file_loader> file_loader;
-    llama_load_tensors_map tensors_map;
-    bool use_mmap;
-    size_t num_ggml_tensors_created = 0;
-    struct ggml_context * ggml_ctx = NULL;
+
     std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap) {
-        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
+    struct gguf_context * ctx_gguf = NULL;
+    struct ggml_context * ctx_meta = NULL;
+
+    llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx_meta,
+        };
+
+        ctx_gguf = gguf_init_from_file(fname.c_str(), params);
+
+        n_tensors = gguf_get_n_tensors(ctx_gguf);
+
+        file_version = (enum llama_file_version) gguf_get_version(ctx_gguf);
+
+        LLAMA_LOG_INFO("%s: loaded %d tensors from %s (version %s)\n",
+                __func__, n_tensors, fname.c_str(), llama_file_version_name(file_version));
+
         if (!llama_mmap::SUPPORTED) {
+            LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
             use_mmap = false;
         }
+
         this->use_mmap = use_mmap;
     }
 
+    const char * get_tensor_name(int i) const {
+        return gguf_get_tensor_name(ctx_gguf, i);
+    }
+
+    struct ggml_tensor * get_tensor_meta(int i) const {
+        return ggml_get_tensor(ctx_meta, get_tensor_name(i));
+    }
+
     void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const {
-        ctx_size_p = mmapped_size_p = 0;
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
+        ctx_size_p     = 0;
+        mmapped_size_p = 0;
+
+        for (int i = 0; i < n_tensors; i++) {
+            struct ggml_tensor * meta = get_tensor_meta(i);
             ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(lt.ggml_tensor);
+            (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta);
         }
     }
 
-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
-        struct ggml_tensor * tensor;
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
         if (backend != GGML_BACKEND_CPU) {
-            ggml_set_no_alloc(ggml_ctx, true);
+            ggml_set_no_alloc(ctx, true);
         }
-        if (lt.ne.size() == 2) {
-            tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
-        } else {
-            GGML_ASSERT(lt.ne.size() == 1);
-            tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
-        }
-        ggml_set_name(tensor, lt.name.c_str());
+
+        struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
+        tensor->backend = backend; // TODO: ggml_set_backend
+        ggml_set_name(tensor, ggml_get_name(meta));
+
         if (backend != GGML_BACKEND_CPU) {
-            ggml_set_no_alloc(ggml_ctx, use_mmap);
+            ggml_set_no_alloc(ctx, use_mmap);
         }
-        tensor->backend = backend;
-        lt.ggml_tensor = tensor;
-        num_ggml_tensors_created++;
+
+        n_created++;
+
         return tensor;
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
-        }
-        llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                        name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
-        }
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
+
+        // TODO: simplify
+        {
+            bool is_ok = true;
+            for (size_t i = 0; i < ne.size(); ++i) {
+                if (ne[i] != cur->ne[i]) {
+                    is_ok = false;
+                    break;
+                }
+            }
+            if (!is_ok) {
+                throw std::runtime_error(
+                        format("%s: tensor '%s' has wrong shape; expected [%d, %d, %d, %d], got [%d, %d, %d, %d]",
+                            __func__, name.c_str(), ne[0], ne[1], ne[2], ne[3],
+                            (int) cur->ne[0], (int) cur->ne[1], (int) cur->ne[2], (int) cur->ne[3]));
+            }
+        }
 
-        return get_tensor_for(lt, backend);
+        return create_tensor_for(ctx, cur, backend);
     }
 
     void done_getting_tensors() const {
-        if (num_ggml_tensors_created != tensors_map.tensors.size()) {
-            throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
+        if (n_created != n_tensors) {
+            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
         }
     }
 
-    void load_data_for(llama_load_tensor & lt) const {
+    size_t file_offset(const char * name) const {
+        const int idx = gguf_find_tensor(ctx_gguf, name);
+
+        if (idx < 0) {
+            throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
+        }
+
+        return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
+    }
+
+    void load_data_for(struct ggml_tensor * cur) const {
+        const size_t offs = file_offset(ggml_get_name(cur));
+
         if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+            cur->data = (uint8_t *) mapping->addr + offs;
         } else {
-            llama_file & file = file_loader->file;
-            file.seek(lt.file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
+            file.seek(offs, SEEK_SET);
+            file.read_raw(cur->data, ggml_nbytes(cur));
         }
     }
 
-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
-        size_t data_size = 0;
-        size_t lock_size = 0;
-        size_t pref_size = 0; // prefetch
+    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
+        size_t size_data = 0;
+        size_t size_lock = 0;
+        size_t size_pref = 0; // prefetch
 
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                pref_size += lt.size;
+        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+            struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
+            size_data += ggml_nbytes(cur);
+            if (cur->backend == GGML_BACKEND_CPU) {
+                size_pref += ggml_nbytes(cur);
             }
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loader->file, pref_size, ggml_is_numa()));
+            mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
         }
 
         size_t done_size = 0;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
+        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+            struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
+            GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
+
             if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
+                progress_callback((float) done_size / size_data, progress_callback_user_data);
             }
-            GGML_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
-            lt.data = (uint8_t *) lt.ggml_tensor->data;
 
             // allocate temp buffer if not using mmap
-            if (!use_mmap && lt.data == NULL) {
-                GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
-                lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+            if (!use_mmap && cur->data == NULL) {
+                GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
+                cur->data = malloc(ggml_nbytes(cur));
             }
 
-            load_data_for(lt);
+            load_data_for(cur);
 
-            switch (lt.ggml_tensor->backend) {
+            switch (cur->backend) {
                 case GGML_BACKEND_CPU:
-                    lt.ggml_tensor->data = lt.data;
                     if (use_mmap && lmlock) {
-                        lock_size += lt.size;
-                        lmlock->grow_to(lock_size);
+                        size_lock += ggml_nbytes(cur);
+                        lmlock->grow_to(size_lock);
                     }
                     break;
 #if defined(GGML_USE_CUBLAS)
                 case GGML_BACKEND_GPU:
                 case GGML_BACKEND_GPU_SPLIT:
-                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    // old code:
+                    //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+
+                    // TODO: test if this works !!
+                    ggml_cuda_transform_tensor(cur->data, cur);
                     if (!use_mmap) {
-                        free(lt.data);
+                        free(cur->data);
                     }
                     break;
 #elif defined(GGML_USE_CLBLAST)
                 case GGML_BACKEND_GPU:
-                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    ggml_cl_transform_tensor(cur->data, cur);
                     if (!use_mmap) {
-                        free(lt.data);
+                        free(cur->data);
                     }
                     break;
 #endif
@@ -1221,7 +1183,7 @@ struct llama_model_loader {
                     continue;
             }
 
-            done_size += lt.size;
+            done_size += ggml_nbytes(cur);
         }
     }
 };
@@ -1298,7 +1260,7 @@ static void llama_model_load_internal(
 
     // read hparams
     {
-        struct gguf_context * ctx = ml->file_loader->ctx_gguf;
+        struct gguf_context * ctx = ml->ctx_gguf;
 
         hparams.n_vocab = gguf_get_arr_n  (ctx, gguf_find_key(ctx, "tokenizer.ggml.tokens"));
         hparams.n_ctx   = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.context_length"));
@@ -1351,7 +1313,7 @@ static void llama_model_load_internal(
 
     // read vocab
     {
-        struct gguf_context * ctx = ml->file_loader->ctx_gguf;
+        struct gguf_context * ctx = ml->ctx_gguf;
 
         vocab.id_to_token.resize(hparams.n_vocab);
@@ -1379,7 +1341,7 @@ static void llama_model_load_internal(
     }
 
     {
-        LLAMA_LOG_INFO("%s: format  = %s\n", __func__, llama_file_version_name(ml->file_loader->file_version));
+        LLAMA_LOG_INFO("%s: format  = %s\n", __func__, llama_file_version_name(ml->file_version));
         LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
         LLAMA_LOG_INFO("%s: n_ctx   = %u\n", __func__, hparams.n_ctx);
         LLAMA_LOG_INFO("%s: n_embd  = %u\n", __func__, hparams.n_embd);
@@ -1453,9 +1415,7 @@ static void llama_model_load_internal(
         const uint32_t n_layer = hparams.n_layer;
         const uint32_t n_vocab = hparams.n_vocab;
 
-        ml->ggml_ctx = ctx;
-
-        model.tok_embeddings = ml->get_tensor(TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.tok_embeddings = ml->create_tensor(ctx, TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
         // "output" tensor
         {
@@ -1476,8 +1436,8 @@ static void llama_model_load_internal(
                 backend_output = GGML_BACKEND_CPU;
             }
 
-            model.norm   = ml->get_tensor(TN_OUTPUT_NORM, {n_embd},          backend_norm);
-            model.output = ml->get_tensor(TN_OUTPUT,      {n_embd, n_vocab}, backend_output);
+            model.norm   = ml->create_tensor(ctx, TN_OUTPUT_NORM, {n_embd},          backend_norm);
+            model.output = ml->create_tensor(ctx, TN_OUTPUT,      {n_embd, n_vocab}, backend_output);
 
             if (backend_norm == GGML_BACKEND_GPU) {
                 vram_weights += ggml_nbytes(model.norm);
             }
@@ -1496,18 +1456,18 @@ static void llama_model_load_internal(
             const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
             auto & layer = model.layers[i];
 
-            layer.attention_norm = ml->get_tensor(format(TN_ATTN_NORM, i), {n_embd}, backend);
+            layer.attention_norm = ml->create_tensor(ctx, format(TN_ATTN_NORM, i), {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(format(TN_ATTN_Q,      i), {n_embd, n_embd},     backend_split);
-            layer.wk = ml->get_tensor(format(TN_ATTN_K,      i), {n_embd, n_embd_gqa}, backend_split);
-            layer.wv = ml->get_tensor(format(TN_ATTN_V,      i), {n_embd, n_embd_gqa}, backend_split);
-            layer.wo = ml->get_tensor(format(TN_ATTN_OUTPUT, i), {n_embd, n_embd},     backend_split);
+            layer.wq = ml->create_tensor(ctx, format(TN_ATTN_Q,      i), {n_embd, n_embd},     backend_split);
+            layer.wk = ml->create_tensor(ctx, format(TN_ATTN_K,      i), {n_embd, n_embd_gqa}, backend_split);
+            layer.wv = ml->create_tensor(ctx, format(TN_ATTN_V,      i), {n_embd, n_embd_gqa}, backend_split);
+            layer.wo = ml->create_tensor(ctx, format(TN_ATTN_OUTPUT, i), {n_embd, n_embd},     backend_split);
 
-            layer.ffn_norm = ml->get_tensor(format(TN_FFN_NORM, i), {n_embd}, backend);
+            layer.ffn_norm = ml->create_tensor(ctx, format(TN_FFN_NORM, i), {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(format(TN_FFN_GATE, i), {n_embd,   n_ff}, backend_split);
-            layer.w2 = ml->get_tensor(format(TN_FFN_DOWN, i), {  n_ff, n_embd}, backend_split);
-            layer.w3 = ml->get_tensor(format(TN_FFN_UP,   i), {n_embd,   n_ff}, backend_split);
+            layer.w1 = ml->create_tensor(ctx, format(TN_FFN_GATE, i), {n_embd,   n_ff}, backend_split);
+            layer.w2 = ml->create_tensor(ctx, format(TN_FFN_DOWN, i), {  n_ff, n_embd}, backend_split);
+            layer.w3 = ml->create_tensor(ctx, format(TN_FFN_UP,   i), {n_embd,   n_ff}, backend_split);
 
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
@@ -1605,8 +1565,9 @@ static void llama_model_load_internal(
         }
 
         // populate `tensors_by_name`
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+        for (int i = 0; i < ml->n_tensors; ++i) {
+            struct ggml_tensor * cur = ggml_get_tensor(ctx, ml->get_tensor_name(i));
+            model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
         }
 
         (void) tensor_split;
@@ -1616,7 +1577,7 @@ static void llama_model_load_internal(
     }
 #endif
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
+    ml->load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
@@ -1666,7 +1627,7 @@ static struct ggml_cgraph * llama_build_graph(
                      int   n_tokens,
                      int   n_past) {
 
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd));
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
 
     const int N = n_tokens;
@@ -1696,7 +1657,6 @@ static struct ggml_cgraph * llama_build_graph(
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
 
-
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
@@ -2049,7 +2009,7 @@ static bool llama_eval_internal(
                       int   n_threads,
                const char * cgraph_fname) {
 
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd));
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
 
     const int64_t t_start_us = ggml_time_us();
@@ -2526,8 +2486,8 @@ std::vector<uint32_t> decode_utf8(const char * src) {
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
-        case LLAMA_GRETYPE_END: return true;
-        case LLAMA_GRETYPE_ALT: return true;
+        case LLAMA_GRETYPE_END: return true; // NOLINT
+        case LLAMA_GRETYPE_ALT: return true; // NOLINT
        default:                return false;
     }
 }
@@ -2540,7 +2500,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
     bool found            = false;
     bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
 
-    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
     do {
         if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
@@ -2675,7 +2635,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
             }
         }
 
-        auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
+        const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
 
         // update top of stack to next element, if any
         std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
@@ -3285,35 +3245,35 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 // quantization
 //
 
-static void llama_convert_tensor_internal(const llama_load_tensor & tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
     float * f32_output = (float *) output.data();
 
     ggml_type_traits_t qtype;
-    if (ggml_is_quantized(tensor.type)) {
-        qtype = ggml_internal_get_type_traits(tensor.type);
+    if (ggml_is_quantized(tensor->type)) {
+        qtype = ggml_internal_get_type_traits(tensor->type);
         if (qtype.to_float == NULL) {
-            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
         }
-    } else if (tensor.type != GGML_TYPE_F16) {
-        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+    } else if (tensor->type != GGML_TYPE_F16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
     }
 
     if (nthread < 2) {
-        if (tensor.type == GGML_TYPE_F16) {
-            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
-        } else if (ggml_is_quantized(tensor.type)) {
-            qtype.to_float(tensor.data, f32_output, nelements);
+        if (tensor->type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor->type)) {
+            qtype.to_float(tensor->data, f32_output, nelements);
         } else {
             GGML_ASSERT(false); // unreachable
         }
         return;
     }
 
-    auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
-    auto block_size_bytes = ggml_type_size(tensor.type);
+    auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    auto block_size_bytes = ggml_type_size(tensor->type);
 
     GGML_ASSERT(nelements % block_size == 0);
     auto nblocks = nelements / block_size;
@@ -3333,7 +3293,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, std:
                 qtype.to_float(inbuf, outbuf, nels);
             }
         };
-        workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
         in_buff_offs += thr_block_bytes;
         out_buff_offs += thr_elems;
     }
@@ -3381,17 +3341,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv(ctx_out, model_loader->file_loader->ctx_gguf);
+    gguf_set_kv     (ctx_out, model_loader->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
-    for (auto& tensor : model_loader->tensors_map.tensors) {
-        if (tensor.name.find("attn_v.weight") != std::string::npos) {
+
+    for (int i = 0; i < model_loader->n_tensors; ++i) {
+        struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+
+        const std::string name = ggml_get_name(meta);
+
+        if (name.find("attn_v.weight") != std::string::npos) {
             ++n_attention_wv;
         }
-        else if (tensor.name.find("ffn_down.weight") != std::string::npos) {
+        else if (name.find("ffn_down.weight") != std::string::npos) {
             ++n_feed_forward_w2;
         }
     }
@@ -3416,8 +3381,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
 
-    for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        gguf_add_tensor(ctx_out, tensor.ggml_tensor);
+    // populate the original tensors so we get an initial meta data
+    for (int i = 0; i < model_loader->n_tensors; ++i) {
+        struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+        gguf_add_tensor(ctx_out, meta);
     }
 
     std::ofstream fout(fname_out, std::ios::binary);
@@ -3429,43 +3396,47 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        read_data.resize(tensor.size);
-        tensor.data = read_data.data();
+    for (int i = 0; i < model_loader->n_tensors; ++i) {
+        struct ggml_tensor * tensor = model_loader->get_tensor_meta(i);
+
+        const std::string name = ggml_get_name(tensor);
+
+        read_data.resize(ggml_nbytes(tensor));
+        tensor->data = read_data.data();
+
         model_loader->load_data_for(tensor);
 
-        LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
-               ++idx, model_loader->tensors_map.tensors.size(),
-               tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
-               ggml_type_name(tensor.type));
+        LLAMA_LOG_INFO("[%4zu/%4zu] %36s - [%5d, %5d], type = %6s, ",
+               ++idx, model_loader->n_tensors,
+               ggml_get_name(tensor), (int) tensor->ne[0], (int) tensor->ne[1],
+               ggml_type_name(tensor->type));
 
         // This used to be a regex, but <regex> has an extreme cost to compile times.
-        bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
+        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
         // quantize only 2D tensors
-        quantize &= (tensor.ne.size() == 2);
-        quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
-        quantize &= quantized_type != tensor.type;
+        quantize &= (tensor->n_dims == 2);
+        quantize &= params->quantize_output_tensor || name != "output.weight";
+        quantize &= quantized_type != tensor->type;
 
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
 
         if (!quantize) {
-            new_type = tensor.type;
-            new_data = tensor.data;
-            new_size = tensor.size;
-            LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            if (tensor.name == TN_OUTPUT) {
-                int nx = tensor.ne.at(0);
-                int ny = tensor.ne.at(1);
+            if (name == TN_OUTPUT) {
+                int nx = tensor->ne[0];
+                int ny = tensor->ne[1];
                 if (nx % QK_K == 0 && ny % QK_K == 0) {
                     new_type = GGML_TYPE_Q6_K;
                 }
-            } else if (tensor.name.find("attn_v.weight") != std::string::npos) {
+            } else if (name.find("attn_v.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -3473,32 +3444,32 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                         (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
-            } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
                 //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
-            } else if (tensor.name.find("attn_output.weight") != std::string::npos) {
+            } else if (name.find("attn_output.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
 
             bool convert_incompatible_tensor = false;
             if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
                 new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor.ne.at(0);
-                int ny = tensor.ne.at(1);
+                int nx = tensor->ne[0];
+                int ny = tensor->ne[1];
                 if (nx % QK_K != 0 || ny % QK_K != 0) {
                     LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
                     convert_incompatible_tensor = true;
                 }
             }
             if (convert_incompatible_tensor) {
-                if (tensor.name == TN_OUTPUT) {
+                if (name == TN_OUTPUT) {
                     new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
                     LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (tensor.name == TN_TOKEN_EMBD) {
+                } else if (name == TN_TOKEN_EMBD) {
                     new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
                     LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
                 } else {
@@ -3507,15 +3478,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 #endif
 
-            const size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+            const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
             std::vector<float> f32_conv_buf;
 
-            if (tensor.type == GGML_TYPE_F32) {
-                f32_data = (float *) tensor.data;
-            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
-                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
+            if (tensor->type == GGML_TYPE_F32) {
+                f32_data = (float *) tensor->data;
+            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
                 llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
@@ -3571,7 +3542,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
             }
 
-            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -3585,12 +3556,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
             LLAMA_LOG_INFO("\n");
         }
 
-        total_size_org += tensor.size;
+        total_size_org += ggml_nbytes(tensor);
         total_size_new += new_size;
 
         // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_out, tensor.name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, tensor.name.c_str(), new_data, new_size);
+        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
 
         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
@@ -3674,7 +3645,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     // create a name -> tensor map of the model to accelerate lookups
     std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
-    for (const auto & kv: model.tensors_by_name) {
+    for (const auto & kv : model.tensors_by_name) {
         model_tensors.insert(kv);
     }
@@ -3698,11 +3669,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         base_ctx = ggml_init(base_params);
 
-        model_loader->ggml_ctx = base_ctx;
-
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
@@ -3807,19 +3776,18 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         ggml_tensor * base_t;
         if (model_loader) {
+            struct gguf_context * ctx_gguf = model_loader->ctx_gguf;
+
             // load from base model
-            if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+            if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
                 LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
                 return 1;
             }
-            size_t idx = model_loader->tensors_map.name_to_idx[base_name];
-            llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
-            lt.data = (uint8_t *) lt.ggml_tensor->data;
-            model_loader->load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
-        }
-        else {
+
+            // TODO: not tested!! maybe not working!
+            base_t = model_loader->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+            model_loader->load_data_for(base_t);
+        } else {
             base_t = dest_t;
         }
@@ -4767,7 +4735,7 @@ int llama_token_to_str_with_model(const struct llama_model * model, llama_token
         }
         strncpy(str, result.c_str(), result.length());
         return result.length();
-    } else if (llama_is_unknown_token(model->vocab, token)) {
+    } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
         if (length < 3) {
             return -3;
         }
} }