llama : refactor llama_model_loader (WIP)
wip : remove ggml_ctx from llama_model_loader
wip : merge gguf_file_loader in llama_model_loader
This commit is contained in:
parent
23248d7d32
commit
5339b859ec
3 changed files with 215 additions and 256 deletions
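The gist of the refactor visible in this diff: gguf_file_loader (llama_file_loader) and its llama_load_tensor / llama_load_tensors_map bookkeeping are folded into llama_model_loader, and the loader no longer owns a ggml_ctx; callers now pass their own ggml_context into create_tensor() and load_all_data(), while tensor metadata comes from the gguf/ggml contexts returned by gguf_init_from_file(). A rough C++ sketch of the call sequence a caller such as llama_model_load_internal follows after this change (names are taken from the diff below; the buffer setup is simplified and partly hypothetical, not a drop-in snippet):

// sketch only - mirrors the call sequence in this diff, under the assumptions stated above
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, /*use_mmap =*/ true));

size_t ctx_size     = 0;
size_t mmapped_size = 0;
ml->calc_sizes(ctx_size, mmapped_size);      // sizes now come from ctx_meta, not from a tensors_map

// the caller owns the ggml_context; the loader only creates tensors into it
struct ggml_init_params params = {
    /*.mem_size   =*/ ctx_size,
    /*.mem_buffer =*/ NULL,                  // simplified: the real code uses the model's own buffer
    /*.no_alloc   =*/ ml->use_mmap,
};
struct ggml_context * ctx = ggml_init(params);

model.tok_embeddings = ml->create_tensor(ctx, TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU);
// ... create the remaining model tensors the same way ...

ml->done_getting_tensors();                  // throws if n_created != n_tensors
ml->load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
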
ggml.c (8 changes)

@@ -19065,14 +19065,6 @@ enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.arr.type;
 }
 
-int32_t gguf_get_arr_i32(struct gguf_context * ctx, int key_id, int i) {
-    return ((int32_t *) ctx->kv[key_id].value.arr.data)[i];
-}
-
-float gguf_get_arr_f32(struct gguf_context * ctx, int key_id, int i) {
-    return ((float *) ctx->kv[key_id].value.arr.data)[i];
-}
-
 const void * gguf_get_arr_data(struct gguf_context * ctx, int i) {
     return ctx->kv[i].value.arr.data;
 }

ggml.h (1 change)

@@ -1499,7 +1499,6 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor * tensor);
 
-
    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
 
    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);

gguf-llama.cpp (458 changes)

@@ -993,227 +993,189 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     return buf;
 }
 
-struct llama_load_tensor {
-    std::string name;
-    enum ggml_type type = GGML_TYPE_F32;
-    std::vector<uint32_t> ne;
-    size_t file_off;
-    size_t size;
-    struct ggml_tensor * ggml_tensor = NULL;
-    uint8_t * data;
-};
-
-struct llama_load_tensors_map {
-    // tensors is kept in a separate vector to preserve file order
-    std::vector<llama_load_tensor> tensors;
-    std::unordered_map<std::string, size_t> name_to_idx;
-};
-
-struct llama_file_loader {
+struct llama_model_loader {
+    int n_tensors = 0;
+    int n_created = 0;
+
+    bool use_mmap = false;
+
     llama_file file;
-    gguf_context * ctx_gguf;
     llama_file_version file_version;
 
-    struct ggml_context * ctx_data = NULL;
-
-    llama_file_loader(const char * fname, llama_load_tensors_map & tensors_map) : file(fname, "rb") {
-        fprintf(stderr, "llama.cpp: loading model from %s\n", fname);
-
-        struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_data,
-        };
-
-        ctx_gguf = gguf_init_from_file(fname, params);
-        file_version = (enum llama_file_version) gguf_get_version(ctx_gguf);
-
-        read_tensor_metadata(tensors_map);
-    }
-
-    void read_tensor_metadata(llama_load_tensors_map & tensors_map) const {
-        const int n_tensors = gguf_get_n_tensors(ctx_gguf);
-
-        for (int i = 0; i < n_tensors; ++i) {
-            llama_load_tensor tensor;
-            const char * name = gguf_get_tensor_name(ctx_gguf, i);
-
-            struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-
-            const uint32_t n_dims = cur->n_dims;
-            tensor.type = cur->type;
-            tensor.ne.resize(n_dims);
-
-            for (uint32_t j = 0; j < n_dims; ++j) {
-                tensor.ne[j] = cur->ne[j];
-            }
-
-            if (n_dims < 1 || n_dims > 2) {
-                throw std::runtime_error(format("llama.cpp: tensor '%s' should not be %u-dimensional", name, n_dims));
-            }
-
-            switch (tensor.type) {
-                case GGML_TYPE_F32:
-                case GGML_TYPE_F16:
-                case GGML_TYPE_Q4_0:
-                case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q5_0:
-                case GGML_TYPE_Q5_1:
-                case GGML_TYPE_Q8_0:
-                case GGML_TYPE_Q2_K:
-                case GGML_TYPE_Q3_K:
-                case GGML_TYPE_Q4_K:
-                case GGML_TYPE_Q5_K:
-                case GGML_TYPE_Q6_K:
-                    break;
-                default: {
-                    throw std::runtime_error(format("unrecognized tensor type %u\n", tensor.type));
-                }
-            }
-
-            tensor.file_off = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i);
-
-            tensor.name = name;
-            tensor.size = ggml_nbytes(cur);
-            tensor.ggml_tensor = cur;
-
-            tensors_map.tensors.push_back(tensor);
-            tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
-        }
-    }
-};
-
-struct llama_model_loader {
-    std::unique_ptr<llama_file_loader> file_loader;
-    llama_load_tensors_map tensors_map;
-    bool use_mmap;
-    size_t num_ggml_tensors_created = 0;
-    struct ggml_context * ggml_ctx = NULL;
     std::unique_ptr<llama_mmap> mapping;
 
-    llama_model_loader(const std::string & fname_base, bool use_mmap) {
-        file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
+    struct gguf_context * ctx_gguf = NULL;
+    struct ggml_context * ctx_meta = NULL;
+
+    llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
+        struct gguf_init_params params = {
+            /*.no_alloc = */ true,
+            /*.ctx      = */ &ctx_meta,
+        };
+
+        ctx_gguf = gguf_init_from_file(fname.c_str(), params);
+
+        n_tensors = gguf_get_n_tensors(ctx_gguf);
+        file_version = (enum llama_file_version) gguf_get_version(ctx_gguf);
+
+        LLAMA_LOG_INFO("%s: loaded %d tensors from %s (version %s)\n",
+                __func__, n_tensors, fname.c_str(), llama_file_version_name(file_version));
+
         if (!llama_mmap::SUPPORTED) {
+            LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
             use_mmap = false;
         }
 
         this->use_mmap = use_mmap;
     }
 
+    const char * get_tensor_name(int i) const {
+        return gguf_get_tensor_name(ctx_gguf, i);
+    }
+
+    struct ggml_tensor * get_tensor_meta(int i) const {
+        return ggml_get_tensor(ctx_meta, get_tensor_name(i));
+    }
+
     void calc_sizes(size_t & ctx_size_p, size_t & mmapped_size_p) const {
-        ctx_size_p = mmapped_size_p = 0;
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
+        ctx_size_p = 0;
+        mmapped_size_p = 0;
+
+        for (int i = 0; i < n_tensors; i++) {
+            struct ggml_tensor * meta = get_tensor_meta(i);
             ctx_size_p += sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE;
-            (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(lt.ggml_tensor);
+            (use_mmap ? mmapped_size_p : ctx_size_p) += ggml_nbytes_pad(meta);
         }
     }
 
-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
-        struct ggml_tensor * tensor;
+    struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) {
         if (backend != GGML_BACKEND_CPU) {
-            ggml_set_no_alloc(ggml_ctx, true);
+            ggml_set_no_alloc(ctx, true);
         }
-        if (lt.ne.size() == 2) {
-            tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
-        } else {
-            GGML_ASSERT(lt.ne.size() == 1);
-            tensor = ggml_new_tensor_1d(ggml_ctx, lt.type, lt.ne.at(0));
-        }
-        ggml_set_name(tensor, lt.name.c_str());
+
+        struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
+        tensor->backend = backend; // TODO: ggml_set_backend
+        ggml_set_name(tensor, ggml_get_name(meta));
 
         if (backend != GGML_BACKEND_CPU) {
-            ggml_set_no_alloc(ggml_ctx, use_mmap);
+            ggml_set_no_alloc(ctx, use_mmap);
         }
-        tensor->backend = backend;
-        lt.ggml_tensor = tensor;
-        num_ggml_tensors_created++;
+
+        n_created++;
+
         return tensor;
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
-        }
-        llama_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                        name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
-        }
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
+
+        // TODO: simplify
+        {
+            bool is_ok = true;
+            for (size_t i = 0; i < ne.size(); ++i) {
+                if (ne[i] != cur->ne[i]) {
+                    is_ok = false;
+                    break;
+                }
+            }
+            if (!is_ok) {
+                throw std::runtime_error(
+                        format("%s: tensor '%s' has wrong shape; expected [%d, %d, %d, %d], got [%d, %d, %d, %d]",
+                            __func__, name.c_str(), ne[0], ne[1], ne[2], ne[3],
+                            (int) cur->ne[0], (int) cur->ne[1], (int) cur->ne[2], (int) cur->ne[3]));
+            }
+        }
 
-        return get_tensor_for(lt, backend);
+        return create_tensor_for(ctx, cur, backend);
     }
 
     void done_getting_tensors() const {
-        if (num_ggml_tensors_created != tensors_map.tensors.size()) {
-            throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
+        if (n_created != n_tensors) {
+            throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created));
         }
     }
 
-    void load_data_for(llama_load_tensor & lt) const {
+    size_t file_offset(const char * name) const {
+        const int idx = gguf_find_tensor(ctx_gguf, name);
+
+        if (idx < 0) {
+            throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name));
+        }
+
+        return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
+    }
+
+    void load_data_for(struct ggml_tensor * cur) const {
+        const size_t offs = file_offset(ggml_get_name(cur));
+
         if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+            cur->data = (uint8_t *) mapping->addr + offs;
         } else {
-            llama_file & file = file_loader->file;
-            file.seek(lt.file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
+            file.seek(offs, SEEK_SET);
+            file.read_raw(cur->data, ggml_nbytes(cur));
         }
     }
 
-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
-        size_t data_size = 0;
-        size_t lock_size = 0;
-        size_t pref_size = 0; // prefetch
+    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
+        size_t size_data = 0;
+        size_t size_lock = 0;
+        size_t size_pref = 0; // prefetch
 
-        for (const llama_load_tensor & lt : tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                pref_size += lt.size;
+        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+            struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
+            size_data += ggml_nbytes(cur);
+            if (cur->backend == GGML_BACKEND_CPU) {
+                size_pref += ggml_nbytes(cur);
             }
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loader->file, pref_size, ggml_is_numa()));
+            mapping.reset(new llama_mmap(&file, size_pref, ggml_is_numa()));
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
         }
 
         size_t done_size = 0;
-        for (llama_load_tensor & lt : tensors_map.tensors) {
+        for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+            struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
+            GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
+
             if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
+                progress_callback((float) done_size / size_data, progress_callback_user_data);
             }
-            GGML_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
-            lt.data = (uint8_t *) lt.ggml_tensor->data;
 
             // allocate temp buffer if not using mmap
-            if (!use_mmap && lt.data == NULL) {
-                GGML_ASSERT(lt.ggml_tensor->backend != GGML_BACKEND_CPU);
-                lt.data = (uint8_t*)malloc(ggml_nbytes(lt.ggml_tensor));
+            if (!use_mmap && cur->data == NULL) {
+                GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
+                cur->data = malloc(ggml_nbytes(cur));
             }
 
-            load_data_for(lt);
+            load_data_for(cur);
 
-            switch (lt.ggml_tensor->backend) {
+            switch (cur->backend) {
                 case GGML_BACKEND_CPU:
-                    lt.ggml_tensor->data = lt.data;
                     if (use_mmap && lmlock) {
-                        lock_size += lt.size;
-                        lmlock->grow_to(lock_size);
+                        size_lock += ggml_nbytes(cur);
+                        lmlock->grow_to(size_lock);
                     }
                     break;
 #if defined(GGML_USE_CUBLAS)
                 case GGML_BACKEND_GPU:
                 case GGML_BACKEND_GPU_SPLIT:
-                    ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+                    // old code:
+                    //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
+
+                    // TODO: test if this works !!
+                    ggml_cuda_transform_tensor(cur->data, cur);
                     if (!use_mmap) {
-                        free(lt.data);
+                        free(cur->data);
                     }
                     break;
 #elif defined(GGML_USE_CLBLAST)
                 case GGML_BACKEND_GPU:
-                    ggml_cl_transform_tensor(lt.data, lt.ggml_tensor);
+                    ggml_cl_transform_tensor(cur->data, cur);
                     if (!use_mmap) {
-                        free(lt.data);
+                        free(cur->data);
                     }
                     break;
 #endif
@@ -1221,7 +1183,7 @@ struct llama_model_loader {
                 continue;
             }
 
-            done_size += lt.size;
+            done_size += ggml_nbytes(cur);
         }
     }
 };
@@ -1298,7 +1260,7 @@ static void llama_model_load_internal(
 
     // read hparams
     {
-        struct gguf_context * ctx = ml->file_loader->ctx_gguf;
+        struct gguf_context * ctx = ml->ctx_gguf;
 
         hparams.n_vocab = gguf_get_arr_n (ctx, gguf_find_key(ctx, "tokenizer.ggml.tokens"));
         hparams.n_ctx   = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.context_length"));
@@ -1351,7 +1313,7 @@ static void llama_model_load_internal(
 
     // read vocab
     {
-        struct gguf_context * ctx = ml->file_loader->ctx_gguf;
+        struct gguf_context * ctx = ml->ctx_gguf;
 
         vocab.id_to_token.resize(hparams.n_vocab);
 
@@ -1379,7 +1341,7 @@ static void llama_model_load_internal(
     }
 
     {
-        LLAMA_LOG_INFO("%s: format  = %s\n", __func__, llama_file_version_name(ml->file_loader->file_version));
+        LLAMA_LOG_INFO("%s: format  = %s\n", __func__, llama_file_version_name(ml->file_version));
         LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab);
         LLAMA_LOG_INFO("%s: n_ctx   = %u\n", __func__, hparams.n_ctx);
         LLAMA_LOG_INFO("%s: n_embd  = %u\n", __func__, hparams.n_embd);
@@ -1453,9 +1415,7 @@ static void llama_model_load_internal(
         const uint32_t n_layer = hparams.n_layer;
         const uint32_t n_vocab = hparams.n_vocab;
 
-        ml->ggml_ctx = ctx;
-
-        model.tok_embeddings = ml->get_tensor(TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.tok_embeddings = ml->create_tensor(ctx, TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU);
 
         // "output" tensor
         {
@@ -1476,8 +1436,8 @@ static void llama_model_load_internal(
                 backend_output = GGML_BACKEND_CPU;
             }
 
-            model.norm   = ml->get_tensor(TN_OUTPUT_NORM, {n_embd},          backend_norm);
-            model.output = ml->get_tensor(TN_OUTPUT,      {n_embd, n_vocab}, backend_output);
+            model.norm   = ml->create_tensor(ctx, TN_OUTPUT_NORM, {n_embd},          backend_norm);
+            model.output = ml->create_tensor(ctx, TN_OUTPUT,      {n_embd, n_vocab}, backend_output);
             if (backend_norm == GGML_BACKEND_GPU) {
                 vram_weights += ggml_nbytes(model.norm);
             }
@@ -1496,18 +1456,18 @@ static void llama_model_load_internal(
             const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
 
             auto & layer = model.layers[i];
-            layer.attention_norm = ml->get_tensor(format(TN_ATTN_NORM, i), {n_embd}, backend);
+            layer.attention_norm = ml->create_tensor(ctx, format(TN_ATTN_NORM, i), {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(format(TN_ATTN_Q,      i), {n_embd, n_embd},     backend_split);
-            layer.wk = ml->get_tensor(format(TN_ATTN_K,      i), {n_embd, n_embd_gqa}, backend_split);
-            layer.wv = ml->get_tensor(format(TN_ATTN_V,      i), {n_embd, n_embd_gqa}, backend_split);
-            layer.wo = ml->get_tensor(format(TN_ATTN_OUTPUT, i), {n_embd, n_embd},     backend_split);
+            layer.wq = ml->create_tensor(ctx, format(TN_ATTN_Q,      i), {n_embd, n_embd},     backend_split);
+            layer.wk = ml->create_tensor(ctx, format(TN_ATTN_K,      i), {n_embd, n_embd_gqa}, backend_split);
+            layer.wv = ml->create_tensor(ctx, format(TN_ATTN_V,      i), {n_embd, n_embd_gqa}, backend_split);
+            layer.wo = ml->create_tensor(ctx, format(TN_ATTN_OUTPUT, i), {n_embd, n_embd},     backend_split);
 
-            layer.ffn_norm = ml->get_tensor(format(TN_FFN_NORM, i), {n_embd}, backend);
+            layer.ffn_norm = ml->create_tensor(ctx, format(TN_FFN_NORM, i), {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(format(TN_FFN_GATE, i), {n_embd,   n_ff}, backend_split);
-            layer.w2 = ml->get_tensor(format(TN_FFN_DOWN, i), {  n_ff, n_embd}, backend_split);
-            layer.w3 = ml->get_tensor(format(TN_FFN_UP,   i), {n_embd,   n_ff}, backend_split);
+            layer.w1 = ml->create_tensor(ctx, format(TN_FFN_GATE, i), {n_embd,   n_ff}, backend_split);
+            layer.w2 = ml->create_tensor(ctx, format(TN_FFN_DOWN, i), {  n_ff, n_embd}, backend_split);
+            layer.w3 = ml->create_tensor(ctx, format(TN_FFN_UP,   i), {n_embd,   n_ff}, backend_split);
 
             if (backend == GGML_BACKEND_GPU) {
                 vram_weights +=
@@ -1605,8 +1565,9 @@ static void llama_model_load_internal(
     }
 
     // populate `tensors_by_name`
-    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+    for (int i = 0; i < ml->n_tensors; ++i) {
+        struct ggml_tensor * cur = ggml_get_tensor(ctx, ml->get_tensor_name(i));
+        model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
     }
 
     (void) tensor_split;
@@ -1616,7 +1577,7 @@ static void llama_model_load_internal(
     }
 #endif
 
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
+    ml->load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
 
     if (progress_callback) {
         progress_callback(1.0f, progress_callback_user_data);
@@ -1666,7 +1627,7 @@ static struct ggml_cgraph * llama_build_graph(
             int   n_tokens,
             int   n_past) {
 
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd));
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
 
     const int N = n_tokens;
 
@@ -1696,7 +1657,6 @@ static struct ggml_cgraph * llama_build_graph(
     auto & mem_per_token = lctx.mem_per_token;
     auto & buf_compute   = lctx.buf_compute;
 
-
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
         /*.mem_buffer =*/ buf_compute.data,
@@ -2049,7 +2009,7 @@ static bool llama_eval_internal(
             int   n_threads,
      const char * cgraph_fname) {
 
-    GGML_ASSERT((!tokens && embd) || (tokens && !embd));
+    GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
 
     const int64_t t_start_us = ggml_time_us();
 
@@ -2526,8 +2486,8 @@ std::vector<uint32_t> decode_utf8(const char * src) {
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
-        case LLAMA_GRETYPE_END: return true;
-        case LLAMA_GRETYPE_ALT: return true;
+        case LLAMA_GRETYPE_END: return true; // NOLINT
+        case LLAMA_GRETYPE_ALT: return true; // NOLINT
         default:                return false;
     }
 }
@@ -2540,7 +2500,7 @@ static std::pair<bool, const llama_grammar_element *> llama_grammar_match_char(
 
     bool found            = false;
     bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR;
-    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT);
+    GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT
 
     do {
         if (pos[1].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
@@ -2675,7 +2635,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
         }
     }
 
-    auto stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
+    const auto * stack_pos_after = llama_grammar_match_char(stack_pos, 0).second;
 
     // update top of stack to next element, if any
     std::vector<const llama_grammar_element *> stack_after(stack.begin(), stack.end() - 1);
@@ -3285,35 +3245,35 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 // quantization
 //
 
-static void llama_convert_tensor_internal(const llama_load_tensor & tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
     if (output.size() < nelements) {
         output.resize(nelements);
     }
     float * f32_output = (float *) output.data();
 
     ggml_type_traits_t qtype;
-    if (ggml_is_quantized(tensor.type)) {
-        qtype = ggml_internal_get_type_traits(tensor.type);
+    if (ggml_is_quantized(tensor->type)) {
+        qtype = ggml_internal_get_type_traits(tensor->type);
         if (qtype.to_float == NULL) {
-            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
+            throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
         }
-    } else if (tensor.type != GGML_TYPE_F16) {
-        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor.type)));
+    } else if (tensor->type != GGML_TYPE_F16) {
+        throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
     }
 
     if (nthread < 2) {
-        if (tensor.type == GGML_TYPE_F16) {
-            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
-        } else if (ggml_is_quantized(tensor.type)) {
-            qtype.to_float(tensor.data, f32_output, nelements);
+        if (tensor->type == GGML_TYPE_F16) {
+            ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
+        } else if (ggml_is_quantized(tensor->type)) {
+            qtype.to_float(tensor->data, f32_output, nelements);
         } else {
             GGML_ASSERT(false); // unreachable
         }
         return;
     }
 
-    auto block_size = tensor.type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor.type);
-    auto block_size_bytes = ggml_type_size(tensor.type);
+    auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    auto block_size_bytes = ggml_type_size(tensor->type);
 
     GGML_ASSERT(nelements % block_size == 0);
     auto nblocks = nelements / block_size;
@@ -3333,7 +3293,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, std:
             qtype.to_float(inbuf, outbuf, nels);
         }
     };
-        workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
+        workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
         in_buff_offs += thr_block_bytes;
         out_buff_offs += thr_elems;
     }
@@ -3381,17 +3341,22 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv(ctx_out, model_loader->file_loader->ctx_gguf);
+    gguf_set_kv     (ctx_out, model_loader->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
 
 #ifdef GGML_USE_K_QUANTS
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
-    for (auto& tensor : model_loader->tensors_map.tensors) {
-        if (tensor.name.find("attn_v.weight") != std::string::npos) {
+
+    for (int i = 0; i < model_loader->n_tensors; ++i) {
+        struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+
+        const std::string name = ggml_get_name(meta);
+
+        if (name.find("attn_v.weight") != std::string::npos) {
             ++n_attention_wv;
         }
-        else if (tensor.name.find("ffn_down.weight") != std::string::npos) {
+        else if (name.find("ffn_down.weight") != std::string::npos) {
             ++n_feed_forward_w2;
         }
     }
@@ -3416,8 +3381,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
 
-    for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        gguf_add_tensor(ctx_out, tensor.ggml_tensor);
+    // populate the original tensors so we get an initial meta data
+    for (int i = 0; i < model_loader->n_tensors; ++i) {
+        struct ggml_tensor * meta = model_loader->get_tensor_meta(i);
+        gguf_add_tensor(ctx_out, meta);
     }
 
     std::ofstream fout(fname_out, std::ios::binary);
@@ -3429,43 +3396,47 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        read_data.resize(tensor.size);
-        tensor.data = read_data.data();
+    for (int i = 0; i < model_loader->n_tensors; ++i) {
+        struct ggml_tensor * tensor = model_loader->get_tensor_meta(i);
+
+        const std::string name = ggml_get_name(tensor);
+
+        read_data.resize(ggml_nbytes(tensor));
+        tensor->data = read_data.data();
         model_loader->load_data_for(tensor);
 
-        LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
-               ++idx, model_loader->tensors_map.tensors.size(),
-               tensor.name.c_str(), llama_format_tensor_shape(tensor.ne).c_str(),
-               ggml_type_name(tensor.type));
+        LLAMA_LOG_INFO("[%4zu/%4zu] %36s - [%5d, %5d], type = %6s, ",
+               ++idx, model_loader->n_tensors,
+               ggml_get_name(tensor), (int) tensor->ne[0], (int) tensor->ne[1],
+               ggml_type_name(tensor->type));
 
         // This used to be a regex, but <regex> has an extreme cost to compile times.
-        bool quantize = tensor.name.rfind("weight") == tensor.name.size() - 6; // ends with 'weight'?
+        bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'?
 
         // quantize only 2D tensors
-        quantize &= (tensor.ne.size() == 2);
-        quantize &= params->quantize_output_tensor || tensor.name != "output.weight";
-        quantize &= quantized_type != tensor.type;
+        quantize &= (tensor->n_dims == 2);
+        quantize &= params->quantize_output_tensor || name != "output.weight";
+        quantize &= quantized_type != tensor->type;
 
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
 
         if (!quantize) {
-            new_type = tensor.type;
-            new_data = tensor.data;
-            new_size = tensor.size;
-            LLAMA_LOG_INFO("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            if (tensor.name == TN_OUTPUT) {
-                int nx = tensor.ne.at(0);
-                int ny = tensor.ne.at(1);
+            if (name == TN_OUTPUT) {
+                int nx = tensor->ne[0];
+                int ny = tensor->ne[1];
                 if (nx % QK_K == 0 && ny % QK_K == 0) {
                     new_type = GGML_TYPE_Q6_K;
                 }
-            } else if (tensor.name.find("attn_v.weight") != std::string::npos) {
+            } else if (name.find("attn_v.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -3473,32 +3444,32 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
                         (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
-            } else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
                 //else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
-            } else if (tensor.name.find("attn_output.weight") != std::string::npos) {
+            } else if (name.find("attn_output.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
             bool convert_incompatible_tensor = false;
             if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
                 new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
-                int nx = tensor.ne.at(0);
-                int ny = tensor.ne.at(1);
+                int nx = tensor->ne[0];
+                int ny = tensor->ne[1];
                 if (nx % QK_K != 0 || ny % QK_K != 0) {
                     LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
                     convert_incompatible_tensor = true;
                 }
             }
             if (convert_incompatible_tensor) {
-                if (tensor.name == TN_OUTPUT) {
+                if (name == TN_OUTPUT) {
                     new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
                     LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-                } else if (tensor.name == TN_TOKEN_EMBD) {
+                } else if (name == TN_TOKEN_EMBD) {
                     new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
                     LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
                 } else {
@@ -3507,15 +3478,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
 #endif
 
-            const size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+            const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
             std::vector<float> f32_conv_buf;
 
-            if (tensor.type == GGML_TYPE_F32) {
-                f32_data = (float *) tensor.data;
-            } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
-                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
+            if (tensor->type == GGML_TYPE_F32) {
+                f32_data = (float *) tensor->data;
+            } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
+                throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
                 llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
|
LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
||||||
int64_t tot_count = 0;
|
int64_t tot_count = 0;
|
||||||
for (size_t i = 0; i < hist_cur.size(); i++) {
|
for (size_t i = 0; i < hist_cur.size(); i++) {
|
||||||
hist_all[i] += hist_cur[i];
|
hist_all[i] += hist_cur[i];
|
||||||
|
@@ -3585,12 +3556,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
             LLAMA_LOG_INFO("\n");
         }
-        total_size_org += tensor.size;
+        total_size_org += ggml_nbytes(tensor);
         total_size_new += new_size;
 
         // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_out, tensor.name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, tensor.name.c_str(), new_data, new_size);
+        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
 
         // write tensor data + padding
         fout.write((const char *) new_data, new_size);
@@ -3698,11 +3669,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         base_ctx = ggml_init(base_params);
 
-        model_loader->ggml_ctx = base_ctx;
-
         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file, /* prefetch */ 0, ggml_is_numa()));
         }
     }
 
@@ -3807,19 +3776,18 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
         ggml_tensor * base_t;
         if (model_loader) {
+            struct gguf_context * ctx_gguf = model_loader->ctx_gguf;
+
             // load from base model
-            if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
+            if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
                 LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
                 return 1;
             }
-            size_t idx = model_loader->tensors_map.name_to_idx[base_name];
-            llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
-            lt.data = (uint8_t *) lt.ggml_tensor->data;
-            model_loader->load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
-        }
-        else {
+
+            // TODO: not tested!! maybe not working!
+            base_t = model_loader->create_tensor(base_ctx, base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
+            model_loader->load_data_for(base_t);
+        } else {
             base_t = dest_t;
         }
 
@@ -4767,7 +4735,7 @@ int llama_token_to_str_with_model(const struct llama_model * model, llama_token
         }
         strncpy(str, result.c_str(), result.length());
         return result.length();
-    } else if (llama_is_unknown_token(model->vocab, token)) {
+    } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
         if (length < 3) {
             return -3;
         }