llama : style formatting + remove helper methods

Georgi Gerganov 2023-08-15 08:51:07 +03:00
parent 2dd5d2c92c
commit a82e3a4d92
3 changed files with 79 additions and 115 deletions

ggml.h (10 changed lines)

@@ -1744,12 +1744,12 @@ extern "C" {
     GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
     GGML_API void * gguf_get_data       (struct gguf_context * ctx);

     GGML_API int          gguf_get_n_kv(struct gguf_context * ctx);
     GGML_API int          gguf_find_key(struct gguf_context * ctx, const char * key);
     GGML_API const char * gguf_get_key (struct gguf_context * ctx, int i);

     GGML_API enum gguf_type gguf_get_kv_type (struct gguf_context * ctx, int i);
-    GGML_API enum gguf_type gguf_get_arr_type (struct gguf_context * ctx, int i);
-    GGML_API void gguf_get_val (struct gguf_context * ctx, int i, void * val);
+    GGML_API enum gguf_type gguf_get_arr_type(struct gguf_context * ctx, int i);
+
     GGML_API const char * gguf_get_arr_str(struct gguf_context * ctx, int key_id, int i);
     GGML_API float        gguf_get_arr_f32(struct gguf_context * ctx, int key_id, int i);
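For reference, a minimal sketch (not part of this commit) of how the GGUF getters declared above are typically used together. The file name and key are placeholders, error handling is omitted, and gguf_free is assumed to be available from ggml.h:

    #include <stdio.h>
    #include "ggml.h"

    // dump the first entry of the tokenizer token list from a GGUF file
    static void dump_first_token(const char * fname) {
        struct gguf_init_params params = {
            /*.no_alloc = */ true,   // read metadata only, do not allocate tensor data
            /*.ctx      = */ NULL,
        };
        struct gguf_context * ctx = gguf_init_from_file(fname, params);

        const int key_id = gguf_find_key(ctx, "tokenizer.ggml.tokens");
        if (key_id != -1 && gguf_get_arr_type(ctx, key_id) == GGUF_TYPE_STRING) {
            fprintf(stderr, "first token: %s\n", gguf_get_arr_str(ctx, key_id, 0));
        }

        gguf_free(ctx);
    }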

llama.cpp

@@ -510,22 +510,9 @@ struct llama_state {
 // global state
 static llama_state g_state;

-template <typename T>
-static T checked_mul(T a, T b) {
-    T ret = a * b;
-    if (a != 0 && ret / a != b) {
-        throw std::runtime_error(format("overflow multiplying %llu * %llu",
-                (unsigned long long) a, (unsigned long long) b));
-    }
-    return ret;
-}
-
-static size_t checked_div(size_t a, size_t b) {
-    if (b == 0 || a % b != 0) {
-        throw std::runtime_error(format("error dividing %zu / %zu", a, b));
-    }
-    return a / b;
-}
+//
+// model loading and saving
+//

 static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     char buf[256];
@@ -536,14 +523,6 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     return buf;
 }

-static size_t llama_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_type type) {
-    size_t size = ggml_type_size(type);
-    for (uint32_t dim : ne) {
-        size = checked_mul<size_t>(size, dim);
-    }
-    return size / ggml_blck_size(type);
-}
-
 struct gguf_load_tensor {
     std::string name;
     enum ggml_type type = GGML_TYPE_F32;
@@ -573,20 +552,19 @@ struct gguf_file_loader {
     struct ggml_context * ctx_data = NULL;

-    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map)
-        : file(fname, "rb") {
+    gguf_file_loader(const char * fname, gguf_load_tensors_map & tensors_map) : file(fname, "rb") {
         fprintf(stderr, "llama.cpp: loading model from %s\n", fname);

         struct gguf_init_params params = {
             /*.no_alloc = */ true,
             /*.ctx      = */ &ctx_data,
         };

         gguf_ctx = gguf_init_from_file(fname, params);

         file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);

         read_hparams();
         read_vocab();
         read_tensor_metadata(tensors_map);
     }
@@ -637,18 +615,18 @@ struct gguf_file_loader {
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);

-        int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
+        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
         if (token_idx == -1) {
             throw std::runtime_error("cannot find token list in GGUF file\n");
         }

-        int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
+        const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
         if (score_idx == -1) {
             throw std::runtime_error("cannot find token scores list in GGUF file\n");
         }

         for (uint32_t i = 0; i < hparams.n_vocab; i++) {
             std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);

             vocab.token_to_id[word] = i;
@@ -702,7 +680,7 @@ struct gguf_file_loader {
             tensor.file_off = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, i);

             tensor.name = name;
-            tensor.size = llama_calc_tensor_size(tensor.ne, tensor.type);
+            tensor.size = ggml_nbytes(cur);

             tensors_map.tensors.push_back(tensor);
             tensors_map.name_to_idx[name] = tensors_map.tensors.size() - 1;
@@ -787,7 +765,7 @@ struct gguf_file_saver {
         gguf_type arr_type;
         int n_arr;

-        switch(vtype) {
+        switch (vtype) {
            case GGUF_TYPE_BOOL:
                bool_val = gguf_get_val_bool(fl->gguf_ctx, i);
                file.write_val<bool>(key, GGUF_TYPE_BOOL, bool_val);
@@ -810,7 +788,7 @@ struct gguf_file_saver {
                break;
            case GGUF_TYPE_STRING:
                str_val = gguf_get_val_str(fl->gguf_ctx, i);
-               file.write_val<std::string>(key, GGUF_TYPE_STRING, str_val);
+               file.write_str(key, GGUF_TYPE_STRING, str_val);
                break;
            case GGUF_TYPE_UINT16:
                u16_val = gguf_get_val_u16(fl->gguf_ctx, i);
@@ -826,7 +804,7 @@ struct gguf_file_saver {
                break;
            case GGUF_TYPE_ARRAY:
                arr_type = gguf_get_arr_type(fl->gguf_ctx, i);
-               n_arr = gguf_get_arr_n(fl->gguf_ctx, i);
+               n_arr    = gguf_get_arr_n (fl->gguf_ctx, i);
                if (arr_type == GGUF_TYPE_FLOAT32) {
                    write_hparam_arr_f32(key, arr_type, i, n_arr);
                } else if (arr_type == GGUF_TYPE_STRING) {
@@ -923,20 +901,6 @@ struct llama_model_loader {
         }
     }

-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
-        auto it = tensors_map.name_to_idx.find(name);
-        if (it == tensors_map.name_to_idx.end()) {
-            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
-        }
-
-        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
-        if (lt.ne != ne) {
-            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
-                        name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
-        }
-
-        return get_tensor_for(lt, backend);
-    }
-
     struct ggml_tensor * get_tensor_for(gguf_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (backend != GGML_BACKEND_CPU) {
@@ -960,16 +924,41 @@ struct llama_model_loader {
         return tensor;
     }

+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
+        auto it = tensors_map.name_to_idx.find(name);
+        if (it == tensors_map.name_to_idx.end()) {
+            throw std::runtime_error(std::runtime_error(format("llama.cpp: tensor '%s' is missing from model", name.c_str())));
+        }
+
+        gguf_load_tensor & lt = tensors_map.tensors.at(it->second);
+        if (lt.ne != ne) {
+            throw std::runtime_error(format("llama.cpp: tensor '%s' has wrong shape; expected %s, got %s",
+                        name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str()));
+        }
+
+        return get_tensor_for(lt, backend);
+    }
+
     void done_getting_tensors() const {
         if (num_ggml_tensors_created != tensors_map.tensors.size()) {
             throw std::runtime_error(std::string("llama.cpp: file contained more tensors than expected"));
         }
     }

-    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
-        size_t data_size = 0;
+    void load_data_for(gguf_load_tensor & lt) const {
+        if (use_mmap) {
+            lt.data = (uint8_t *) mapping->addr + lt.file_off;
+        } else {
+            gguf_file & file = file_loader->file;
+            file.seek(lt.file_off, SEEK_SET);
+            file.read_raw(lt.data, lt.size);
+        }
+    }
+
+    void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, gguf_mlock * lmlock) {
+        size_t data_size = 0;
         size_t prefetch_size = 0;
         size_t lock_size = 0;

         for (const gguf_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
             if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
@@ -1031,31 +1020,6 @@ struct llama_model_loader {
             done_size += lt.size;
         }
     }

-    void load_data_for(gguf_load_tensor & lt) {
-        if (use_mmap) {
-            lt.data = (uint8_t *) mapping->addr + lt.file_off;
-        } else {
-            gguf_file & file = file_loader->file;
-            file.seek(lt.file_off, SEEK_SET);
-            file.read_raw(lt.data, lt.size);
-        }
-
-        if (0) {
-            print_checksum(lt);
-        }
-    }
-
-    static void print_checksum(gguf_load_tensor & lt) {
-        uint32_t sum = 0;
-        for (size_t i = 0; i < lt.size; i++) {
-            uint8_t byte = lt.data[i];
-            sum = byte + (sum << 6) + (sum << 16) - sum; // sdbm hash
-        }
-        fprintf(stderr, "%s checksum: %#08x (%s, size %zu)\n", lt.name.c_str(), sum,
-                llama_format_tensor_shape(lt.ne).c_str(), lt.size);
-    }
-
 };

 //
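The debug path (the if (0) { print_checksum(lt); } block) is dropped together with the helper. For reference, the sdbm hash it computed, written as a standalone sketch in case the same checksum is ever needed again for debugging:

    #include <cstddef>
    #include <cstdint>

    // sdbm hash over a byte buffer, matching the removed print_checksum helper
    static uint32_t sdbm_checksum(const uint8_t * data, size_t n) {
        uint32_t sum = 0;
        for (size_t i = 0; i < n; i++) {
            sum = data[i] + (sum << 6) + (sum << 16) - sum;
        }
        return sum;
    }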
@@ -1185,18 +1149,18 @@ int64_t llama_time_us() {
 }

 //
-// model loading
+// load LLaMA models
 //

-static const char *gguf_file_version_name(gguf_file_version version) {
+static const char * gguf_file_version_name(gguf_file_version version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (latest)";
     }

     return "unknown";
 }

-static const char *llama_ftype_name(enum llama_ftype ftype) {
+static const char * llama_ftype_name(enum llama_ftype ftype) {
     switch (ftype) {
         case LLAMA_FTYPE_ALL_F32:     return "all F32";
         case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
@@ -1207,8 +1171,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
+
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K:   return "mostly Q2_K";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "mostly Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "mostly Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "mostly Q3_K - Large";
@@ -1216,15 +1181,16 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_K_M: return "mostly Q4_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "mostly Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "mostly Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K:   return "mostly Q6_K";
-        default: return "unknown, may not work";
+
+        default: return "unknown, may not work";
     }
 }

-static const char *llama_model_type_name(e_model type) {
+static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_3B:  return "3B";
         case MODEL_7B:  return "7B";
         case MODEL_13B: return "13B";
         case MODEL_30B: return "30B";
         case MODEL_65B: return "65B";
@@ -1605,7 +1571,6 @@ static struct ggml_cgraph * llama_build_graph(
     const int64_t n_embd_head = hparams.n_embd_head();
     const int64_t n_embd_gqa  = hparams.n_embd_gqa();
-
     GGML_ASSERT(n_embd_head == hparams.n_rot);

     const float freq_base = hparams.rope_freq_base;
@@ -1714,7 +1679,7 @@ static struct ggml_cgraph * llama_build_graph(
         struct ggml_tensor * inpSA = inpL;

-        lctx.use_buf(ctx0, 0);
+        llama_context::use_buf(ctx0, 0);

         // norm
         {
@@ -1853,7 +1818,7 @@ static struct ggml_cgraph * llama_build_graph(
             ggml_set_name(cur, "result_wo");
         }

-        lctx.use_buf(ctx0, 1);
+        llama_context::use_buf(ctx0, 1);

         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
         offload_func(inpFF);
@@ -1909,7 +1874,7 @@ static struct ggml_cgraph * llama_build_graph(
         inpL = cur;
     }

-    lctx.use_buf(ctx0, 0);
+    llama_context::use_buf(ctx0, 0);

     // norm
     {
@@ -1927,7 +1892,7 @@ static struct ggml_cgraph * llama_build_graph(
     cur = ggml_mul_mat(ctx0, model.output, cur);
     ggml_set_name(cur, "result_output");

-    lctx.use_buf(ctx0, -1);
+    llama_context::use_buf(ctx0, -1);

     // logits -> probs
     //cur = ggml_soft_max_inplace(ctx0, cur);
@@ -2997,9 +2962,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         }
     }

-    const auto rejects =
-        llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
-    for (auto & reject : rejects) {
+    const auto rejects = llama_grammar_reject_candidates(grammar->rules, grammar->stacks, candidates_grammar);
+    for (const auto & reject : rejects) {
         candidates->data[reject.index].logit = -INFINITY;
     }
@@ -3726,7 +3690,7 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        const llama_model_quantize_params *params) {
+        const llama_model_quantize_params * params) {
     try {
         llama_model_quantize_internal(fname_inp, fname_out, params);
         return 0;
@@ -4344,8 +4308,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c
     GGML_UNUSED(n_token_capacity);
     GGML_UNUSED(n_token_count_out);

-    // TODO: implement with GGUF format
-
+    // TODO: implement with GGUF format
     return true;
 }
@@ -4390,7 +4353,6 @@ int llama_eval(
     return 0;
 }

-
 int llama_eval_embd(
         struct llama_context * ctx,
         const float * embd,

gguf-util.h

@@ -122,9 +122,10 @@ struct gguf_file {
     template<typename T>
     void write_val(const std::string & key, enum gguf_type type, const T & val) {
+        static_assert(std::is_fundamental<T>::value, "T must be a primitive type");
         write_str(key);
         fwrite((const char *) &type, sizeof(type), 1, fp);
         fwrite((const char *) &val, sizeof(val), 1, fp);
     }

     template<typename T>
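The static_assert restricts the generic write_val to fundamental types, which is why the saver in llama.cpp above now routes GGUF_TYPE_STRING values through write_str. A small illustration (a sketch only; the output path and key names are placeholders):

    gguf_file file("out.gguf", "wb");                                        // hypothetical output file
    file.write_val<uint32_t>("general.alignment", GGUF_TYPE_UINT32, 32u);   // OK: fundamental type
    // file.write_val<std::string>("general.name", GGUF_TYPE_STRING, name); // would now fail to compile
    file.write_str("general.name", GGUF_TYPE_STRING, "example");            // strings go through write_str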
@@ -137,7 +138,7 @@ struct gguf_file {
         const int32_t n = val.size();
         fwrite((const char *) &type, sizeof(type), 1, fp);
         fwrite((const char *) &n, sizeof(n), 1, fp);
         fwrite(val.data(), sizeof(T), n, fp);
     }
@@ -159,7 +160,7 @@ struct gguf_file {
         const int32_t n = val.size();
         fwrite((const char *) &type, sizeof(type), 1, fp);
         fwrite((const char *) &n, sizeof(n), 1, fp);

         for (int i = 0; i < n; ++i) {
             const int32_t nstr = val[i].size();
             fwrite((const char *) &nstr, sizeof(nstr), 1, fp);
@@ -265,7 +266,7 @@ struct gguf_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;

-    gguf_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) {
+    gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) {
         (void) numa;

         size = file->size;
@@ -312,7 +313,8 @@ struct gguf_mmap {
 #else
     static constexpr bool SUPPORTED = false;

-    gguf_mmap(struct llama_file *, bool prefetch = true, bool numa = false) {
+    gguf_mmap(struct gguf_file * file, bool prefetch = true, bool numa = false) {
+        (void) file;
         (void) prefetch;
         (void) numa;