llama : move hparams and vocab from gguf_file_loader to llama_model_loader

Author: Georgi Gerganov
Date:   2023-08-15 21:09:27 +03:00
parent  6c3f824697
commit  a02b809a2e
2 changed files with 78 additions and 94 deletions

llama.cpp

@@ -367,6 +367,7 @@ struct llama_model {
     e_model type = MODEL_UNKNOWN;
     llama_hparams hparams;
+    llama_vocab vocab;
     struct ggml_tensor * tok_embeddings;
@@ -395,8 +396,6 @@ struct llama_model {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
-    llama_vocab vocab;
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -567,10 +566,8 @@ enum gguf_file_version {
 struct gguf_file_loader {
     gguf_file file;
-    gguf_context * gguf_ctx;
+    gguf_context * ctx_gguf;
     gguf_file_version file_version;
-    llama_hparams hparams;
-    llama_vocab vocab;
     struct ggml_context * ctx_data = NULL;
@@ -582,78 +579,18 @@ struct gguf_file_loader {
             /*.ctx = */ &ctx_data,
         };
-        gguf_ctx = gguf_init_from_file(fname, params);
-        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
-        read_hparams();
-        read_vocab();
+        ctx_gguf = gguf_init_from_file(fname, params);
+        file_version = (enum gguf_file_version) gguf_get_version(ctx_gguf);
         read_tensor_metadata(tensors_map);
     }
-    int read_n_vocab() const {
-        int i = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
-        if (i == -1) {
-            throw std::runtime_error("cannot find token list in GGUF file\n");
-        }
-        return gguf_get_arr_n(gguf_ctx, i);
-    }
-    void read_hparams() {
-        // TODO define keys as constants in header
-        // TODO: read all hparams from file
-        hparams.n_vocab        = read_n_vocab();
-        hparams.n_ctx          = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.context_length"));
-        hparams.n_embd         = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.embedding_length"));
-        hparams.n_ff           = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.feed_forward_length"));
-        hparams.n_head         = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.attention.head_count"));
-        hparams.n_layer        = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.block_count"));
-        hparams.n_rot          = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.rope.dimension_count"));
-        hparams.f_rms_norm_eps = gguf_get_val_f32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.rms_norm_epsilon"));
-        // n_head_kv default to n_head
-        hparams.n_head_kv = hparams.n_head;
-        {
-            const int idx = gguf_find_key(gguf_ctx, "llama.attention.head_count_kv");
-            if (idx >= 0) {
-                hparams.n_head_kv = gguf_get_val_u32(gguf_ctx, idx);
-            }
-        }
-    }
-    void read_vocab() {
-        vocab.id_to_token.resize(hparams.n_vocab);
-        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
-        if (token_idx == -1) {
-            throw std::runtime_error("cannot find token list in GGUF file\n");
-        }
-        const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
-        if (score_idx == -1) {
-            throw std::runtime_error("cannot find token scores list in GGUF file\n");
-        }
-        const float * scores = (const float * ) gguf_get_arr_data(gguf_ctx, score_idx);
-        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-            std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);
-            vocab.token_to_id[word] = i;
-            auto & tok_score = vocab.id_to_token[i];
-            tok_score.tok = std::move(word);
-            tok_score.score = scores[i];
-        }
-    }
     void read_tensor_metadata(gguf_load_tensors_map & tensors_map) const {
-        const int n_tensors = gguf_get_n_tensors(gguf_ctx);
+        const int n_tensors = gguf_get_n_tensors(ctx_gguf);
         for (int i = 0; i < n_tensors; ++i) {
             gguf_load_tensor tensor;
-            const char * name = gguf_get_tensor_name(gguf_ctx, i);
+            const char * name = gguf_get_tensor_name(ctx_gguf, i);
             struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
@@ -688,7 +625,7 @@ struct gguf_file_loader {
             }
         }
-        tensor.file_off = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, i);
+        tensor.file_off = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i);
         tensor.name = name;
         tensor.size = ggml_nbytes(cur);
@@ -929,15 +866,15 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
-int llama_max_devices() {
+int llama_max_devices(void) {
     return LLAMA_MAX_DEVICES;
 }
-bool llama_mmap_supported() {
+bool llama_mmap_supported(void) {
     return gguf_mmap::SUPPORTED;
 }
-bool llama_mlock_supported() {
+bool llama_mlock_supported(void) {
     return gguf_mlock::SUPPORTED;
 }
@@ -960,13 +897,13 @@ void llama_backend_init(bool numa) {
 #endif
 }
-void llama_backend_free() {
+void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
 #endif
 }
-int64_t llama_time_us() {
+int64_t llama_time_us(void) {
     return ggml_time_us();
 }
@@ -1044,14 +981,33 @@ static void llama_model_load_internal(
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
-    vocab = std::move(ml->file_loader->vocab);
-    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
     gguf_file_version file_version = ml->file_loader->file_version;
     auto & hparams = model.hparams;
+    // read hparams
     {
+        struct gguf_context * ctx = ml->file_loader->ctx_gguf;
+        hparams.n_vocab        = gguf_get_arr_n(ctx, gguf_find_key(ctx, "tokenizer.ggml.tokens"));
+        hparams.n_ctx          = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.context_length"));
+        hparams.n_embd         = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.embedding_length"));
+        hparams.n_ff           = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.feed_forward_length"));
+        hparams.n_head         = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.attention.head_count"));
+        hparams.n_layer        = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.block_count"));
+        hparams.n_rot          = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.rope.dimension_count"));
+        hparams.f_rms_norm_eps = gguf_get_val_f32(ctx, gguf_find_key(ctx, "llama.rms_norm_epsilon"));
+        // n_head_kv default to n_head
+        hparams.n_head_kv = hparams.n_head;
+        {
+            const int idx = gguf_find_key(ctx, "llama.attention.head_count_kv");
+            if (idx >= 0) {
+                hparams.n_head_kv = gguf_get_val_u32(ctx, idx);
+            }
+        }
         switch (hparams.n_layer) {
             case 26: model.type = e_model::MODEL_3B; break;
             case 32: model.type = e_model::MODEL_7B; break;
@@ -1083,7 +1039,34 @@ static void llama_model_load_internal(
         hparams.rope_freq_scale = rope_freq_scale;
     }
-    const uint32_t n_ff = hparams.n_ff;
+    // read vocab
+    {
+        struct gguf_context * ctx = ml->file_loader->ctx_gguf;
+        vocab.id_to_token.resize(hparams.n_vocab);
+        const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
+        if (token_idx == -1) {
+            throw std::runtime_error("cannot find token list in GGUF file\n");
+        }
+        const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
+        if (score_idx == -1) {
+            throw std::runtime_error("cannot find token scores list in GGUF file\n");
+        }
+        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
+            std::string word = gguf_get_arr_str(ctx, token_idx, i);
+            vocab.token_to_id[word] = i;
+            auto & tok_score = vocab.id_to_token[i];
+            tok_score.tok = std::move(word);
+            tok_score.score = scores[i];
+        }
+    }
     {
         LLAMA_LOG_INFO("%s: format = %s\n", __func__, gguf_file_version_name(file_version));
@@ -1096,7 +1079,7 @@ static void llama_model_load_internal(
     LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
     LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
     LLAMA_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
-    LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, n_ff);
+    LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
     LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
     LLAMA_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
@@ -1193,6 +1176,8 @@ static void llama_model_load_internal(
         }
     }
+    const uint32_t n_ff = hparams.n_ff;
     const int i_gpu_start = n_layer - n_gpu_layers;
     model.layers.resize(n_layer);
@@ -3087,7 +3072,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();
     // copy the KV pairs from the input file
-    gguf_set_kv(ctx_out, model_loader->file_loader->gguf_ctx);
+    gguf_set_kv(ctx_out, model_loader->file_loader->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
 #ifdef GGML_USE_K_QUANTS
@@ -4460,15 +4445,15 @@ std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token
     return std::string(result.data(), result.size());
 }
-llama_token llama_token_bos() {
+llama_token llama_token_bos(void) {
     return 1;
 }
-llama_token llama_token_eos() {
+llama_token llama_token_eos(void) {
    return 2;
 }
-llama_token llama_token_nl() {
+llama_token llama_token_nl(void) {
    return 13;
 }
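
For reference, the optional-key pattern used in the relocated hparams block (gguf_find_key() returns -1 when a key is absent, so "llama.attention.head_count_kv" defaults to n_head) can be factored into a small helper. This is a minimal sketch, not code from the commit; the helper name gguf_get_u32_or is made up, and it assumes a gguf_context obtained as in the hunks above (the gguf_* API was declared in ggml.h at this point in the tree):

    #include <stdint.h>
    #include "ggml.h" // gguf_find_key / gguf_get_val_u32 lived here at the time

    // Hypothetical helper: read a u32 KV pair, falling back to a default
    // when the key is missing from the GGUF file.
    static uint32_t gguf_get_u32_or(struct gguf_context * ctx, const char * key, uint32_t def) {
        const int idx = gguf_find_key(ctx, key);
        if (idx < 0) {
            return def; // optional key not present in the file
        }
        return gguf_get_val_u32(ctx, idx);
    }

    // usage sketch: hparams.n_head_kv = gguf_get_u32_or(ctx, "llama.attention.head_count_kv", hparams.n_head);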

llama.h

@@ -194,13 +194,12 @@ extern "C" {
     // If this is not called, or NULL is supplied, everything is output on stderr.
     LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
-    LLAMA_API int llama_max_devices();
-    LLAMA_API struct llama_context_params llama_context_default_params();
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
+    LLAMA_API int llama_max_devices(void);
+    LLAMA_API bool llama_mmap_supported(void);
+    LLAMA_API bool llama_mlock_supported(void);
     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
@@ -208,9 +207,9 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(bool numa);
     // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free();
+    LLAMA_API void llama_backend_free(void);
-    LLAMA_API int64_t llama_time_us();
+    LLAMA_API int64_t llama_time_us(void);
     LLAMA_API struct llama_model * llama_load_model_from_file(
         const char * path_model,
@@ -377,9 +376,9 @@ extern "C" {
         char * str,
         int length);
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+    LLAMA_API llama_token llama_token_eos(void); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl(); // next-line
+    LLAMA_API llama_token llama_token_nl(void); // next-line
     // Grammar
     //
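
The header-side edits are purely about C prototypes: in C, an empty parameter list leaves the arguments unspecified, while (void) declares a true no-argument prototype that lets the compiler reject bad calls. An illustration (the function names below are made up, not from llama.h):

    /* old style: unspecified parameters, so a stray argument goes undiagnosed */
    int max_devices_old_style();
    /* new style: real prototype, so max_devices_new_style(42) is a compile-time error */
    int max_devices_new_style(void);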