diff --git a/gguf-llama.cpp b/gguf-llama.cpp
index 895fc98bd..c62840e88 100644
--- a/gguf-llama.cpp
+++ b/gguf-llama.cpp
@@ -367,6 +367,7 @@ struct llama_model {
     e_model type = MODEL_UNKNOWN;
 
     llama_hparams hparams;
+    llama_vocab vocab;
 
     struct ggml_tensor * tok_embeddings;
 
@@ -395,8 +396,6 @@ struct llama_model {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
-    llama_vocab vocab;
-
     ~llama_model() {
         if (ctx) {
             ggml_free(ctx);
@@ -567,10 +566,8 @@ enum gguf_file_version {
 
 struct gguf_file_loader {
     gguf_file file;
-    gguf_context * gguf_ctx;
+    gguf_context * ctx_gguf;
     gguf_file_version file_version;
-    llama_hparams hparams;
-    llama_vocab vocab;
 
     struct ggml_context * ctx_data = NULL;
 
@@ -582,78 +579,18 @@ struct gguf_file_loader {
             /*.ctx = */ &ctx_data,
         };
 
-        gguf_ctx = gguf_init_from_file(fname, params);
-        file_version = (enum gguf_file_version) gguf_get_version(gguf_ctx);
+        ctx_gguf = gguf_init_from_file(fname, params);
+        file_version = (enum gguf_file_version) gguf_get_version(ctx_gguf);
 
-        read_hparams();
-        read_vocab();
         read_tensor_metadata(tensors_map);
     }
 
-    int read_n_vocab() const {
-        int i = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
-        if (i == -1) {
-            throw std::runtime_error("cannot find token list in GGUF file\n");
-        }
-
-        return gguf_get_arr_n(gguf_ctx, i);
-    }
-
-    void read_hparams() {
-        // TODO define keys as constants in header
-        // TODO: read all hparams from file
-
-        hparams.n_vocab = read_n_vocab();
-        hparams.n_ctx = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.context_length"));
-        hparams.n_embd = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.embedding_length"));
-        hparams.n_ff = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.feed_forward_length"));
-        hparams.n_head = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.attention.head_count"));
-        hparams.n_layer = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.block_count"));
-        hparams.n_rot = gguf_get_val_u32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.rope.dimension_count"));
-        hparams.f_rms_norm_eps = gguf_get_val_f32(gguf_ctx, gguf_find_key(gguf_ctx, "llama.rms_norm_epsilon"));
-
-        // n_head_kv default to n_head
-        hparams.n_head_kv = hparams.n_head;
-        {
-            const int idx = gguf_find_key(gguf_ctx, "llama.attention.head_count_kv");
-            if (idx >= 0) {
-                hparams.n_head_kv = gguf_get_val_u32(gguf_ctx, idx);
-            }
-        }
-    }
-
-    void read_vocab() {
-        vocab.id_to_token.resize(hparams.n_vocab);
-
-        const int token_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.tokens");
-        if (token_idx == -1) {
-            throw std::runtime_error("cannot find token list in GGUF file\n");
-        }
-
-        const int score_idx = gguf_find_key(gguf_ctx, "tokenizer.ggml.scores");
-        if (score_idx == -1) {
-            throw std::runtime_error("cannot find token scores list in GGUF file\n");
-        }
-
-        const float * scores = (const float * ) gguf_get_arr_data(gguf_ctx, score_idx);
-
-        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
-            std::string word = gguf_get_arr_str(gguf_ctx, token_idx, i);
-
-            vocab.token_to_id[word] = i;
-
-            auto & tok_score = vocab.id_to_token[i];
-            tok_score.tok = std::move(word);
-            tok_score.score = scores[i];
-        }
-    }
-
     void read_tensor_metadata(gguf_load_tensors_map & tensors_map) const {
-        const int n_tensors = gguf_get_n_tensors(gguf_ctx);
+        const int n_tensors = gguf_get_n_tensors(ctx_gguf);
 
         for (int i = 0; i < n_tensors; ++i) {
             gguf_load_tensor tensor;
-            const char * name = gguf_get_tensor_name(gguf_ctx, i);
+            const char * name = gguf_get_tensor_name(ctx_gguf, i);
 
             struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
 
@@ -688,7 +625,7 @@ struct gguf_file_loader {
                 }
             }
 
-            tensor.file_off = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, i);
+            tensor.file_off = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i);
 
             tensor.name = name;
             tensor.size = ggml_nbytes(cur);
@@ -929,15 +866,15 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
     return result;
 }
 
-int llama_max_devices() {
+int llama_max_devices(void) {
    return LLAMA_MAX_DEVICES;
 }
 
-bool llama_mmap_supported() {
+bool llama_mmap_supported(void) {
     return gguf_mmap::SUPPORTED;
 }
 
-bool llama_mlock_supported() {
+bool llama_mlock_supported(void) {
     return gguf_mlock::SUPPORTED;
 }
 
@@ -960,13 +897,13 @@ void llama_backend_init(bool numa) {
 #endif
 }
 
-void llama_backend_free() {
+void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
 #endif
 }
 
-int64_t llama_time_us() {
+int64_t llama_time_us(void) {
     return ggml_time_us();
 }
 
@@ -1044,14 +981,33 @@ static void llama_model_load_internal(
 
     std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
 
-    vocab = std::move(ml->file_loader->vocab);
-    model.hparams = ml->file_loader->hparams;
     model.n_gpu_layers = n_gpu_layers;
     gguf_file_version file_version = ml->file_loader->file_version;
 
     auto & hparams = model.hparams;
 
+    // read hparams
     {
+        struct gguf_context * ctx = ml->file_loader->ctx_gguf;
+
+        hparams.n_vocab = gguf_get_arr_n (ctx, gguf_find_key(ctx, "tokenizer.ggml.tokens"));
+        hparams.n_ctx = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.context_length"));
+        hparams.n_embd = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.embedding_length"));
+        hparams.n_ff = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.feed_forward_length"));
+        hparams.n_head = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.attention.head_count"));
+        hparams.n_layer = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.block_count"));
+        hparams.n_rot = gguf_get_val_u32(ctx, gguf_find_key(ctx, "llama.rope.dimension_count"));
+        hparams.f_rms_norm_eps = gguf_get_val_f32(ctx, gguf_find_key(ctx, "llama.rms_norm_epsilon"));
+
+        // n_head_kv default to n_head
+        hparams.n_head_kv = hparams.n_head;
+        {
+            const int idx = gguf_find_key(ctx, "llama.attention.head_count_kv");
+            if (idx >= 0) {
+                hparams.n_head_kv = gguf_get_val_u32(ctx, idx);
+            }
+        }
+
         switch (hparams.n_layer) {
             case 26: model.type = e_model::MODEL_3B; break;
             case 32: model.type = e_model::MODEL_7B; break;
@@ -1083,7 +1039,34 @@ static void llama_model_load_internal(
         hparams.rope_freq_base = rope_freq_base;
         hparams.rope_freq_scale = rope_freq_scale;
     }
 
-    const uint32_t n_ff = hparams.n_ff;
+    // read vocab
+    {
+        struct gguf_context * ctx = ml->file_loader->ctx_gguf;
+
+        vocab.id_to_token.resize(hparams.n_vocab);
+
+        const int token_idx = gguf_find_key(ctx, "tokenizer.ggml.tokens");
+        if (token_idx == -1) {
+            throw std::runtime_error("cannot find token list in GGUF file\n");
+        }
+
+        const int score_idx = gguf_find_key(ctx, "tokenizer.ggml.scores");
+        if (score_idx == -1) {
+            throw std::runtime_error("cannot find token scores list in GGUF file\n");
+        }
+
+        const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
+
+        for (uint32_t i = 0; i < hparams.n_vocab; i++) {
+            std::string word = gguf_get_arr_str(ctx, token_idx, i);
+
+            vocab.token_to_id[word] = i;
+
+            auto & tok_score = vocab.id_to_token[i];
+            tok_score.tok = std::move(word);
+            tok_score.score = scores[i];
+        }
+    }
 
     {
         LLAMA_LOG_INFO("%s: format = %s\n", __func__, gguf_file_version_name(file_version));
@@ -1096,7 +1079,7 @@ static void llama_model_load_internal(
         LLAMA_LOG_INFO("%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim
         LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa());
         LLAMA_LOG_INFO("%s: rnorm_eps = %.1e\n", __func__, hparams.f_rms_norm_eps);
-        LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, n_ff);
+        LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
         LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base);
         LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale);
         LLAMA_LOG_INFO("%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
@@ -1193,6 +1176,8 @@ static void llama_model_load_internal(
             }
         }
 
+        const uint32_t n_ff = hparams.n_ff;
+
         const int i_gpu_start = n_layer - n_gpu_layers;
 
         model.layers.resize(n_layer);
@@ -3087,7 +3072,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
-    gguf_set_kv(ctx_out, model_loader->file_loader->gguf_ctx);
+    gguf_set_kv(ctx_out, model_loader->file_loader->ctx_gguf);
     gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
 
 #ifdef GGML_USE_K_QUANTS
@@ -4460,15 +4445,15 @@ std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token
     return std::string(result.data(), result.size());
 }
 
-llama_token llama_token_bos() {
+llama_token llama_token_bos(void) {
     return 1;
 }
 
-llama_token llama_token_eos() {
+llama_token llama_token_eos(void) {
     return 2;
 }
 
-llama_token llama_token_nl() {
+llama_token llama_token_nl(void) {
     return 13;
 }
 
diff --git a/gguf-llama.h b/gguf-llama.h
index f342a534c..62e48b13c 100644
--- a/gguf-llama.h
+++ b/gguf-llama.h
@@ -194,13 +194,12 @@ extern "C" {
     // If this is not called, or NULL is supplied, everything is output on stderr.
     LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
 
-    LLAMA_API int llama_max_devices();
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
-    LLAMA_API struct llama_context_params llama_context_default_params();
-    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
-
-    LLAMA_API bool llama_mmap_supported();
-    LLAMA_API bool llama_mlock_supported();
+    LLAMA_API int llama_max_devices(void);
+    LLAMA_API bool llama_mmap_supported(void);
+    LLAMA_API bool llama_mlock_supported(void);
 
     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
@@ -208,9 +207,9 @@ extern "C" {
     // Call once at the start of the program
     LLAMA_API void llama_backend_init(bool numa);
     // Call once at the end of the program - currently only used for MPI
-    LLAMA_API void llama_backend_free();
+    LLAMA_API void llama_backend_free(void);
 
-    LLAMA_API int64_t llama_time_us();
+    LLAMA_API int64_t llama_time_us(void);
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
                              const char * path_model,
@@ -377,9 +376,9 @@ extern "C" {
                              char * str,
                              int length);
 
     // Special tokens
-    LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
-    LLAMA_API llama_token llama_token_eos(); // end-of-sentence
-    LLAMA_API llama_token llama_token_nl();  // next-line
+    LLAMA_API llama_token llama_token_bos(void); // beginning-of-sentence
+    LLAMA_API llama_token llama_token_eos(void); // end-of-sentence
+    LLAMA_API llama_token llama_token_nl(void);  // next-line
 
     // Grammar
     //
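
Note: after this change, all model metadata is read straight from the GGUF key/value store through the generic gguf_* accessors used above (gguf_init_from_file, gguf_find_key, gguf_get_val_u32, gguf_get_arr_n, ...). As a minimal standalone sketch of that pattern — not part of this patch; the file name "model.gguf" and the ggml.h include location are illustrative assumptions — reading a single hyperparameter from a GGUF file looks roughly like this:

    // sketch: read llama.context_length from a GGUF file with the same API the loader uses
    #include <cstdio>

    #include "ggml.h" // assumption: the gguf_* declarations live here on this branch

    int main(void) {
        struct ggml_context * ctx_data = NULL;

        struct gguf_init_params params = {
            /*.no_alloc = */ true,      // parse metadata only, do not allocate tensor data
            /*.ctx      = */ &ctx_data, // receives a ggml context with the tensor descriptors
        };

        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params); // hypothetical path
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model.gguf\n");
            return 1;
        }

        // gguf_find_key() returns -1 when a key is missing, so check before reading
        const int idx = gguf_find_key(ctx, "llama.context_length");
        if (idx >= 0) {
            printf("n_ctx = %u\n", gguf_get_val_u32(ctx, idx));
        }

        gguf_free(ctx);
        if (ctx_data) {
            ggml_free(ctx_data);
        }
        return 0;
    }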