From 42f8fe19272554c2aafe1be5ab2366d0e136ce3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Thu, 17 Aug 2023 08:56:42 +0300
Subject: [PATCH 1/3] examples/gguf : no need to keep q option for quantization
 any more

---
 examples/gguf/gguf.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index d742dce17..dee00df87 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -233,16 +233,13 @@ int main(int argc, char ** argv) {
     const std::string fname(argv[1]);
     const std::string mode (argv[2]);

-    GGML_ASSERT((mode == "r" || mode == "w" || mode == "q") && "mode must be r, w or q");
+    GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");

     if (mode == "w") {
         GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
     } else if (mode == "r") {
         GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
         GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
-    } else if (mode == "q") {
-        llama_model_quantize_params params = llama_model_quantize_default_params();
-        llama_model_quantize(fname.c_str(), "quant.gguf", &params);
     }

     return 0;

From 5a0a2c5685544dc41304779fb3f05f2231e300bd Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Thu, 17 Aug 2023 15:18:16 +0200
Subject: [PATCH 2/3] llama.cpp : print actual model size

---
 llama.cpp | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 38a2d5ba8..5a1501651 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1023,6 +1023,7 @@ struct llama_model_loader {
     int n_kv      = 0;
     int n_tensors = 0;
     int n_created = 0;
+    size_t n_tot_elements = 0;

     bool use_mmap = false;

@@ -1047,6 +1048,16 @@ struct llama_model_loader {

         file_version = (enum llama_file_version) gguf_get_version(ctx_gguf);

+        for (int i = 0; i < n_tensors; i++) {
+            const char * name = gguf_get_tensor_name(ctx_gguf, i);
+            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
+            size_t elem = 1;
+            for (int j = 0; j < t->n_dims; j++) {
+                elem *= t->ne[j];
+            }
+            n_tot_elements += elem;
+        }
+
         // print meta data
         // TODO: make optional
         {
@@ -1413,7 +1424,8 @@ static void llama_model_load_internal(
         LLAMA_LOG_INFO("%s: freq_base  = %.1f\n",  __func__, hparams.rope_freq_base);
         LLAMA_LOG_INFO("%s: freq_scale = %g\n",    __func__, hparams.rope_freq_scale);
         LLAMA_LOG_INFO("%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        LLAMA_LOG_INFO("%s: model size = %s\n",    __func__, llama_model_type_name(model.type));
+        LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml->n_tot_elements*1e-9);
+
     }

     if (vocab_only) {

From d6fd53afd64417203d77e1530f2f7bf182ffa96e Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Thu, 17 Aug 2023 15:24:35 +0200
Subject: [PATCH 3/3] llama.cpp : use ggml_elements()

---
 llama.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 5a1501651..b7ca6db3c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1051,11 +1051,7 @@ struct llama_model_loader {
         for (int i = 0; i < n_tensors; i++) {
             const char * name = gguf_get_tensor_name(ctx_gguf, i);
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
-            size_t elem = 1;
-            for (int j = 0; j < t->n_dims; j++) {
-                elem *= t->ne[j];
-            }
-            n_tot_elements += elem;
+            n_tot_elements += ggml_nelements(t);
         }

         // print meta data
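
Note for readers skimming the series: a minimal, standalone sketch of what patches 2/3 compute at load time. The toy_tensor struct and toy_nelements() helper below are illustrative stand-ins, not the ggml API; the real loader calls ggml_nelements() from ggml.h on each tensor in the GGUF file and scales the running total by 1e-9 for the "model size = %.2f B" log line.

#include <cstdint>
#include <cstdio>
#include <vector>

// toy_tensor is an illustrative stand-in for ggml_tensor (not the real ggml API)
struct toy_tensor {
    int     n_dims;
    int64_t ne[4]; // elements per dimension, as in ggml_tensor::ne
};

// same product-of-dimensions that ggml_nelements() performs for a real tensor
static int64_t toy_nelements(const toy_tensor & t) {
    int64_t n = 1;
    for (int i = 0; i < t.n_dims; i++) {
        n *= t.ne[i];
    }
    return n;
}

int main() {
    // two made-up tensor shapes, roughly a token embedding and an FFN weight
    std::vector<toy_tensor> tensors = {
        { 2, { 4096, 32000, 1, 1 } },
        { 2, { 4096, 11008, 1, 1 } },
    };

    int64_t n_tot_elements = 0;
    for (const auto & t : tensors) {
        n_tot_elements += toy_nelements(t); // mirrors: n_tot_elements += ggml_nelements(t);
    }

    // same formatting as the new log line: total element count in billions of parameters
    std::printf("model size = %.2f B\n", n_tot_elements*1e-9);
    return 0;
}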