From 42f8fe19272554c2aafe1be5ab2366d0e136ce3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?=
Date: Thu, 17 Aug 2023 08:56:42 +0300
Subject: [PATCH 1/3] examples/gguf : no need to keep q option for quantization
 any more

---
 examples/gguf/gguf.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index d742dce17..dee00df87 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -233,16 +233,13 @@ int main(int argc, char ** argv) {
     const std::string fname(argv[1]);
     const std::string mode (argv[2]);

-    GGML_ASSERT((mode == "r" || mode == "w" || mode == "q") && "mode must be r, w or q");
+    GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");

     if (mode == "w") {
         GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
     } else if (mode == "r") {
         GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
         GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
-    } else if (mode == "q") {
-        llama_model_quantize_params params = llama_model_quantize_default_params();
-        llama_model_quantize(fname.c_str(), "quant.gguf", &params);
     }

     return 0;

From 5a0a2c5685544dc41304779fb3f05f2231e300bd Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Thu, 17 Aug 2023 15:18:16 +0200
Subject: [PATCH 2/3] llama.cpp : print actual model size

---
 llama.cpp | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 38a2d5ba8..5a1501651 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1023,6 +1023,7 @@ struct llama_model_loader {
     int n_kv      = 0;
     int n_tensors = 0;
     int n_created = 0;
+    size_t n_tot_elements = 0;

     bool use_mmap = false;

@@ -1047,6 +1048,16 @@ struct llama_model_loader {

         file_version = (enum llama_file_version) gguf_get_version(ctx_gguf);

+        for (int i = 0; i < n_tensors; i++) {
+            const char * name = gguf_get_tensor_name(ctx_gguf, i);
+            struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
+            size_t elem = 1;
+            for (int j = 0; j < t->n_dims; j++) {
+                elem *= t->ne[j];
+            }
+            n_tot_elements += elem;
+        }
+
         // print meta data
         // TODO: make optional
         {
@@ -1413,7 +1424,8 @@ static void llama_model_load_internal(
         LLAMA_LOG_INFO("%s: freq_base  = %.1f\n",  __func__, hparams.rope_freq_base);
         LLAMA_LOG_INFO("%s: freq_scale = %g\n",    __func__, hparams.rope_freq_scale);
         LLAMA_LOG_INFO("%s: ftype      = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
-        LLAMA_LOG_INFO("%s: model size = %s\n",    __func__, llama_model_type_name(model.type));
+        LLAMA_LOG_INFO("%s: model size = %.2f B\n", __func__, ml->n_tot_elements*1e-9);
+
     }

     if (vocab_only) {

From d6fd53afd64417203d77e1530f2f7bf182ffa96e Mon Sep 17 00:00:00 2001
From: klosax <131523366+klosax@users.noreply.github.com>
Date: Thu, 17 Aug 2023 15:24:35 +0200
Subject: [PATCH 3/3] llama.cpp : use ggml_elements()

---
 llama.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 5a1501651..b7ca6db3c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1051,11 +1051,7 @@ struct llama_model_loader {
         for (int i = 0; i < n_tensors; i++) {
             const char * name = gguf_get_tensor_name(ctx_gguf, i);
             struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name);
-            size_t elem = 1;
-            for (int j = 0; j < t->n_dims; j++) {
-                elem *= t->ne[j];
-            }
-            n_tot_elements += elem;
+            n_tot_elements += ggml_nelements(t);
         }

         // print meta data
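
Note for readers skimming the series: a minimal, standalone sketch of what patches 2/3 compute at load time. The toy_tensor struct and toy_nelements() helper below are illustrative stand-ins, not the ggml API; the real loader calls ggml_nelements() from ggml.h on each tensor in the GGUF file and scales the running total by 1e-9 for the "model size = %.2f B" log line.

#include <cstdint>
#include <cstdio>
#include <vector>

// toy_tensor is an illustrative stand-in for ggml_tensor (not the real ggml API)
struct toy_tensor {
    int     n_dims;
    int64_t ne[4]; // elements per dimension, as in ggml_tensor::ne
};

// same product-of-dimensions that ggml_nelements() performs for a real tensor
static int64_t toy_nelements(const toy_tensor & t) {
    int64_t n = 1;
    for (int i = 0; i < t.n_dims; i++) {
        n *= t.ne[i];
    }
    return n;
}

int main() {
    // two made-up tensor shapes, roughly a token embedding and an FFN weight
    std::vector<toy_tensor> tensors = {
        { 2, { 4096, 32000, 1, 1 } },
        { 2, { 4096, 11008, 1, 1 } },
    };

    int64_t n_tot_elements = 0;
    for (const auto & t : tensors) {
        n_tot_elements += toy_nelements(t); // mirrors: n_tot_elements += ggml_nelements(t);
    }

    // same formatting as the new log line: total element count in billions of parameters
    std::printf("model size = %.2f B\n", n_tot_elements*1e-9);
    return 0;
}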