From e524750a6c23dbe238649ea528f2b9a949b3c499 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 16 Aug 2023 14:24:04 +0300
Subject: [PATCH] llama : improve printing + log meta data

---
 ggml.c         | 17 ++++++++++++++
 ggml.h         |  2 ++
 gguf-llama.cpp | 61 +++++++++++++++++++++++++++++++++++---------------
 3 files changed, 62 insertions(+), 18 deletions(-)

diff --git a/ggml.c b/ggml.c
index 261695216..77f57a3fd 100644
--- a/ggml.c
+++ b/ggml.c
@@ -18583,6 +18583,19 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
 };
 static_assert(GGUF_TYPE_COUNT == 10, "GGUF_TYPE_COUNT != 10");
 
+static const char * GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
+    [GGUF_TYPE_UINT8]   = "uint8",
+    [GGUF_TYPE_INT8]    = "int8",
+    [GGUF_TYPE_UINT16]  = "uint16",
+    [GGUF_TYPE_INT16]   = "int16",
+    [GGUF_TYPE_UINT32]  = "uint32",
+    [GGUF_TYPE_INT32]   = "int32",
+    [GGUF_TYPE_FLOAT32] = "float32",
+    [GGUF_TYPE_BOOL]    = "bool",
+    [GGUF_TYPE_STRING]  = "string",
+    [GGUF_TYPE_ARRAY]   = "array",
+};
+
 union gguf_value {
     uint8_t  uint8;
     int8_t   int8;
@@ -19017,6 +19030,10 @@ void gguf_free(struct gguf_context * ctx) {
     GGML_ALIGNED_FREE(ctx);
 }
 
+const char * gguf_type_name(enum gguf_type type) {
+    return GGUF_TYPE_NAME[type];
+}
+
 int gguf_get_version(struct gguf_context * ctx) {
     return ctx->header.version;
 }
diff --git a/ggml.h b/ggml.h
index 48ce71ecd..ad12c133e 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1740,6 +1740,8 @@ extern "C" {
 
     GGML_API void gguf_free(struct gguf_context * ctx);
 
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
     GGML_API int    gguf_get_version    (struct gguf_context * ctx);
     GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
     GGML_API size_t gguf_get_data_offset(struct gguf_context * ctx);
diff --git a/gguf-llama.cpp b/gguf-llama.cpp
index ec64ef8dc..2b197a236 100644
--- a/gguf-llama.cpp
+++ b/gguf-llama.cpp
@@ -101,11 +101,21 @@
 #define TN_FFN_DOWN "blk.%d.ffn_down.weight"
 #define TN_FFN_UP   "blk.%d.ffn_up.weight"
 
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...)
+#endif
+
 //
 // logging
 //
-
-static void llama_log_internal(llama_log_level level, const char* format, ...);
+LLAMA_ATTRIBUTE_FORMAT(2, 3)
+static void llama_log_internal        (llama_log_level level, const char* format, ...);
 static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
 
 #define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
@@ -130,13 +140,7 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
-#ifdef __GNUC__
-#ifdef __MINGW32__
-__attribute__((format(gnu_printf, 1, 2)))
-#else
-__attribute__((format(printf, 1, 2)))
-#endif
-#endif
+LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...) {
     va_list ap;
     va_list ap2;
@@ -991,7 +995,7 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
     char buf[256];
     snprintf(buf, sizeof(buf), "%5u", ne.at(0));
     for (size_t i = 1; i < ne.size(); i++) {
-        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5u", ne.at(i));
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5u", ne.at(i));
     }
     return buf;
 }
@@ -999,13 +1003,14 @@ static std::string llama_format_tensor_shape(const std::vector<uint32_t> & ne) {
 static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
     char buf[256];
     snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
-    for (int i = 1; i < t->n_dims; i++) {
-        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " x %5" PRId64, t->ne[i]);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
     }
     return buf;
 }
 
 struct llama_model_loader {
+    int n_kv      = 0;
     int n_tensors = 0;
     int n_created = 0;
 
@@ -1027,11 +1032,31 @@ struct llama_model_loader {
 
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
 
+        n_kv      = gguf_get_n_kv(ctx_gguf);
         n_tensors = gguf_get_n_tensors(ctx_gguf);
+
         file_version = (enum llama_file_version) gguf_get_version(ctx_gguf);
 
-        LLAMA_LOG_INFO("%s: loaded %d tensors from %s (version %s)\n",
-                __func__, n_tensors, fname.c_str(), llama_file_version_name(file_version));
+        // print meta data
+        // TODO: make optional
+        {
+            LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n",
+                    __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(file_version));
+
+            for (int i = 0; i < n_kv; i++) {
+                const char * name         = gguf_get_key(ctx_gguf, i);
+                const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+                LLAMA_LOG_INFO("%s: - %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
+            }
+
+            for (int i = 0; i < n_tensors; i++) {
+                const char * name         = gguf_get_tensor_name(ctx_gguf, i);
+                struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);
+
+                LLAMA_LOG_INFO("%s: - %3d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
+            }
+        }
 
         if (!llama_mmap::SUPPORTED) {
             LLAMA_LOG_WARN("%s: mmap is not supported on this platform\n", __func__);
@@ -1281,7 +1306,7 @@ static void llama_model_load_internal(
     if (kid >= 0) { \
         enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
         if (ktype != (type)) { \
-            throw std::runtime_error(format("key %s has wrong type: %d", key, ktype)); \
+            throw std::runtime_error(format("key %s has wrong type: %s", key, gguf_type_name(ktype))); \
         } \
         (dst) = func(ctx, kid); \
     } else if (req) { \
@@ -1325,7 +1350,7 @@ static void llama_model_load_internal(
         const auto n_gqa = hparams.n_gqa();
 
         if (model.type == e_model::MODEL_65B && n_gqa == 8) {
-            fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
+            LLAMA_LOG_WARN("%s: assuming 70B model based on GQA == %d\n", __func__, n_gqa);
             model.type = e_model::MODEL_70B;
         }
     }
@@ -3399,7 +3424,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
 
-    size_t idx = 0;
+    int idx = 0;
 
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
@@ -3428,7 +3453,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         tensor->data = read_data.data();
         model_loader->load_data_for(tensor);
 
-        LLAMA_LOG_INFO("[%4zu/%4zu] %36s - [%s], type = %6s, ",
+        LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
                ++idx, model_loader->n_tensors,
                ggml_get_name(tensor),
                llama_format_tensor_shape(tensor).c_str(),
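
For reference, a minimal standalone sketch (not part of the commit) of driving the same meta-data dump that the loader change above adds, using only API calls that appear in this patch plus ggml_free. The gguf_init_params field names (no_alloc, ctx) are assumed from ggml.h and are not shown in this diff.

// dump-gguf-meta.c - list GGUF key-value pairs and tensors, sketch only
#include <stdio.h>

#include "ggml.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    struct ggml_context * ctx_meta = NULL;

    // no_alloc = true: read only tensor meta data, not the weights
    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ &ctx_meta,
    };

    struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
    if (!ctx) {
        fprintf(stderr, "failed to load %s\n", argv[1]);
        return 1;
    }

    printf("version: %d, kv pairs: %d, tensors: %d\n",
            gguf_get_version(ctx), gguf_get_n_kv(ctx), gguf_get_n_tensors(ctx));

    // key-value pairs: key name + value type name via the new gguf_type_name()
    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
        printf("kv     %3d: %42s %-8s\n", i,
                gguf_get_key(ctx, i), gguf_type_name(gguf_get_kv_type(ctx, i)));
    }

    // tensors: name + ggml type (the meta tensors live in ctx_meta)
    for (int i = 0; i < gguf_get_n_tensors(ctx); i++) {
        const char * name = gguf_get_tensor_name(ctx, i);
        struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, name);

        printf("tensor %3d: %32s %-8s\n", i, name, ggml_type_name(meta->type));
    }

    ggml_free(ctx_meta);
    gguf_free(ctx);

    return 0;
}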