From b88727009d13d9a341a8c58fe43974913e437dc8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Tue, 3 Dec 2024 21:43:57 +0100
Subject: [PATCH 1/3] GGUF: backend support, fixed-width I/O, misc fixes

---
 examples/gguf/gguf.cpp  |  15 +-
 examples/llava/clip.cpp |   3 +-
 ggml/include/ggml.h     |  48 ++--
 ggml/src/ggml.c         | 534 +++++++++++++++++++++------------------
 src/llama.cpp           |   3 +-
 5 files changed, 332 insertions(+), 271 deletions(-)

diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index 7498f85ef..aa0c3f82a 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -1,10 +1,8 @@
 #include "ggml.h"
 
 #include <cstdio>
-#include <cinttypes>
 #include <string>
 #include <sstream>
-#include <fstream>
 #include <vector>
 
 #undef MIN
@@ -135,9 +133,10 @@ static bool gguf_ex_read_0(const std::string & fname) {
 
         for (int i = 0; i < n_tensors; ++i) {
             const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t size   = gguf_get_tensor_size  (ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);
 
-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
         }
     }
 
@@ -182,9 +181,10 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
 
         for (int i = 0; i < n_tensors; ++i) {
             const char * name   = gguf_get_tensor_name  (ctx, i);
+            const size_t size   = gguf_get_tensor_size  (ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);
 
-            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, size = %zu, offset = %zu\n", __func__, i, name, size, offset);
         }
     }
 
@@ -199,7 +199,8 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
 
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
 
-        printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, ggml_n_dims(cur), cur->name, cur->data);
+        printf("%s: tensor[%d]: n_dims = %d, ne = (%d, %d, %d, %d) name = %s, data = %p\n",
+            __func__, i, ggml_n_dims(cur), int(cur->ne[0]), int(cur->ne[1]), int(cur->ne[2]), int(cur->ne[3]), cur->name, cur->data);
 
         // print first 10 elements
         const float * data = (const float *) cur->data;
@@ -215,7 +216,7 @@ static bool gguf_ex_read_1(const std::string & fname, bool check_data) {
             const float * data = (const float *) cur->data;
             for (int j = 0; j < ggml_nelements(cur); ++j) {
                 if (data[j] != 100 + i) {
-                    fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
+                    fprintf(stderr, "%s: tensor[%d], data[%d]: found %f, expected %f\n", __func__, i, j, data[j], float(100 + i));
                     gguf_free(ctx);
                     return false;
                 }
@@ -245,6 +246,8 @@ int main(int argc, char ** argv) {
         check_data = false;
     }
 
+    srand(123456);
+
     const std::string fname(argv[1]);
     const std::string mode (argv[2]);
 
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 7ba4cea58..43b18375c 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2566,7 +2566,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
         total_size_org += orig_size;
         total_size_new += new_size;
         gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+        GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
+        gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
         fout.write((const char *)new_data, new_size);
         size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
         for (size_t j = 0; j < pad; ++j) {
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 65cb92c44..629bbfeb7 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2072,9 +2072,10 @@ extern "C" {
             const float * imatrix);
 
     //
-    // gguf
+    // GGUF
     //
 
+    // types that can be stored as GGUF KV data
    enum gguf_type {
         GGUF_TYPE_UINT8   = 0,
         GGUF_TYPE_INT8    = 1,
@@ -2136,41 +2137,56 @@ extern "C" {
     GGML_API const char *    gguf_get_val_str (const struct gguf_context * ctx, int key_id);
     GGML_API const void *    gguf_get_val_data(const struct gguf_context * ctx, int key_id);
     GGML_API int             gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
+
+    // get raw pointer to the first element of the array with the given key_id
+    // for bool arrays, note that they are always stored as int8 on all platforms (usually this makes no difference)
     GGML_API const void *    gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
+
+    // get ith C string from array with given key_id
     GGML_API const char *    gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
 
     GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
     GGML_API int            gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
     GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
+    GGML_API const char *   gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
     GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);
+    GGML_API size_t         gguf_get_tensor_size  (const struct gguf_context * ctx, int i);
 
     // removes key if it exists
     GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key);
 
     // overrides existing values or adds a new one
-    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t val);
-    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t val);
-    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
-    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
-    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
-    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
-    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
-    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
-    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
-    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
-    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
+    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t  val);
+    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double   val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
     GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+
+    // creates a new array with n elements of the given type and copies the corresponding number of bytes from data
     GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+
+    // creates a new array with n strings and copies the corresponding strings from data
     GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
 
     // set or add KV pairs from another context
-    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src);
 
     // manage tensor info
     GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+
+    // after changing a tensor's type, the offsets of all tensors with higher indices are recalculated
+    // in such a way that the tensor data remains as one contiguous block (except for padding)
     GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
-    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+    // assumes that at least gguf_get_tensor_size bytes can be read from data
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data);
 
     // writing gguf files can be done in 2 ways:
     //
@@ -2195,6 +2211,8 @@ extern "C" {
 
     // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
     GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
+
+    // writes the meta data to pointer "data"
     GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);
 
 #ifdef __cplusplus
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1a9a7efaf..1f461ba62 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -6316,7 +6316,7 @@ static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
     [GGUF_TYPE_UINT32]  = sizeof(uint32_t),
     [GGUF_TYPE_INT32]   = sizeof(int32_t),
     [GGUF_TYPE_FLOAT32] = sizeof(float),
-    [GGUF_TYPE_BOOL]    = sizeof(bool),
+    [GGUF_TYPE_BOOL]    = sizeof(int8_t),
     [GGUF_TYPE_STRING]  = sizeof(struct gguf_str),
     [GGUF_TYPE_UINT64]  = sizeof(uint64_t),
     [GGUF_TYPE_INT64]   = sizeof(int64_t),
@@ -6353,7 +6353,7 @@ union gguf_value {
     uint64_t uint64;
     int64_t  int64;
     double   float64;
-    bool     bool_;
+    // bool bool_; // stored as int8 instead
 
     struct gguf_str str;
 
@@ -6381,25 +6381,15 @@ struct gguf_header {
 };
 
 struct gguf_tensor_info {
-    struct gguf_str name;
-
-    uint32_t n_dims;
-    uint64_t ne[GGML_MAX_DIMS];
-
-    enum ggml_type type;
-
-    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
-
-    // for writing API
-    const void * data;
-    size_t size;
+    struct ggml_tensor t; // for holding the equivalent info
+    uint64_t offset;      // offset from start of `data`, must be a multiple of `ALIGNMENT`
 };
 
 struct gguf_context {
     struct gguf_header header;
 
     struct gguf_kv          * kv;
-    struct gguf_tensor_info * infos;
+    struct gguf_tensor_info * info;
 
     size_t alignment;
     size_t offset;    // offset of `data` from beginning of file
@@ -6414,48 +6404,6 @@ static size_t gguf_type_size(enum gguf_type type) {
     return GGUF_TYPE_SIZE[type];
 }
 
-static bool gguf_tensor_info_sanitize(struct gguf_tensor_info * info) {
-    if (info->n_dims > GGML_MAX_DIMS) {
-        fprintf(stderr, "%s: invalid number of dimensions (%" PRIu32 ")\n", __func__, info->n_dims);
-        return false;
-    }
-
-    if (info->type < 0 || info->type >= GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid type (%d)\n", __func__, info->type);
-        return false;
-    }
-
-    if (strlen(info->name.data) >= GGML_MAX_NAME) {
-        fprintf(stderr, "%s: tensor '%s' name is too long\n", __func__, info->name.data);
-        return false;
-    }
-
-    for (uint32_t i = 0; i < info->n_dims; ++i) {
-        if (info->ne[i] <= 0) {
-            fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[i]);
-            return false;
-        }
-    }
-
-    // prevent overflow for total number of elements
-    if (INT64_MAX/info->ne[1] <= info->ne[0]) {
-        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[1]);
-        return false;
-    }
-
-    if (INT64_MAX/info->ne[2] <= info->ne[0]*info->ne[1]) {
-        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[2]);
-        return false;
-    }
-
-    if (INT64_MAX/info->ne[3] <= info->ne[0]*info->ne[1]*info->ne[2]) {
-        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[3]);
-        return false;
-    }
-
-    return true;
-}
-
 static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
     const size_t n = fread(dst, 1, size, file);
     *offset += n;
@@ -6470,8 +6418,8 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
 
     ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset);
 
-    // early exit if string length is invalid, prevents from integer overflow
-    if (p->n == SIZE_MAX) {
+    // early exit if string length is invalid, prevents integer overflow
+    if (p->n >= SIZE_MAX) {
         fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
         return false;
     }
@@ -6482,7 +6430,7 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) {
         return false;
     }
 
-    ok = ok && gguf_fread_el(file,  p->data, p->n, offset);
+    ok = ok && gguf_fread_el(file, p->data, p->n, offset);
 
     return ok;
 }
@@ -6514,6 +6462,12 @@ static void gguf_free_kv(struct gguf_kv * kv) {
 }
 
 struct gguf_context * gguf_init_empty(void) {
+    if (sizeof(float) != 4) {
+        GGML_ABORT("support for floats with != 32 bits not implemented");
+    }
+    if (sizeof(double) != 8) {
+        GGML_ABORT("support for doubles with != 64 bits not implemented");
+    }
     struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context));
     if (!ctx) {
         fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
@@ -6525,8 +6479,8 @@ struct gguf_context * gguf_init_empty(void) {
     ctx->header.n_tensors = 0;
     ctx->header.n_kv      = 0;
 
-    ctx->kv    = NULL;
-    ctx->infos = NULL;
+    ctx->kv   = NULL;
+    ctx->info = NULL;
 
     ctx->alignment = GGUF_DEFAULT_ALIGNMENT;
     ctx->offset    = 0;
@@ -6538,6 +6492,12 @@ struct gguf_context * gguf_init_empty(void) {
 }
 
 struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) {
+    if (sizeof(float) != 4) {
+        GGML_ABORT("support for floats with != 32 bits not implemented");
+    }
+    if (sizeof(double) != 8) {
+        GGML_ABORT("support for doubles with != 64 bits not implemented");
+    }
     FILE * file = ggml_fopen(fname, "rb");
     if (!file) {
         fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
@@ -6575,22 +6535,22 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         strncpy(ctx->header.magic, magic, 4);
 
-        ctx->kv    = NULL;
-        ctx->infos = NULL;
-        ctx->data  = NULL;
+        ctx->kv   = NULL;
+        ctx->info = NULL;
+        ctx->data = NULL;
 
         ok = ok && gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
         ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
         ok = ok && gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
 
         if (ctx->header.version == 1) {
-            fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
+            fprintf(stderr, "%s: GGUFv1 is no longer supported, please use a more up-to-date version\n", __func__);
             fclose(file);
             gguf_free(ctx);
             return NULL;
         }
 
-        // sanity-checks to prevent from integer/buffer overflows
+        // sanity checks to prevent integer/buffer overflows
 
         ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct gguf_tensor_info));
         ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/ggml_tensor_overhead());
@@ -6604,7 +6564,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         }
     }
 
-    // read the kv pairs
+    // read the KV pairs
     {
         const uint64_t n_kv = ctx->header.n_kv;
 
@@ -6616,13 +6576,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
             return NULL;
         }
 
-        for (uint64_t i = 0; i < n_kv; ++i) {
+        for (uint64_t i = 0; ok && i < n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];
 
             //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
 
-            ok = ok && gguf_fread_str(file, &kv->key,                    &offset);
-            ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
+            ok = ok && gguf_fread_str(file, &kv->key, &offset);
+            {
+                int32_t tmp = -1; // always read enums as int32 regardless of platform
+                ok = ok && gguf_fread_el(file, &tmp, sizeof(tmp), &offset);
+                kv->type = tmp;
+            }
 
             //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
 
@@ -6637,12 +6601,16 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                 case GGUF_TYPE_UINT64:  ok = ok && gguf_fread_el (file, &kv->value.uint64,  sizeof(kv->value.uint64),  &offset); break;
                 case GGUF_TYPE_INT64:   ok = ok && gguf_fread_el (file, &kv->value.int64,   sizeof(kv->value.int64),   &offset); break;
                 case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
-                case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
+                case GGUF_TYPE_BOOL:    ok = ok && gguf_fread_el (file, &kv->value.int8,    sizeof(kv->value.int8),    &offset); break;
                 case GGUF_TYPE_STRING:  ok = ok && gguf_fread_str(file, &kv->value.str,                                &offset); break;
                 case GGUF_TYPE_ARRAY:
                     {
-                        ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
-                        ok = ok && gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n),    &offset);
+                        {
+                            int32_t tmp = -1; // always read enums as int32 regardless of platform
+                            ok = ok && gguf_fread_el(file, &tmp, sizeof(tmp), &offset);
+                            kv->value.arr.type = tmp;
+                        }
+                        ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset);
 
                         switch (kv->value.arr.type) {
                             case GGUF_TYPE_UINT8:
@@ -6657,7 +6625,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                             case GGUF_TYPE_FLOAT64:
                             case GGUF_TYPE_BOOL:
                                 {
-                                    // prevent from integer overflow in the malloc below
+                                    // prevent integer overflow in the malloc below
                                     if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
                                        fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
                                         fclose(file);
@@ -6665,7 +6633,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                         return NULL;
                                     }
 
-                                    kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
+                                    const size_t nbytes = kv->value.arr.n * gguf_type_size(kv->value.arr.type);
+                                    kv->value.arr.data = malloc(nbytes);
                                     if (!kv->value.arr.data) {
                                         fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
                                         fclose(file);
@@ -6673,11 +6642,11 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                         return NULL;
                                     }
 
-                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
+                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, nbytes, &offset);
                                 } break;
                             case GGUF_TYPE_STRING:
                                 {
-                                    // prevent from integer overflow in the malloc below
+                                    // prevent integer overflow in the malloc below
                                     if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
                                         fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
                                         fclose(file);
@@ -6685,7 +6654,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                         return NULL;
                                     }
 
-                                    kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
+                                    kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
                                     if (!kv->value.arr.data) {
                                         fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
                                         fclose(file);
@@ -6693,7 +6662,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                         return NULL;
                                     }
 
-                                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
+                                    for (uint64_t j = 0; ok && j < kv->value.arr.n; ++j) {
                                         ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                     }
                                 } break;
@@ -6711,10 +6680,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                         ok = false;
                     } break;
             }
-
-            if (!ok) {
-                break;
-            }
         }
 
         if (!ok) {
@@ -6725,51 +6690,124 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         }
     }
 
-    // read the tensor infos
+    // read the tensor info
     if (ctx->header.n_tensors > 0) {
-        ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
-        if (!ctx->infos) {
-            fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
+        ctx->info = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info));
+        if (!ctx->info) {
+            fprintf(stderr, "%s: failed to allocate memory for tensor info\n", __func__);
             fclose(file);
             gguf_free(ctx);
             return NULL;
         }
 
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct gguf_tensor_info * info = &ctx->infos[i];
+        for (uint64_t i = 0; ok && i < ctx->header.n_tensors; ++i) {
+            struct gguf_tensor_info * info = &ctx->info[i];
 
-            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
-                info->ne[j] = 1;
-            }
-
-            ok = ok && gguf_fread_str(file, &info->name,                         &offset);
-            ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset);
-
-            ok = ok && (info->n_dims <= GGML_MAX_DIMS);
-
-            for (uint32_t j = 0; j < info->n_dims; ++j) {
-                ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
-            }
-
-            ok = ok && gguf_fread_el (file, &info->type,   sizeof(info->type),   &offset);
-            ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset);
-
-            ok = ok && gguf_tensor_info_sanitize(info);
-
-            // make sure there is no duplicated tensor names
-            for (uint64_t j = 0; j < i && ok; ++j) {
-                if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
-                    fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
+            // tensor name
+            {
+                uint64_t n = -1;
+                ok = ok && gguf_fread_el(file, &n, sizeof(n), &offset);
+                if (n >= GGML_MAX_NAME) {
+                    fprintf(stderr, "%s: tensor name %" PRIu64 " is too long: %" PRIu64 " >= %d\n", __func__, i, n, GGML_MAX_NAME);
                     ok = false;
+                    break;
+                }
+                // the memory was cleared so the copied tensor name is guaranteed to be null-terminated
+                ok = ok && gguf_fread_el(file, info->t.name, n, &offset);
+
+                // make sure there are no duplicated tensor names
+                for (uint64_t j = 0; ok && j < i; ++j) {
+                    if (strcmp(info->t.name, ctx->info[j].t.name) == 0) {
+                        fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->t.name);
+                        ok = false;
+                        break;
+                    }
                 }
             }
 
-            if (!ok) {
-                fprintf(stderr, "%s: failed to read tensor info\n", __func__);
-                fclose(file);
-                gguf_free(ctx);
-                return NULL;
+            // tensor shape
+            {
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    info->t.ne[j] = 1;
+                }
+
+                uint32_t n_dims = -1;
+                ok = ok && gguf_fread_el(file, &n_dims, sizeof(n_dims), &offset);
+                if (n_dims > GGML_MAX_DIMS) {
+                    fprintf(stderr, "%s: tensor '%s' has invalid number of dimensions (%" PRIu32 ")\n", __func__, info->t.name, n_dims);
+                    ok = false;
+                    break;
+                }
+
+                ok = ok && gguf_fread_el(file, info->t.ne, n_dims*sizeof(info->t.ne[0]), &offset);
+
+                // check that all ne are non-negative
+                for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+                    if (info->t.ne[j] < 0) {
+                        fprintf(stderr, "%s: tensor '%s' has invalid number of elements (%" PRIi64 ")\n",
+                            __func__, info->t.name, info->t.ne[j]);
+                        ok = false;
+                        break;
+                    }
+                }
+
+                // check that the total number of elements is representable
+                if ((INT64_MAX/info->t.ne[1] <= info->t.ne[0]) ||
+                    (INT64_MAX/info->t.ne[2] <= info->t.ne[0]*info->t.ne[1]) ||
+                    (INT64_MAX/info->t.ne[3] <= info->t.ne[0]*info->t.ne[1]*info->t.ne[2])) {
+
+                    fprintf(stderr, "%s: total number of elements in tensor '%s' with shape "
+                        "(%" PRIi64 ", %" PRIi64 ", %" PRIi64 ", %" PRIi64 ") is >= %" PRIi64 "\n",
+                        __func__, info->t.name, info->t.ne[0], info->t.ne[1], info->t.ne[2], info->t.ne[3], INT64_MAX);
+                    ok = false;
+                    break;
+                }
             }
+
+            // tensor type
+            {
+                {
+                    int32_t tmp = -1; // always read enums as int32 regardless of platform
+                    ok = ok && gguf_fread_el(file, &tmp, sizeof(tmp), &offset);
+                    info->t.type = tmp;
+                }
+
+                // check that tensor type is within defined range
+                if (info->t.type < 0 || info->t.type >= GGML_TYPE_COUNT) {
+                    fprintf(stderr, "%s: tensor '%s' has invalid ggml type %d (%s)\n",
+                        __func__, info->t.name, info->t.type, ggml_type_name(info->t.type));
+                    ok = false;
+                    break;
+                }
+                const size_t type_size = ggml_type_size(info->t.type);
+                const size_t blck_size = ggml_blck_size(info->t.type);
+
+                // check that row size is divisible by block size
+                if (blck_size == 0 || info->t.ne[0] % blck_size != 0) {
+                    fprintf(stderr, "%s: tensor '%s' of type %d (%s) has %" PRId64 " elements per row, "
+                        "not a multiple of block size (%" PRId64 ")\n",
+                        __func__, info->t.name, (int) info->t.type, ggml_type_name(info->t.type), info->t.ne[0], blck_size);
+                    ok = false;
+                    break;
+                }
+
+                // calculate byte offsets given the tensor shape and type
+                info->t.nb[0] = type_size;
+                info->t.nb[1] = info->t.nb[0]*(info->t.ne[0]/blck_size);
+                for (int j = 2; j < GGML_MAX_DIMS; ++j) {
+                    info->t.nb[j] = info->t.nb[j - 1]*info->t.ne[j - 1];
+                }
+            }
+
+            // tensor data offset within buffer
+            ok = ok && gguf_fread_el(file, &info->offset, sizeof(info->offset), &offset);
+        }
+
+        if (!ok) {
+            fprintf(stderr, "%s: failed to read tensor info\n", __func__);
+            fclose(file);
+            gguf_free(ctx);
+            return NULL;
         }
     }
 
@@ -6782,10 +6820,10 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
     // we require the data section to be aligned, so take into account any padding
     {
-        const size_t offset_pad = offset % ctx->alignment;
+        const size_t offset_align_overshoot = offset % ctx->alignment; // bytes beyond last aligned address
 
-        if (offset_pad != 0) {
-            offset += ctx->alignment - offset_pad;
+        if (offset_align_overshoot != 0) {
+            offset += ctx->alignment - offset_align_overshoot;
             fseek(file, offset, SEEK_SET);
         }
     }
@@ -6797,25 +6835,9 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         ctx->size = 0;
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct gguf_tensor_info * info = &ctx->infos[i];
+            struct gguf_tensor_info * info = &ctx->info[i];
 
-            const int64_t ne =
-                (int64_t) info->ne[0] *
-                (int64_t) info->ne[1] *
-                (int64_t) info->ne[2] *
-                (int64_t) info->ne[3];
-
-            if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
-                fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
-                        __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
-                fclose(file);
-                gguf_free(ctx);
-                return NULL;
-            }
-
-            const size_t size_cur = ggml_row_size(info->type, ne);
-
-            ctx->size += GGML_PAD(size_cur, ctx->alignment);
+            ctx->size += GGML_PAD(ggml_nbytes(&info->t), ctx->alignment);
         }
     }
 
@@ -6823,7 +6845,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     if (params.ctx != NULL) {
         // if the provided gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
         // otherwise, we load the binary blob into the created ggml_context as well, and point the "data" members of
-        // the ggml_tensor structs to the appropriate locations in the binary blob 
+        // the ggml_tensor structs to the appropriate locations in the binary blob
 
         // compute the exact size needed for the new ggml_context
         const size_t mem_size =
@@ -6872,14 +6894,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         // create the tensors
         for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            const int64_t ne[GGML_MAX_DIMS] = {
-                ctx->infos[i].ne[0],
-                ctx->infos[i].ne[1],
-                ctx->infos[i].ne[2],
-                ctx->infos[i].ne[3],
-            };
-
-            struct ggml_tensor * cur = ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
+            struct ggml_tensor * cur = ggml_new_tensor(
+                ctx_data, ctx->info[i].t.type, GGML_MAX_DIMS, ctx->info[i].t.ne);
 
             ok = ok && cur != NULL;
 
@@ -6887,12 +6903,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                 break;
             }
 
-            ggml_set_name(cur, ctx->infos[i].name.data);
+            ggml_set_name(cur, ctx->info[i].t.name);
 
-            // point the data member to the appropriate location in the binary blob using the tensor infos
+            // point the data member to the appropriate location in the binary blob using the tensor info
             if (!params.no_alloc) {
-                //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
-                cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
+                //cur->data = (char *) data->data + ctx->info[i].offset - ctx->offset; // offset from start of file
+                cur->data = (char *) data->data + ctx->info[i].offset; // offset from data
             }
         }
 
@@ -6926,16 +6942,8 @@ void gguf_free(struct gguf_context * ctx) {
         GGML_FREE(ctx->kv);
     }
 
-    if (ctx->infos) {
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct gguf_tensor_info * info = &ctx->infos[i];
-
-            if (info->name.data) {
-                GGML_FREE(info->name.data);
-            }
-        }
-
-        GGML_FREE(ctx->infos);
+    if (ctx->info) {
+        GGML_FREE(ctx->info);
     }
 
     GGML_FREE(ctx);
@@ -6957,10 +6965,12 @@ size_t gguf_get_data_offset(const struct gguf_context * ctx) {
     return ctx->offset;
 }
 
+// TODO should this be a const pointer? should it exist at all?
 void * gguf_get_data(const struct gguf_context * ctx) {
     return ctx->data;
 }
 
+// TODO this returns int but the underlying type is uint64
 int gguf_get_n_kv(const struct gguf_context * ctx) {
     return ctx->header.n_kv;
 }
@@ -7080,7 +7090,7 @@ double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
 bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
     GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
-    return ctx->kv[key_id].value.bool_;
+    return ctx->kv[key_id].value.int8 != 0;
 }
 
 const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
@@ -7117,15 +7127,19 @@ int gguf_find_tensor(const struct gguf_context * ctx, const char * name) {
 }
 
 size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
-    return ctx->infos[i].offset;
+    return ctx->info[i].offset;
 }
 
-char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
-    return ctx->infos[i].name.data;
+const char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
+    return ctx->info[i].t.name;
 }
 
 enum ggml_type gguf_get_tensor_type(const struct gguf_context * ctx, int i) {
-    return ctx->infos[i].type;
+    return ctx->info[i].t.type;
+}
+
+size_t gguf_get_tensor_size(const struct gguf_context * ctx, int i) {
+    return ggml_nbytes(&ctx->info[i].t);
 }
 
 // returns the index
@@ -7138,6 +7152,8 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     const int n_kv = gguf_get_n_kv(ctx);
 
     ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
+    GGML_ASSERT(ctx->kv); // potential memory leak
+    memset(&ctx->kv[n_kv], 0, sizeof(struct gguf_kv));
     ctx->kv[n_kv].key.n    = strlen(key);
     ctx->kv[n_kv].key.data = strdup(key);
     ctx->header.n_kv++;
@@ -7154,6 +7170,7 @@ void gguf_remove_key(struct gguf_context * ctx, const char * key) {
             ctx->kv[i] = ctx->kv[i+1];
         }
         ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
+        GGML_ASSERT(ctx->kv); // potential memory leak
         ctx->header.n_kv--;
     }
 }
@@ -7231,8 +7248,8 @@ void gguf_set_val_f64(struct gguf_context * ctx, const char * key, double val) {
 void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val) {
     const int idx = gguf_get_or_add_key(ctx, key);
 
-    ctx->kv[idx].type        = GGUF_TYPE_BOOL;
-    ctx->kv[idx].value.bool_ = val;
+    ctx->kv[idx].type       = GGUF_TYPE_BOOL;
+    ctx->kv[idx].value.int8 = val ? 1 : 0;
 }
 
 void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char * val) {
@@ -7245,31 +7262,36 @@ void gguf_set_val_str(struct gguf_context * ctx, const char * key, const char *
 void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n) {
     const int idx = gguf_get_or_add_key(ctx, key);
+    const size_t nbytes = n * gguf_type_size(type);
 
     ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = type;
     ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type));
-    memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type));
+    ctx->kv[idx].value.arr.data = realloc(ctx->kv[idx].value.arr.data, nbytes);
+    GGML_ASSERT(ctx->kv[idx].value.arr.data); // potential memory leak
+    memcpy(ctx->kv[idx].value.arr.data, data, nbytes);
 }
 
 void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** data, int n) {
     const int idx = gguf_get_or_add_key(ctx, key);
+    const size_t nbytes = n * gguf_type_size(GGUF_TYPE_STRING);
 
     ctx->kv[idx].type           = GGUF_TYPE_ARRAY;
     ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
     ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str));
-    for (int i = 0; i < n; i++) {
+    ctx->kv[idx].value.arr.data = realloc(ctx->kv[idx].value.arr.data, nbytes);
+    GGML_ASSERT(ctx->kv[idx].value.arr.data); // potential memory leak
+    for (int i = 0; i < n; ++i) {
         struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
         str->n    = strlen(data[i]);
         str->data = strdup(data[i]);
+        GGML_ASSERT(str->data);
     }
 }
 
 // set or add KV pairs from another context
-void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
-    for (uint32_t i = 0; i < src->header.n_kv; i++) {
+void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src) {
+    for (uint64_t i = 0; i < src->header.n_kv; ++i) {
         switch (src->kv[i].type) {
             case GGUF_TYPE_UINT8:   gguf_set_val_u8  (ctx, src->kv[i].key.data, src->kv[i].value.uint8);   break;
             case GGUF_TYPE_INT8:    gguf_set_val_i8  (ctx, src->kv[i].key.data, src->kv[i].value.int8);    break;
@@ -7281,13 +7303,13 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
             case GGUF_TYPE_UINT64:  gguf_set_val_u64 (ctx, src->kv[i].key.data, src->kv[i].value.uint64);  break;
             case GGUF_TYPE_INT64:   gguf_set_val_i64 (ctx, src->kv[i].key.data, src->kv[i].value.int64);   break;
             case GGUF_TYPE_FLOAT64: gguf_set_val_f64 (ctx, src->kv[i].key.data, src->kv[i].value.float64); break;
-            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.bool_);   break;
+            case GGUF_TYPE_BOOL:    gguf_set_val_bool(ctx, src->kv[i].key.data, src->kv[i].value.int8);    break;
             case GGUF_TYPE_STRING:  gguf_set_val_str (ctx, src->kv[i].key.data, src->kv[i].value.str.data); break;
             case GGUF_TYPE_ARRAY:
                 {
                     if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
-                        for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) {
+                        const char ** data = GGML_MALLOC(src->kv[i].value.arr.n * sizeof(char *));
+                        for (uint64_t j = 0; j < src->kv[i].value.arr.n; ++j) {
                             data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                         }
                         gguf_set_arr_str(ctx, src->kv[i].key.data, data, src->kv[i].value.arr.n);
@@ -7295,7 +7317,8 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
                     } else if (src->kv[i].value.arr.type == GGUF_TYPE_ARRAY) {
                         GGML_ABORT("nested arrays not supported");
                     } else {
-                        gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type, src->kv[i].value.arr.data, src->kv[i].value.arr.n);
+                        gguf_set_arr_data(ctx, src->kv[i].key.data, src->kv[i].value.arr.type,
+                            src->kv[i].value.arr.data, src->kv[i].value.arr.n);
                     }
                 } break;
             default: GGML_ABORT("invalid type");
@@ -7311,29 +7334,12 @@ void gguf_add_tensor(
         GGML_ABORT("duplicated tensor name");
     }
 
-    const int idx = ctx->header.n_tensors;
-    ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
-
-    ctx->infos[idx].name.n    = strlen(tensor->name);
-    ctx->infos[idx].name.data = strdup(tensor->name);
-
-    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
-        ctx->infos[idx].ne[i] = 1;
-    }
-
-    ctx->infos[idx].n_dims = ggml_n_dims(tensor);
-    for (uint32_t i = 0; i < ctx->infos[idx].n_dims; i++) {
-        ctx->infos[idx].ne[i] = tensor->ne[i];
-    }
-
-    ctx->infos[idx].type   = tensor->type;
-    ctx->infos[idx].offset = 0;
-    ctx->infos[idx].data   = tensor->data;
-    ctx->infos[idx].size   = ggml_nbytes(tensor);
-
-    if (ctx->header.n_tensors > 0) {
-        ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
-    }
+    const uint64_t idx = ctx->header.n_tensors;
+    ctx->info = realloc(ctx->info, (idx + 1)*sizeof(struct gguf_tensor_info));
+    GGML_ASSERT(ctx->info); // potential memory leak
+    ctx->info[idx].t = *tensor;
+    ctx->info[idx].offset = idx == 0 ? 0 :
+        ctx->info[idx - 1].offset + GGML_PAD(ggml_nbytes(&ctx->info[idx - 1].t), ctx->alignment);
 
     ctx->header.n_tensors++;
 }
@@ -7343,38 +7349,38 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggm
     if (idx < 0) {
         GGML_ABORT("tensor not found");
     }
+    struct ggml_tensor * tensor = &ctx->info[idx].t;
+    const size_t type_size = ggml_type_size(type);
+    const int blck_size = ggml_blck_size(type);
 
-    ctx->infos[idx].type = type;
+    tensor->type = type;
+    GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type");
+
+    tensor->nb[0] = type_size;
+    tensor->nb[1] = tensor->nb[0]*(tensor->ne[0]/blck_size);
+    for (int i = 2; i < GGML_MAX_DIMS; i++) {
+        tensor->nb[i] = tensor->nb[i - 1]*tensor->ne[i - 1];
+    }
+
+    // update offsets
+    for (uint64_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
+        ctx->info[i].offset = ctx->info[i - 1].offset + GGML_PAD(ggml_nbytes(&ctx->info[i - 1].t), ctx->alignment);
+    }
 }
 
-void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data) {
     const int idx = gguf_find_tensor(ctx, name);
     if (idx < 0) {
         GGML_ABORT("tensor not found");
     }
 
-    ctx->infos[idx].data = data;
-    ctx->infos[idx].size = size;
-
-    // update offsets
-    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
-        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
-    }
+    ctx->info[idx].t.data = (void *)(uintptr_t)data; // double cast suppresses warning about casting away const
 }
 
-//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
-//    fwrite(&val->n,   sizeof(val->n),    1, file);
-//    fwrite(val->data, sizeof(char), val->n, file);
-//}
-//
-//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
-//    fwrite(val, sizeof(char), size, file);
-//}
-
 struct gguf_buf {
     void * data;
-    size_t size;
-    size_t offset;
+    size_t size;   // size of data
+    size_t offset; // offset within data
 };
 
 static struct gguf_buf gguf_buf_init(size_t size) {
@@ -7395,9 +7401,10 @@ static void gguf_buf_free(struct gguf_buf buf) {
 
 static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
     if (buf->offset + size > buf->size) {
-        buf->size = 1.5*(buf->offset + size);
+        buf->size = 1.5f*(buf->offset + size);
         if (buf->data) {
             buf->data = realloc(buf->data, buf->size);
+            GGML_ASSERT(buf->data); // potential memory leak
         }
     }
 }
@@ -7425,6 +7432,23 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si
     buf->offset += el_size;
 }
 
+static void gguf_bwrite_tensor_data(struct gguf_buf * buf, const struct ggml_tensor * tensor) {
+    GGML_ASSERT(ggml_is_contiguous(tensor));
+    const size_t el_size = ggml_nbytes(tensor);
+    gguf_buf_grow(buf, el_size);
+
+    if (buf->data) {
+        char * dst = (char *) buf->data + buf->offset;
+        if (tensor->buffer) {
+            ggml_backend_tensor_get(tensor, dst, 0, el_size);
+        } else {
+            GGML_ASSERT(tensor->data);
+            memcpy(dst, tensor->data, el_size);
+        }
+    }
+    buf->offset += el_size;
+}
+
 static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
     // write header
     gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
@@ -7433,11 +7457,14 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
     gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
 
     // write key-value pairs
-    for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+    for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
         struct gguf_kv * kv = &ctx->kv[i];
 
         gguf_bwrite_str(buf, &kv->key);
-        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
+        {
+            const int32_t tmp = kv->type; // always write enums as int32 regardless of platform
+            gguf_bwrite_el(buf, &tmp, sizeof(tmp));
+        }
 
         switch (kv->type) {
             case GGUF_TYPE_UINT8:   gguf_bwrite_el( buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
@@ -7450,12 +7477,15 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
             case GGUF_TYPE_UINT64:  gguf_bwrite_el (buf, &kv->value.uint64,  sizeof(kv->value.uint64) ); break;
             case GGUF_TYPE_INT64:   gguf_bwrite_el (buf, &kv->value.int64,   sizeof(kv->value.int64)  ); break;
             case GGUF_TYPE_FLOAT64: gguf_bwrite_el (buf, &kv->value.float64, sizeof(kv->value.float64)); break;
-            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
+            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
             case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
             case GGUF_TYPE_ARRAY:
                 {
-                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
-                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
+                    {
+                        const int32_t tmp = kv->value.arr.type; // always write enums as int32 regardless of platform
+                        gguf_bwrite_el(buf, &tmp, sizeof(tmp));
+                    }
+                    gguf_bwrite_el(buf, &kv->value.arr.n, sizeof(kv->value.arr.n));
 
                     switch (kv->value.arr.type) {
                         case GGUF_TYPE_UINT8:
@@ -7474,7 +7504,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
                             } break;
                         case GGUF_TYPE_STRING:
                             {
-                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                     gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
                                 }
                             } break;
@@ -7486,16 +7516,26 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
         }
     }
 
-    // write tensor infos
-    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
-        struct gguf_tensor_info * info = &ctx->infos[i];
+    // write tensor info
+    for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
+        struct gguf_tensor_info * info = &ctx->info[i];
 
-        gguf_bwrite_str(buf, &info->name);
-        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
-        for (uint32_t j = 0; j < info->n_dims; ++j) {
-            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
+        struct gguf_str name = {
+            /*n    =*/ strlen(info->t.name),
+            /*data =*/ info->t.name,
+        };
+        gguf_bwrite_str(buf, &name);
+
+        const uint32_t n_dims = ggml_n_dims(&info->t);
+        gguf_bwrite_el(buf, &n_dims, sizeof(n_dims));
+
+        for (uint32_t j = 0; j < n_dims; ++j) {
+            gguf_bwrite_el(buf, &info->t.ne[j], sizeof(info->t.ne[j]));
+        }
+        {
+            const int32_t tmp = info->t.type; // always write enums as int32 regardless of platform
+            gguf_bwrite_el(buf, &tmp, sizeof(tmp));
         }
-        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
         gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
     }
 
@@ -7519,19 +7559,17 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf *
     size_t offset = 0;
 
     // write tensor data
-    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
-        struct gguf_tensor_info * info = &ctx->infos[i];
+    for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
+        struct gguf_tensor_info * info = &ctx->info[i];
 
-        const size_t size     = info->size;
+        const size_t size     = ggml_nbytes(&info->t);
         const size_t size_pad = GGML_PAD(size, ctx->alignment);
 
-        gguf_bwrite_el(buf, info->data, size);
+        gguf_bwrite_tensor_data(buf, &info->t);
 
-        if (size_pad != size) {
-            uint8_t pad = 0;
-            for (size_t j = 0; j < size_pad - size; ++j) {
-                gguf_bwrite_el(buf, &pad, sizeof(pad));
-            }
+        const uint8_t pad = 0;
+        for (size_t j = size; j < size_pad; ++j) {
+            gguf_bwrite_el(buf, &pad, sizeof(pad));
         }
 
         GGML_ASSERT(offset == info->offset);
@@ -7550,7 +7588,7 @@ void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, boo
 
     gguf_write_to_buf(ctx, &buf, only_meta);
 
-    fwrite(buf.data, 1, buf.offset, file);
+    fwrite(buf.data, 1, buf.offset, file); // buf.offset == number of bytes that are in use
 
     gguf_buf_free(buf);
 
@@ -7561,7 +7599,7 @@ size_t gguf_get_meta_size(const struct gguf_context * ctx) {
     // no allocs - only compute size
     struct gguf_buf buf = gguf_buf_init(0);
 
-    gguf_write_to_buf(ctx, &buf, true);
+    gguf_write_to_buf(ctx, &buf, /*only_meta =*/ true);
 
     return buf.offset;
 }
@@ -7569,7 +7607,7 @@ size_t gguf_get_meta_size(const struct gguf_context * ctx) {
 void gguf_get_meta_data(const struct gguf_context * ctx, void * data) {
     struct gguf_buf buf = gguf_buf_init(16*1024);
 
-    gguf_write_to_buf(ctx, &buf, true);
+    gguf_write_to_buf(ctx, &buf, /*only_meta =*/ true);
 
     memcpy(data, buf.data, buf.offset);
 
diff --git a/src/llama.cpp b/src/llama.cpp
index 6a6f4c2a5..5ca0205f4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19211,7 +19211,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // update the gguf meta data as we go
         gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data, new_size);
+        GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
+        gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
 
         // write tensor data + padding
         fout.write((const char *) new_data, new_size);

From 096b847a0f4c9f1471917d84e1c57823ec94fd98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Wed, 4 Dec 2024 14:16:05 +0100
Subject: [PATCH 2/3] fix wrong type in print

---
 ggml/src/ggml.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 1f461ba62..7cf3cca71 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -6779,8 +6779,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                     ok = false;
                     break;
                 }
-                const size_t type_size = ggml_type_size(info->t.type);
-                const size_t blck_size = ggml_blck_size(info->t.type);
+                const size_t  type_size = ggml_type_size(info->t.type);
+                const int64_t blck_size = ggml_blck_size(info->t.type);
 
                 // check that row size is divisible by block size
                 if (blck_size == 0 || info->t.ne[0] % blck_size != 0) {
@@ -7350,8 +7350,8 @@ void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggm
         GGML_ABORT("tensor not found");
     }
     struct ggml_tensor * tensor = &ctx->info[idx].t;
-    const size_t type_size = ggml_type_size(type);
-    const int blck_size = ggml_blck_size(type);
+    const size_t  type_size = ggml_type_size(type);
+    const int64_t blck_size = ggml_blck_size(type);
 
     tensor->type = type;
     GGML_ASSERT(tensor->ne[0] % blck_size == 0 && "tensor row size not divisible by block size of new type");

From a8046c888ac55e22bcbbdae5d59ec08ae5ac5368 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Wed, 4 Dec 2024 17:24:35 +0100
Subject: [PATCH 3/3] use calloc instead of malloc

---
 ggml/src/ggml.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 7cf3cca71..a761c7c21 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -6625,7 +6625,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                             case GGUF_TYPE_FLOAT64:
                             case GGUF_TYPE_BOOL:
                                 {
-                                    // prevent integer overflow in the malloc below
+                                    // prevent integer overflow in the calloc below
                                     if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) {
                                         fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
                                         fclose(file);
@@ -6633,8 +6633,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                         return NULL;
                                     }
 
-                                    const size_t nbytes = kv->value.arr.n * gguf_type_size(kv->value.arr.type);
-                                    kv->value.arr.data = malloc(nbytes);
+                                    kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type));
                                     if (!kv->value.arr.data) {
                                         fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
                                         fclose(file);
@@ -6642,11 +6641,11 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                         return NULL;
                                     }
 
-                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, nbytes, &offset);
+                                    ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset);
                                 } break;
                             case GGUF_TYPE_STRING:
                                 {
-                                    // prevent integer overflow in the malloc below
+                                    // prevent integer overflow in the calloc below
                                     if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) {
                                         fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
                                         fclose(file);
@@ -6654,7 +6653,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                                         return NULL;
                                     }
 
-                                    kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
+                                    kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str));
                                     if (!kv->value.arr.data) {
                                         fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
                                         fclose(file);
@@ -7152,7 +7151,7 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) {
     const int n_kv = gguf_get_n_kv(ctx);
 
     ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct gguf_kv));
-    GGML_ASSERT(ctx->kv); // potential memory leak
+    GGML_ASSERT(ctx->kv); // detect potential memory leak
     memset(&ctx->kv[n_kv], 0, sizeof(struct gguf_kv));
     ctx->kv[n_kv].key.n    = strlen(key);
     ctx->kv[n_kv].key.data = strdup(key);
@@ -7170,7 +7169,7 @@ void gguf_remove_key(struct gguf_context * ctx, const char * key) {
             ctx->kv[i] = ctx->kv[i+1];
         }
         ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv));
-        GGML_ASSERT(ctx->kv); // potential memory leak
+        GGML_ASSERT(ctx->kv); // detect potential memory leak
         ctx->header.n_kv--;
     }
 }
@@ -7268,7 +7267,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty
     ctx->kv[idx].value.arr.type = type;
     ctx->kv[idx].value.arr.n    = n;
     ctx->kv[idx].value.arr.data = realloc(ctx->kv[idx].value.arr.data, nbytes);
-    GGML_ASSERT(ctx->kv[idx].value.arr.data); // potential memory leak
+    GGML_ASSERT(ctx->kv[idx].value.arr.data); // detect potential memory leak
     memcpy(ctx->kv[idx].value.arr.data, data, nbytes);
 }
 
@@ -7280,7 +7279,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char **
     ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING;
     ctx->kv[idx].value.arr.n    = n;
     ctx->kv[idx].value.arr.data = realloc(ctx->kv[idx].value.arr.data, nbytes);
-    GGML_ASSERT(ctx->kv[idx].value.arr.data); // potential memory leak
+    GGML_ASSERT(ctx->kv[idx].value.arr.data); // detect potential memory leak
     for (int i = 0; i < n; ++i) {
         struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i];
         str->n    = strlen(data[i]);
@@ -7308,7 +7307,7 @@ void gguf_set_kv(struct gguf_context * ctx, const struct gguf_context * src) {
             case GGUF_TYPE_ARRAY:
                 {
                     if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) {
-                        const char ** data = GGML_MALLOC(src->kv[i].value.arr.n * sizeof(char *));
+                        const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *));
                         for (uint64_t j = 0; j < src->kv[i].value.arr.n; ++j) {
                             data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data;
                         }
@@ -7336,7 +7335,7 @@ void gguf_add_tensor(
 
     const uint64_t idx = ctx->header.n_tensors;
     ctx->info = realloc(ctx->info, (idx + 1)*sizeof(struct gguf_tensor_info));
-    GGML_ASSERT(ctx->info); // potential memory leak
+    GGML_ASSERT(ctx->info); // detect potential memory leak
     ctx->info[idx].t = *tensor;
     ctx->info[idx].offset = idx == 0 ? 0 :
         ctx->info[idx - 1].offset + GGML_PAD(ggml_nbytes(&ctx->info[idx - 1].t), ctx->alignment);
@@ -7404,7 +7403,7 @@ static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
         buf->size = 1.5f*(buf->offset + size);
         if (buf->data) {
            buf->data = realloc(buf->data, buf->size);
-            GGML_ASSERT(buf->data); // potential memory leak
+            GGML_ASSERT(buf->data); // detect potential memory leak
         }
     }
}
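
Usage note (editor's sketch, not part of the patch series): after these patches the byte
size of a tensor's payload is always derived from its type and shape, so
gguf_set_tensor_data() no longer takes a size argument and gguf_get_tensor_size() lets
the caller cross-check its buffer, as the clip.cpp and llama.cpp hunks above do inline.
The helper below is illustrative only (its name is hypothetical); it uses just the
functions declared in ggml.h above.

#include "ggml.h"

// minimal sketch: swap one tensor's payload in an existing gguf_context
static void set_quantized_tensor(struct gguf_context * ctx, const char * name,
                                 enum ggml_type new_type, const void * buf, size_t buf_size) {
    // changing the type recomputes the tensor's strides and the offsets of
    // all tensors with higher indices (see gguf_set_tensor_type)
    gguf_set_tensor_type(ctx, name, new_type);

    // the context now knows the exact size implied by type + shape;
    // verify that the caller's buffer matches before handing it over
    const int idx = gguf_find_tensor(ctx, name);
    GGML_ASSERT(idx >= 0);
    GGML_ASSERT(gguf_get_tensor_size(ctx, idx) == buf_size);

    // gguf_set_tensor_data() assumes at least gguf_get_tensor_size() bytes
    // can be read from buf; no explicit size argument is passed anymore
    gguf_set_tensor_data(ctx, name, buf);
}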