From f7a6aa99112c843b5d17f0a29d79215aa61155be Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 15 Aug 2023 19:57:37 +0300
Subject: [PATCH] gguf : streaming support when writing files

---
 examples/gguf/gguf.cpp |   2 +-
 ggml.c                 | 220 +++++++++++++++++++++++++++++++----------
 ggml.h                 |  39 ++++++--
 gguf-llama.cpp         |  42 ++++++--
 4 files changed, 233 insertions(+), 70 deletions(-)

diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index f67af1416..74a447c07 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -72,7 +72,7 @@ bool gguf_ex_write(const std::string & fname) {
         gguf_add_tensor(ctx, cur);
     }
 
-    gguf_write_to_file(ctx, fname.c_str());
+    gguf_write_to_file(ctx, fname.c_str(), false);
 
     fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
 
diff --git a/ggml.c b/ggml.c
index ead9ab526..7549566aa 100644
--- a/ggml.c
+++ b/ggml.c
@@ -19123,6 +19123,22 @@ int gguf_get_n_tensors(struct gguf_context * ctx) {
     return ctx->header.n_tensors;
 }
 
+int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+    // return -1 if tensor not found
+    int tensorfound = -1;
+
+    const int n_tensors = gguf_get_n_tensors(ctx);
+
+    for (int i = 0; i < n_tensors; ++i) {
+        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
+            tensorfound = i;
+            break;
+        }
+    }
+
+    return tensorfound;
+}
+
 size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
@@ -19269,12 +19285,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
     }
 }
 
-void gguf_add_tensor_ex(
+void gguf_add_tensor(
         struct gguf_context * ctx,
-        const struct ggml_tensor * tensor,
-        enum ggml_type type,
-        const void * data,
-        size_t size) {
+        const struct ggml_tensor * tensor) {
     const int idx = ctx->header.n_tensors;
     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
@@ -19290,10 +19303,10 @@ void gguf_add_tensor_ex(
         ctx->infos[idx].ne[i] = tensor->ne[i];
     }
 
-    ctx->infos[idx].type   = type;
+    ctx->infos[idx].type   = tensor->type;
     ctx->infos[idx].offset = 0;
-    ctx->infos[idx].data   = data;
-    ctx->infos[idx].size   = size;
+    ctx->infos[idx].data   = tensor->data;
+    ctx->infos[idx].size   = ggml_nbytes(tensor);
 
     if (ctx->header.n_tensors > 0) {
         ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
@@ -19302,52 +19315,115 @@ void gguf_add_tensor_ex(
     ctx->header.n_tensors++;
 }
 
-void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor) {
-    gguf_add_tensor_ex(ctx, tensor, tensor->type, tensor->data, ggml_nbytes(tensor));
-}
-
-static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
-    fwrite(&val->n, sizeof(val->n), 1, file);
-    fwrite(val->data, sizeof(char), val->n, file);
-}
-
-static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
-    fwrite(val, sizeof(char), size, file);
-}
-
-void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
-    FILE * file = fopen(fname, "wb");
-    if (!file) {
-        GGML_ASSERT(false && "failed to open file for writing");
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
     }
 
+    ctx->infos[idx].type = type;
+}
+
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].data = data;
+    ctx->infos[idx].size = size;
+
+    // update offsets
+    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
+        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
+    }
+}
+
+//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
+//    fwrite(&val->n, sizeof(val->n), 1, file);
+//    fwrite(val->data, sizeof(char), val->n, file);
+//}
+//
+//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
+//    fwrite(val, sizeof(char), size, file);
+//}
+
+struct gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+
+static struct gguf_buf gguf_buf_init(size_t size) {
+    struct gguf_buf buf = {
+        /*buf.data   =*/ size == 0 ? NULL : malloc(size),
+        /*buf.size   =*/ size,
+        /*buf.offset =*/ 0,
+    };
+
+    return buf;
+}
+
+static void gguf_buf_free(struct gguf_buf buf) {
+    if (buf.data) {
+        free(buf.data);
+    }
+}
+
+static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
+    if (buf->offset + size > buf->size) {
+        buf->size = 1.5*(buf->offset + size);
+        if (buf->data) {
+            buf->data = realloc(buf->data, buf->size);
+        }
+    }
+}
+
+static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
+    gguf_buf_grow(buf, sizeof(val->n) + val->n);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
+    buf->offset += sizeof(val->n);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, val->data, val->n);
+    buf->offset += val->n;
+}
+
+static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
+    gguf_buf_grow(buf, el_size);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, val, el_size);
+    buf->offset += el_size;
+}
+
+static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
     // write header
-    gguf_fwrite_el(file, &ctx->header.magic,     sizeof(ctx->header.magic));
-    gguf_fwrite_el(file, &ctx->header.version,   sizeof(ctx->header.version));
-    gguf_fwrite_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
-    gguf_fwrite_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
+    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
+    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
+    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
+    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
 
     // write key-value pairs
     for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
         struct gguf_kv * kv = &ctx->kv[i];
 
-        gguf_fwrite_str(file, &kv->key);
-        gguf_fwrite_el (file, &kv->type, sizeof(kv->type));
+        gguf_bwrite_str(buf, &kv->key);
+        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
 
         switch (kv->type) {
-            case GGUF_TYPE_UINT8:   gguf_fwrite_el (file, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
-            case GGUF_TYPE_INT8:    gguf_fwrite_el (file, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
-            case GGUF_TYPE_UINT16:  gguf_fwrite_el (file, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
-            case GGUF_TYPE_INT16:   gguf_fwrite_el (file, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
-            case GGUF_TYPE_UINT32:  gguf_fwrite_el (file, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
-            case GGUF_TYPE_INT32:   gguf_fwrite_el (file, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
-            case GGUF_TYPE_FLOAT32: gguf_fwrite_el (file, &kv->value.float32, sizeof(kv->value.float32)); break;
-            case GGUF_TYPE_BOOL:    gguf_fwrite_el (file, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
-            case GGUF_TYPE_STRING:  gguf_fwrite_str(file, &kv->value.str ); break;
+            case GGUF_TYPE_UINT8:   gguf_bwrite_el (buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
+            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
+            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
+            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
+            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
+            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
+            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
+            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str ); break;
             case GGUF_TYPE_ARRAY:
                 {
-                    gguf_fwrite_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type));
-                    gguf_fwrite_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
+                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
+                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
 
                     switch (kv->value.arr.type) {
                         case GGUF_TYPE_UINT8:
@@ -19359,12 +19435,12 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
                         case GGUF_TYPE_FLOAT32:
                         case GGUF_TYPE_BOOL:
                             {
-                                gguf_fwrite_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
                             } break;
                         case GGUF_TYPE_STRING:
                             {
                                 for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
-                                    gguf_fwrite_str(file, &((struct gguf_str *) kv->value.arr.data)[j]);
+                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
                                 }
                             } break;
                         case GGUF_TYPE_ARRAY:
@@ -19379,28 +19455,32 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
     for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
         struct gguf_tensor_info * info = &ctx->infos[i];
 
-        gguf_fwrite_str(file, &info->name);
-        gguf_fwrite_el (file, &info->n_dims, sizeof(info->n_dims));
+        gguf_bwrite_str(buf, &info->name);
+        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
         for (uint32_t j = 0; j < info->n_dims; ++j) {
-            gguf_fwrite_el(file, &info->ne[j], sizeof(info->ne[j]));
+            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
        }
-        gguf_fwrite_el (file, &info->type,   sizeof(info->type));
-        gguf_fwrite_el (file, &info->offset, sizeof(info->offset));
+        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
+        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
     }
 
     // we require the data section to be aligned, so take into account any padding
     {
-        const size_t offset = ftell(file);
+        const size_t offset = buf->offset;
         const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
 
         if (offset_pad != offset) {
             uint8_t pad = 0;
             for (size_t i = 0; i < offset_pad - offset; ++i) {
-                gguf_fwrite_el(file, &pad, sizeof(pad));
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
             }
         }
     }
 
+    if (only_meta) {
+        return;
+    }
+
     size_t offset = 0;
 
     // write tensor data
@@ -19410,12 +19490,12 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
         const size_t size     = info->size;
         const size_t size_pad = GGML_PAD(size, ctx->alignment);
 
-        gguf_fwrite_el(file, info->data, size);
+        gguf_bwrite_el(buf, info->data, size);
 
         if (size_pad != size) {
             uint8_t pad = 0;
             for (size_t j = 0; j < size_pad - size; ++j) {
-                gguf_fwrite_el(file, &pad, sizeof(pad));
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
             }
         }
 
@@ -19423,10 +19503,44 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
         offset += size_pad;
     }
+}
+
+void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, only_meta);
+
+    fwrite(buf.data, 1, buf.offset, file);
+
+    gguf_buf_free(buf);
 
     fclose(file);
 }
 
+size_t gguf_get_meta_size(struct gguf_context * ctx) {
+    // no allocs - only compute size
+    struct gguf_buf buf = gguf_buf_init(0);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    return buf.offset;
+}
+
+void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    memcpy(data, buf.data, buf.offset);
+
+    gguf_buf_free(buf);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 int ggml_cpu_has_avx(void) {
diff --git a/ggml.h b/ggml.h
index 368cb00cb..3eb6acb10 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1712,7 +1712,6 @@ extern "C" {
     // gguf
     //
 
-    // TODO: can be removed if the API is extended for writing
     enum gguf_type {
         GGUF_TYPE_UINT8   = 0,
         GGUF_TYPE_INT8    = 1,
@@ -1739,7 +1738,8 @@ extern "C" {
     GGML_API struct gguf_context * gguf_init_empty(void);
     GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
     //GGML_API struct gguf_context * gguf_init_from_buffer(..);
-    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
 
     GGML_API int    gguf_get_version    (struct gguf_context * ctx);
     GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
@@ -1770,6 +1770,7 @@ extern "C" {
     GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
 
     GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
     GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
 
@@ -1789,17 +1790,35 @@ extern "C" {
     // set or add KV pairs from another context
     GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
 
+    // manage tensor info
     GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
 
-    // same as gguf_add_tensor, but allows to override tensor data
-    GGML_API void gguf_add_tensor_ex(
-            struct gguf_context * ctx,
-            const struct ggml_tensor * tensor,
-            enum ggml_type type,
-            const void * data,
-            size_t size);
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname, false);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(..., f); // write tensor data
+    //   void * data = malloc(gguf_get_meta_size(ctx)); gguf_get_meta_data(ctx, data);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(data, 1, gguf_get_meta_size(ctx), f);
+    //   free(data);
+    //   fclose(f);
+    //
 
-    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname);
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
 
     //
     // system info
diff --git a/gguf-llama.cpp b/gguf-llama.cpp
index e73806044..bc9559130 100644
--- a/gguf-llama.cpp
+++ b/gguf-llama.cpp
@@ -83,6 +83,13 @@ static std::string to_string(const T & val) {
     return ss.str();
 }
 
+static void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
+
 #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
@@ -3049,7 +3056,6 @@ static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, std::
     for (auto & worker : workers) {
         worker.join();
     }
-
 }
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
@@ -3087,6 +3093,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
+    const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
@@ -3125,7 +3132,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
 
-    std::vector<std::vector<uint8_t>> work_map(model_loader->tensors_map.tensors.size());
+    for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
+        gguf_add_tensor(ctx_out, tensor.ggml_tensor);
+    }
+
+    std::ofstream fout(fname_out, std::ios::binary);
+
+    const size_t meta_size = gguf_get_meta_size(ctx_out);
+
+    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+
+    // placeholder for the meta data
+    ::zeros(fout, meta_size);
 
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
         read_data.resize(tensor.size);
@@ -3286,13 +3304,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_org += tensor.size;
         total_size_new += new_size;
 
-        // TODO: temp fix until we have stream support in gguf
-        work_map[idx - 1] = std::vector<uint8_t>((char *) new_data, (char *) new_data + new_size);
+        // update the gguf meta data as we go
+        gguf_set_tensor_type(ctx_out, tensor.name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_out, tensor.name.c_str(), new_data, new_size);
 
-        gguf_add_tensor_ex(ctx_out, tensor.ggml_tensor, new_type, work_map[idx - 1].data(), new_size);
+        // write tensor data + padding
+        fout.write((const char *) new_data, new_size);
+        zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
 
-    gguf_write_to_file(ctx_out, fname_out.c_str());
+    // go back to beginning of file and write the updated meta data
+    {
+        fout.seekp(0);
+        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+        gguf_get_meta_data(ctx_out, data.data());
+        fout.write((const char *) data.data(), data.size());
+    }
+
+    fout.close();
+    gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
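
Usage note: the snippet below is a minimal sketch (not part of the patch) of the two-pass write flow that the new gguf_get_meta_size()/gguf_get_meta_data() API enables, mirroring the placeholder-then-rewrite pattern used in llama_model_quantize_internal above. The helper name write_gguf_streaming and the tensors/n_tensors parameters are hypothetical stand-ins for the caller's own bookkeeping; it assumes the KV pairs and tensor infos were already registered on ctx via gguf_add_tensor().

// Sketch of the streaming ("two-pass") gguf write flow, under the assumptions above.
#include <stdio.h>
#include <stdlib.h>

#include "ggml.h"

static void write_gguf_streaming(struct gguf_context * ctx, const char * fname,
                                 struct ggml_tensor ** tensors, int n_tensors) {
    const size_t align = GGUF_DEFAULT_ALIGNMENT;

    FILE * f = fopen(fname, "wb");
    if (!f) {
        return;
    }

    // pass 1: write a zero-filled placeholder where the meta data will go
    const size_t meta_size = gguf_get_meta_size(ctx);
    for (size_t i = 0; i < meta_size; ++i) {
        fputc(0, f);
    }

    // pass 1 (cont.): stream each tensor's data, padded to the gguf alignment
    for (int i = 0; i < n_tensors; ++i) {
        struct ggml_tensor * t = tensors[i];

        const size_t size     = ggml_nbytes(t);
        const size_t size_pad = size + (align - size % align) % align; // same rounding as GGML_PAD

        // record where this tensor's data comes from, so the offsets in the
        // meta data stay in sync with what is written below
        gguf_set_tensor_data(ctx, t->name, t->data, size);

        fwrite(t->data, 1, size, f);
        for (size_t j = size; j < size_pad; ++j) {
            fputc(0, f);
        }
    }

    // pass 2: rewind and overwrite the placeholder with the finalized meta data
    void * meta = malloc(meta_size);
    gguf_get_meta_data(ctx, meta);

    fseek(f, 0, SEEK_SET);
    fwrite(meta, 1, meta_size, f);

    free(meta);
    fclose(f);
}

This keeps memory usage bounded by the meta data plus one tensor at a time, which is the point of the streaming support: the full tensor payload never has to be staged in a gguf_buf before hitting disk.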