gguf : streaming support when writing files

Georgi Gerganov 2023-08-15 19:57:37 +03:00
parent 4ef5e792e3
commit f7a6aa9911
4 changed files with 233 additions and 70 deletions

examples/gguf/gguf.cpp

@@ -72,7 +72,7 @@ bool gguf_ex_write(const std::string & fname) {
         gguf_add_tensor(ctx, cur);
     }
 
-    gguf_write_to_file(ctx, fname.c_str());
+    gguf_write_to_file(ctx, fname.c_str(), false);
 
     fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
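
The only change needed at existing call sites is the new trailing bool: false writes the whole file, true stops after the meta data. A minimal sketch of both modes, assuming a ctx populated as in gguf_ex_write above (the filenames are illustrative):

    #include "ggml.h"

    // illustrative only: ctx populated as in gguf_ex_write above
    static void write_both_modes(struct gguf_context * ctx) {
        // single pass: header, KV pairs, tensor infos and tensor data
        gguf_write_to_file(ctx, "example.gguf", /*only_meta =*/ false);

        // meta data only (header, KV pairs, tensor infos and alignment padding);
        // tensor data can be streamed in separately afterwards
        gguf_write_to_file(ctx, "example-meta.gguf", /*only_meta =*/ true);
    }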

ggml.c (208 changed lines)

@@ -19123,6 +19123,22 @@ int gguf_get_n_tensors(struct gguf_context * ctx) {
     return ctx->header.n_tensors;
 }
 
+int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+    // return -1 if tensor not found
+    int tensorfound = -1;
+
+    const int n_tensors = gguf_get_n_tensors(ctx);
+
+    for (int i = 0; i < n_tensors; ++i) {
+        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
+            tensorfound = i;
+            break;
+        }
+    }
+
+    return tensorfound;
+}
+
 size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
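
gguf_find_tensor does a linear scan over the tensor infos and returns -1 when the name is missing, so callers doing repeated lookups may want to cache the index. A small usage sketch, assuming a context already loaded with gguf_init_from_file (the tensor name is hypothetical):

    #include <cstdio>
    #include "ggml.h"

    // hedged sketch: look up one tensor by name in an already-loaded context
    static void print_tensor_offset(struct gguf_context * ctx) {
        const int idx = gguf_find_tensor(ctx, "output.weight"); // hypothetical name
        if (idx >= 0) {
            printf("%s: offset = %zu\n", gguf_get_tensor_name(ctx, idx), gguf_get_tensor_offset(ctx, idx));
        }
    }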
@@ -19269,12 +19285,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
     }
 }
 
-void gguf_add_tensor_ex(
-        struct gguf_context * ctx,
-        const struct ggml_tensor * tensor,
-        enum ggml_type type,
-        const void * data,
-        size_t size) {
+void gguf_add_tensor(
+        struct gguf_context * ctx,
+        const struct ggml_tensor * tensor) {
     const int idx = ctx->header.n_tensors;
     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
@@ -19290,10 +19303,10 @@ void gguf_add_tensor_ex(
         ctx->infos[idx].ne[i] = tensor->ne[i];
     }
 
-    ctx->infos[idx].type   = type;
+    ctx->infos[idx].type   = tensor->type;
     ctx->infos[idx].offset = 0;
-    ctx->infos[idx].data   = data;
-    ctx->infos[idx].size   = size;
+    ctx->infos[idx].data   = tensor->data;
+    ctx->infos[idx].size   = ggml_nbytes(tensor);
 
     if (ctx->header.n_tensors > 0) {
         ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
@@ -19302,52 +19315,115 @@ void gguf_add_tensor_ex(
     ctx->header.n_tensors++;
 }
 
-void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor) {
-    gguf_add_tensor_ex(ctx, tensor, tensor->type, tensor->data, ggml_nbytes(tensor));
-}
-
-static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
-    fwrite(&val->n, sizeof(val->n), 1, file);
-    fwrite(val->data, sizeof(char), val->n, file);
-}
-
-static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
-    fwrite(val, sizeof(char), size, file);
-}
-
-void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
-    FILE * file = fopen(fname, "wb");
-    if (!file) {
-        GGML_ASSERT(false && "failed to open file for writing");
-    }
-
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].type = type;
+}
+
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].data = data;
+    ctx->infos[idx].size = size;
+
+    // update offsets
+    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
+        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
+    }
+}
+
+//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
+//    fwrite(&val->n, sizeof(val->n), 1, file);
+//    fwrite(val->data, sizeof(char), val->n, file);
+//}
+//
+//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
+//    fwrite(val, sizeof(char), size, file);
+//}
+
+struct gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+
+static struct gguf_buf gguf_buf_init(size_t size) {
+    struct gguf_buf buf = {
+        /*buf.data   =*/ size == 0 ? NULL : malloc(size),
+        /*buf.size   =*/ size,
+        /*buf.offset =*/ 0,
+    };
+
+    return buf;
+}
+
+static void gguf_buf_free(struct gguf_buf buf) {
+    if (buf.data) {
+        free(buf.data);
+    }
+}
+
+static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
+    if (buf->offset + size > buf->size) {
+        buf->size = 1.5*(buf->offset + size);
+        if (buf->data) {
+            buf->data = realloc(buf->data, buf->size);
+        }
+    }
+}
+
+static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
+    gguf_buf_grow(buf, sizeof(val->n) + val->n);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
+    buf->offset += sizeof(val->n);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, val->data, val->n);
+    buf->offset += val->n;
+}
+
+static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
+    gguf_buf_grow(buf, el_size);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, val, el_size);
+    buf->offset += el_size;
+}
+
+static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
     // write header
-    gguf_fwrite_el(file, &ctx->header.magic,     sizeof(ctx->header.magic));
-    gguf_fwrite_el(file, &ctx->header.version,   sizeof(ctx->header.version));
-    gguf_fwrite_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
-    gguf_fwrite_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
+    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
+    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
+    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
+    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
 
     // write key-value pairs
     for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
         struct gguf_kv * kv = &ctx->kv[i];
 
-        gguf_fwrite_str(file, &kv->key);
-        gguf_fwrite_el (file, &kv->type, sizeof(kv->type));
+        gguf_bwrite_str(buf, &kv->key);
+        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
 
         switch (kv->type) {
-            case GGUF_TYPE_UINT8:   gguf_fwrite_el (file, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
-            case GGUF_TYPE_INT8:    gguf_fwrite_el (file, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
-            case GGUF_TYPE_UINT16:  gguf_fwrite_el (file, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
-            case GGUF_TYPE_INT16:   gguf_fwrite_el (file, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
-            case GGUF_TYPE_UINT32:  gguf_fwrite_el (file, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
-            case GGUF_TYPE_INT32:   gguf_fwrite_el (file, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
-            case GGUF_TYPE_FLOAT32: gguf_fwrite_el (file, &kv->value.float32, sizeof(kv->value.float32)); break;
-            case GGUF_TYPE_BOOL:    gguf_fwrite_el (file, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
-            case GGUF_TYPE_STRING:  gguf_fwrite_str(file, &kv->value.str                               ); break;
+            case GGUF_TYPE_UINT8:   gguf_bwrite_el (buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
+            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
+            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
+            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
+            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
+            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
+            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
+            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
             case GGUF_TYPE_ARRAY:
                 {
-                    gguf_fwrite_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type));
-                    gguf_fwrite_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
+                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
+                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
 
                     switch (kv->value.arr.type) {
                         case GGUF_TYPE_UINT8:
@@ -19359,12 +19435,12 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
                        case GGUF_TYPE_FLOAT32:
                        case GGUF_TYPE_BOOL:
                            {
-                                gguf_fwrite_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
                            } break;
                        case GGUF_TYPE_STRING:
                            {
                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
-                                    gguf_fwrite_str(file, &((struct gguf_str *) kv->value.arr.data)[j]);
+                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
                                }
                            } break;
                        case GGUF_TYPE_ARRAY:
@@ -19379,28 +19455,32 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
     for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
         struct gguf_tensor_info * info = &ctx->infos[i];
 
-        gguf_fwrite_str(file, &info->name);
-        gguf_fwrite_el (file, &info->n_dims, sizeof(info->n_dims));
+        gguf_bwrite_str(buf, &info->name);
+        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
         for (uint32_t j = 0; j < info->n_dims; ++j) {
-            gguf_fwrite_el(file, &info->ne[j], sizeof(info->ne[j]));
+            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
         }
-        gguf_fwrite_el (file, &info->type,   sizeof(info->type));
-        gguf_fwrite_el (file, &info->offset, sizeof(info->offset));
+        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
+        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
     }
 
     // we require the data section to be aligned, so take into account any padding
     {
-        const size_t offset = ftell(file);
+        const size_t offset = buf->offset;
         const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
 
         if (offset_pad != offset) {
             uint8_t pad = 0;
             for (size_t i = 0; i < offset_pad - offset; ++i) {
-                gguf_fwrite_el(file, &pad, sizeof(pad));
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
             }
         }
     }
 
+    if (only_meta) {
+        return;
+    }
+
     size_t offset = 0;
 
     // write tensor data
@@ -19410,12 +19490,12 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
         const size_t size     = info->size;
         const size_t size_pad = GGML_PAD(size, ctx->alignment);
 
-        gguf_fwrite_el(file, info->data, size);
+        gguf_bwrite_el(buf, info->data, size);
 
         if (size_pad != size) {
             uint8_t pad = 0;
             for (size_t j = 0; j < size_pad - size; ++j) {
-                gguf_fwrite_el(file, &pad, sizeof(pad));
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
             }
         }
@@ -19423,10 +19503,44 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
         offset += size_pad;
     }
+}
+
+void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, only_meta);
+
+    fwrite(buf.data, 1, buf.offset, file);
+
+    gguf_buf_free(buf);
 
     fclose(file);
 }
 
+size_t gguf_get_meta_size(struct gguf_context * ctx) {
+    // no allocs - only compute size
+    struct gguf_buf buf = gguf_buf_init(0);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    return buf.offset;
+}
+
+void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    memcpy(data, buf.data, buf.offset);
+
+    gguf_buf_free(buf);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 int ggml_cpu_has_avx(void) {
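
A detail worth noting in gguf_bwrite_str and gguf_bwrite_el: the memcpy is guarded by `buf->data &&`, but buf->offset always advances. A buffer created with gguf_buf_init(0) therefore acts as a pure byte counter, which is exactly how gguf_get_meta_size measures the meta data without allocating. A minimal standalone sketch of the same pattern (the byte_sink and sink_write names are invented for this illustration):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // measuring sink: with data == nullptr it only counts bytes, mirroring how
    // gguf_get_meta_size() reuses the gguf_write_to_buf() path with a 0-size buffer
    struct byte_sink {
        void * data;   // nullptr => dry run, only measure
        size_t offset; // bytes written (real or simulated)
    };

    static void sink_write(byte_sink * s, const void * val, size_t size) {
        if (s->data) {
            memcpy((char *) s->data + s->offset, val, size);
        }
        s->offset += size; // advance even when only measuring
    }

    int main() {
        const uint32_t magic = 0x46554747; // the bytes "GGUF"

        byte_sink meter = { nullptr, 0 };
        sink_write(&meter, &magic, sizeof(magic));

        printf("meta pass would need %zu bytes\n", meter.offset); // prints 4
        return 0;
    }

The commit's `buf->data && memcpy(...)` is the same guard written as a short-circuit expression; an explicit if, as above, says the same thing more plainly.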

ggml.h (37 changed lines)

@@ -1712,7 +1712,6 @@ extern "C" {
     // gguf
     //
 
-    // TODO: can be removed if the API is extended for writing
     enum gguf_type {
         GGUF_TYPE_UINT8  = 0,
         GGUF_TYPE_INT8   = 1,
@@ -1739,6 +1738,7 @@ extern "C" {
     GGML_API struct gguf_context * gguf_init_empty(void);
     GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
     //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
     GGML_API void gguf_free(struct gguf_context * ctx);
 
     GGML_API int gguf_get_version (struct gguf_context * ctx);
@@ -1770,6 +1770,7 @@ extern "C" {
     GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
 
     GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
     GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
@@ -1789,17 +1790,35 @@ extern "C" {
     // set or add KV pairs from another context
     GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
 
+    // manage tensor info
     GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
 
-    // same as gguf_add_tensor, but allows to override tensor data
-    GGML_API void gguf_add_tensor_ex(
-            struct gguf_context * ctx,
-            const struct ggml_tensor * tensor,
-            enum ggml_type type,
-            const void * data,
-            size_t size);
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(f, ...);
+    //   void * data = gguf_meta_get_meta_data(ctx);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(f, data, gguf_get_meta_size(ctx));
+    //   free(data);
+    //   fclose(f);
+    //
 
-    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname);
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
 
     //
     // system info
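
The second usage comment above appears to carry two slips from the commit itself: `gguf_meta_get_meta_data` should presumably be `gguf_get_meta_data` (declared just below it), and fwrite takes the FILE * last, not first. A corrected sketch of the intended two-pass flow, assuming a populated ctx and a hypothetical tensor_data/tensor_size payload (error handling omitted):

    #include <cstdio>
    #include <cstdlib>
    #include "ggml.h"

    static void write_two_pass(struct gguf_context * ctx, const char * fname,
                               const void * tensor_data, size_t tensor_size) {
        FILE * f = fopen(fname, "wb");

        // pass 1: skip over the (not yet final) meta data and stream the tensor data
        const size_t meta_size = gguf_get_meta_size(ctx);
        fseek(f, (long) meta_size, SEEK_SET);
        fwrite(tensor_data, 1, tensor_size, f);

        // pass 2: render the meta data and backpatch it at the start of the file
        void * meta = malloc(meta_size);
        gguf_get_meta_data(ctx, meta);
        fseek(f, 0, SEEK_SET);
        fwrite(meta, 1, meta_size, f);

        free(meta);
        fclose(f);
    }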

llama.cpp

@@ -83,6 +83,13 @@ static std::string to_string(const T & val) {
     return ss.str();
 }
 
+static void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
+
 #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
@@ -3049,7 +3056,6 @@ static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, std::
     for (auto & worker : workers) {
         worker.join();
     }
-
 }
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
@@ -3087,6 +3093,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
+    const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
@@ -3125,7 +3132,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
-    std::vector<std::vector<uint8_t>> work_map(model_loader->tensors_map.tensors.size());
+
+    for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
+        gguf_add_tensor(ctx_out, tensor.ggml_tensor);
+    }
+
+    std::ofstream fout(fname_out, std::ios::binary);
+
+    const size_t meta_size = gguf_get_meta_size(ctx_out);
+
+    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+
+    // placeholder for the meta data
+    ::zeros(fout, meta_size);
 
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
         read_data.resize(tensor.size);
@@ -3286,13 +3304,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_org += tensor.size;
         total_size_new += new_size;
 
-        // TODO: temp fix until we have stream support in gguf
-        work_map[idx - 1] = std::vector<uint8_t>((char *) new_data, (char *) new_data + new_size);
+        // update the gguf meta data as we go
+        gguf_set_tensor_type(ctx_out, tensor.name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_out, tensor.name.c_str(), new_data, new_size);
 
-        gguf_add_tensor_ex(ctx_out, tensor.ggml_tensor, new_type, work_map[idx - 1].data(), new_size);
+        // write tensor data + padding
+        fout.write((const char *) new_data, new_size);
+        zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
 
-    gguf_write_to_file(ctx_out, fname_out.c_str());
+    // go back to beginning of file and write the updated meta data
+    {
+        fout.seekp(0);
+        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+        gguf_get_meta_data(ctx_out, data.data());
+        fout.write((const char *) data.data(), data.size());
+    }
+
+    fout.close();
+
     gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
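
Taken together, the llama.cpp changes implement the placeholder-then-backpatch pattern described in the ggml.h comment, with std::ofstream instead of FILE *. A condensed, hedged sketch of the write order (quantized_tensor is an invented container standing in for the per-tensor results of the loop above):

    #include <cstdint>
    #include <fstream>
    #include <string>
    #include <vector>
    #include "ggml.h"

    // same role as the zeros() helper added above
    static void zeros(std::ofstream & file, size_t n) {
        char zero = 0;
        for (size_t i = 0; i < n; ++i) {
            file.write(&zero, 1);
        }
    }

    // invented container for the results of the quantization loop
    struct quantized_tensor {
        std::string          name;
        enum ggml_type       new_type;
        std::vector<uint8_t> data;
    };

    static void write_quantized(struct gguf_context * ctx_out, const std::string & fname_out,
                                const std::vector<quantized_tensor> & tensors) {
        const size_t align = GGUF_DEFAULT_ALIGNMENT;

        std::ofstream fout(fname_out, std::ios::binary);

        // 1) reserve a zeroed placeholder where the meta data will go
        zeros(fout, gguf_get_meta_size(ctx_out));

        // 2) stream tensor data + padding, keeping ctx_out in sync so the
        //    tensor offsets stored in the meta data stay correct
        for (const auto & t : tensors) {
            gguf_set_tensor_type(ctx_out, t.name.c_str(), t.new_type);
            gguf_set_tensor_data(ctx_out, t.name.c_str(), t.data.data(), t.data.size());

            fout.write((const char *) t.data.data(), t.data.size());
            zeros(fout, GGML_PAD(t.data.size(), align) - t.data.size());
        }

        // 3) the meta data now reflects the final types/sizes/offsets: backpatch it
        fout.seekp(0);
        std::vector<uint8_t> meta(gguf_get_meta_size(ctx_out));
        gguf_get_meta_data(ctx_out, meta.data());
        fout.write((const char *) meta.data(), meta.size());
    }

The backpatch is safe because the meta data size does not depend on the tensor payloads: gguf_set_tensor_type and gguf_set_tensor_data only change the values of fixed-size fields (type, size, offsets), so the placeholder reserved in step 1 is exactly the right size.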