From f7a6aa99112c843b5d17f0a29d79215aa61155be Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Tue, 15 Aug 2023 19:57:37 +0300
Subject: [PATCH] gguf : streaming support when writing files

---
 examples/gguf/gguf.cpp |   2 +-
 ggml.c                 | 220 +++++++++++++++++++++++++++++++----------
 ggml.h                 |  39 ++++++--
 gguf-llama.cpp         |  42 ++++++--
 4 files changed, 233 insertions(+), 70 deletions(-)

diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp
index f67af1416..74a447c07 100644
--- a/examples/gguf/gguf.cpp
+++ b/examples/gguf/gguf.cpp
@@ -72,7 +72,7 @@ bool gguf_ex_write(const std::string & fname) {
         gguf_add_tensor(ctx, cur);
     }
 
-    gguf_write_to_file(ctx, fname.c_str());
+    gguf_write_to_file(ctx, fname.c_str(), false);
 
     fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
 
diff --git a/ggml.c b/ggml.c
index ead9ab526..7549566aa 100644
--- a/ggml.c
+++ b/ggml.c
@@ -19123,6 +19123,22 @@ int gguf_get_n_tensors(struct gguf_context * ctx) {
     return ctx->header.n_tensors;
 }
 
+int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+    // return -1 if tensor not found
+    int tensorfound = -1;
+
+    const int n_tensors = gguf_get_n_tensors(ctx);
+
+    for (int i = 0; i < n_tensors; ++i) {
+        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
+            tensorfound = i;
+            break;
+        }
+    }
+
+    return tensorfound;
+}
+
 size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
@@ -19269,12 +19285,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
     }
 }
 
-void gguf_add_tensor_ex(
+void gguf_add_tensor(
         struct gguf_context * ctx,
-        const struct ggml_tensor * tensor,
-        enum ggml_type type,
-        const void * data,
-        size_t size) {
+        const struct ggml_tensor * tensor) {
     const int idx = ctx->header.n_tensors;
     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
@@ -19290,10 +19303,10 @@ void gguf_add_tensor_ex(
         ctx->infos[idx].ne[i] = tensor->ne[i];
     }
 
-    ctx->infos[idx].type   = type;
+    ctx->infos[idx].type   = tensor->type;
     ctx->infos[idx].offset = 0;
-    ctx->infos[idx].data   = data;
-    ctx->infos[idx].size   = size;
+    ctx->infos[idx].data   = tensor->data;
+    ctx->infos[idx].size   = ggml_nbytes(tensor);
 
     if (ctx->header.n_tensors > 0) {
         ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
@@ -19302,52 +19315,115 @@ void gguf_add_tensor_ex(
     ctx->header.n_tensors++;
 }
 
-void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor) {
-    gguf_add_tensor_ex(ctx, tensor, tensor->type, tensor->data, ggml_nbytes(tensor));
-}
-
-static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
-    fwrite(&val->n, sizeof(val->n), 1, file);
-    fwrite(val->data, sizeof(char), val->n, file);
-}
-
-static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
-    fwrite(val, sizeof(char), size, file);
-}
-
-void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
-    FILE * file = fopen(fname, "wb");
-    if (!file) {
-        GGML_ASSERT(false && "failed to open file for writing");
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
     }
 
+    ctx->infos[idx].type = type;
+}
+
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].data = data;
+    ctx->infos[idx].size = size;
+
+    // update offsets
+    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
+        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
+    }
+}
+
+//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
+//    fwrite(&val->n, sizeof(val->n), 1, file);
+//    fwrite(val->data, sizeof(char), val->n, file);
+//}
+//
+//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
+//    fwrite(val, sizeof(char), size, file);
+//}
+
+struct gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+
+static struct gguf_buf gguf_buf_init(size_t size) {
+    struct gguf_buf buf = {
+        /*buf.data   =*/ size == 0 ? NULL : malloc(size),
+        /*buf.size   =*/ size,
+        /*buf.offset =*/ 0,
+    };
+
+    return buf;
+}
+
+static void gguf_buf_free(struct gguf_buf buf) {
+    if (buf.data) {
+        free(buf.data);
+    }
+}
+
+static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
+    if (buf->offset + size > buf->size) {
+        buf->size = 1.5*(buf->offset + size);
+        if (buf->data) {
+            buf->data = realloc(buf->data, buf->size);
+        }
+    }
+}
+
+static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
+    gguf_buf_grow(buf, sizeof(val->n) + val->n);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
+    buf->offset += sizeof(val->n);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, val->data, val->n);
+    buf->offset += val->n;
+}
+
+static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
+    gguf_buf_grow(buf, el_size);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, val, el_size);
+    buf->offset += el_size;
+}
+
+static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
     // write header
-    gguf_fwrite_el(file, &ctx->header.magic,     sizeof(ctx->header.magic));
-    gguf_fwrite_el(file, &ctx->header.version,   sizeof(ctx->header.version));
-    gguf_fwrite_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
-    gguf_fwrite_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
+    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
+    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
+    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
+    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
 
     // write key-value pairs
     for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
         struct gguf_kv * kv = &ctx->kv[i];
 
-        gguf_fwrite_str(file, &kv->key);
-        gguf_fwrite_el (file, &kv->type, sizeof(kv->type));
+        gguf_bwrite_str(buf, &kv->key);
+        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
 
         switch (kv->type) {
-            case GGUF_TYPE_UINT8:   gguf_fwrite_el (file, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
-            case GGUF_TYPE_INT8:    gguf_fwrite_el (file, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
-            case GGUF_TYPE_UINT16:  gguf_fwrite_el (file, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
-            case GGUF_TYPE_INT16:   gguf_fwrite_el (file, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
-            case GGUF_TYPE_UINT32:  gguf_fwrite_el (file, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
-            case GGUF_TYPE_INT32:   gguf_fwrite_el (file, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
-            case GGUF_TYPE_FLOAT32: gguf_fwrite_el (file, &kv->value.float32, sizeof(kv->value.float32)); break;
-            case GGUF_TYPE_BOOL:    gguf_fwrite_el (file, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
-            case GGUF_TYPE_STRING:  gguf_fwrite_str(file, &kv->value.str ); break;
+            case GGUF_TYPE_UINT8:   gguf_bwrite_el (buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
+            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
+            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
+            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
+            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
+            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
+            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
+            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str ); break;
             case GGUF_TYPE_ARRAY:
                 {
-                    gguf_fwrite_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type));
-                    gguf_fwrite_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
+                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
+                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
 
                     switch (kv->value.arr.type) {
                         case GGUF_TYPE_UINT8:
@@ -19359,12 +19435,12 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
                         case GGUF_TYPE_FLOAT32:
                         case GGUF_TYPE_BOOL:
                             {
-                                gguf_fwrite_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
                             } break;
                         case GGUF_TYPE_STRING:
                             {
                                 for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
-                                    gguf_fwrite_str(file, &((struct gguf_str *) kv->value.arr.data)[j]);
+                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
                                 }
                             } break;
                         case GGUF_TYPE_ARRAY:
@@ -19379,28 +19455,32 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
     for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
         struct gguf_tensor_info * info = &ctx->infos[i];
 
-        gguf_fwrite_str(file, &info->name);
-        gguf_fwrite_el (file, &info->n_dims, sizeof(info->n_dims));
+        gguf_bwrite_str(buf, &info->name);
+        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
         for (uint32_t j = 0; j < info->n_dims; ++j) {
-            gguf_fwrite_el(file, &info->ne[j], sizeof(info->ne[j]));
+            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
        }
-        gguf_fwrite_el (file, &info->type,   sizeof(info->type));
-        gguf_fwrite_el (file, &info->offset, sizeof(info->offset));
+        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
+        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
     }
 
     // we require the data section to be aligned, so take into account any padding
     {
-        const size_t offset = ftell(file);
+        const size_t offset = buf->offset;
         const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
 
         if (offset_pad != offset) {
             uint8_t pad = 0;
             for (size_t i = 0; i < offset_pad - offset; ++i) {
-                gguf_fwrite_el(file, &pad, sizeof(pad));
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
             }
         }
     }
 
+    if (only_meta) {
+        return;
+    }
+
     size_t offset = 0;
 
     // write tensor data
@@ -19410,12 +19490,12 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
         const size_t size     = info->size;
         const size_t size_pad = GGML_PAD(size, ctx->alignment);
 
-        gguf_fwrite_el(file, info->data, size);
+        gguf_bwrite_el(buf, info->data, size);
 
         if (size_pad != size) {
             uint8_t pad = 0;
             for (size_t j = 0; j < size_pad - size; ++j) {
-                gguf_fwrite_el(file, &pad, sizeof(pad));
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
             }
         }
 
@@ -19423,10 +19503,44 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
         offset += size_pad;
     }
+}
+
+void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, only_meta);
+
+    fwrite(buf.data, 1, buf.offset, file);
+
+    gguf_buf_free(buf);
 
     fclose(file);
 }
 
+size_t gguf_get_meta_size(struct gguf_context * ctx) {
+    // no allocs - only compute size
+    struct gguf_buf buf = gguf_buf_init(0);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    return buf.offset;
+}
+
+void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    memcpy(data, buf.data, buf.offset);
+
+    gguf_buf_free(buf);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 int ggml_cpu_has_avx(void) {
diff --git a/ggml.h b/ggml.h
index 368cb00cb..3eb6acb10 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1712,7 +1712,6 @@ extern "C" {
     // gguf
     //
 
-    // TODO: can be removed if the API is extended for writing
     enum gguf_type {
         GGUF_TYPE_UINT8   = 0,
         GGUF_TYPE_INT8    = 1,
@@ -1739,7 +1738,8 @@ extern "C" {
     GGML_API struct gguf_context * gguf_init_empty(void);
     GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
     //GGML_API struct gguf_context * gguf_init_from_buffer(..);
-    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
 
     GGML_API int    gguf_get_version    (struct gguf_context * ctx);
     GGML_API size_t gguf_get_alignment  (struct gguf_context * ctx);
@@ -1770,6 +1770,7 @@ extern "C" {
     GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
 
     GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
    GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
     GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
 
@@ -1789,17 +1790,35 @@ extern "C" {
     // set or add KV pairs from another context
     GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
 
+    // manage tensor info
     GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
 
-    // same as gguf_add_tensor, but allows to override tensor data
-    GGML_API void gguf_add_tensor_ex(
-            struct gguf_context * ctx,
-            const struct ggml_tensor * tensor,
-            enum ggml_type type,
-            const void * data,
-            size_t size);
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname, false);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(..., f); // write tensor data
+    //   void * data = malloc(gguf_get_meta_size(ctx)); gguf_get_meta_data(ctx, data);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(data, 1, gguf_get_meta_size(ctx), f);
+    //   free(data);
+    //   fclose(f);
+    //
 
-    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname);
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
 
     //
     // system info
diff --git a/gguf-llama.cpp b/gguf-llama.cpp
index e73806044..bc9559130 100644
--- a/gguf-llama.cpp
+++ b/gguf-llama.cpp
@@ -83,6 +83,13 @@ static std::string to_string(const T & val) {
     return ss.str();
 }
 
+static void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
+
 #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
@@ -3049,7 +3056,6 @@ static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, std::
     for (auto & worker : workers) {
         worker.join();
     }
-
 }
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
@@ -3087,6 +3093,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
+    const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
@@ -3125,7 +3132,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
 
-    std::vector<std::vector<uint8_t>> work_map(model_loader->tensors_map.tensors.size());
+    for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
+        gguf_add_tensor(ctx_out, tensor.ggml_tensor);
+    }
+
+    std::ofstream fout(fname_out, std::ios::binary);
+
+    const size_t meta_size = gguf_get_meta_size(ctx_out);
+
+    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+
+    // placeholder for the meta data
+    ::zeros(fout, meta_size);
 
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
         read_data.resize(tensor.size);
@@ -3286,13 +3304,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_org += tensor.size;
         total_size_new += new_size;
 
-        // TODO: temp fix until we have stream support in gguf
-        work_map[idx - 1] = std::vector<uint8_t>((char *) new_data, (char *) new_data + new_size);
+        // update the gguf meta data as we go
+        gguf_set_tensor_type(ctx_out, tensor.name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_out, tensor.name.c_str(), new_data, new_size);
 
-        gguf_add_tensor_ex(ctx_out, tensor.ggml_tensor, new_type, work_map[idx - 1].data(), new_size);
+        // write tensor data + padding
+        fout.write((const char *) new_data, new_size);
+        zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
 
-    gguf_write_to_file(ctx_out, fname_out.c_str());
+    // go back to beginning of file and write the updated meta data
+    {
+        fout.seekp(0);
+        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+        gguf_get_meta_data(ctx_out, data.data());
+        fout.write((const char *) data.data(), data.size());
+    }
+
+    fout.close();
+    gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
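
Usage note: the snippet below is a minimal sketch (not part of the patch) of the two-pass write flow that the new gguf_get_meta_size()/gguf_get_meta_data() API enables, mirroring the placeholder-then-rewrite pattern used in llama_model_quantize_internal above. The helper name write_gguf_streaming and the tensors/n_tensors parameters are hypothetical stand-ins for the caller's own bookkeeping; it assumes the KV pairs and tensor infos were already registered on ctx via gguf_add_tensor().

// Sketch of the streaming ("two-pass") gguf write flow, under the assumptions above.
#include <stdio.h>
#include <stdlib.h>

#include "ggml.h"

static void write_gguf_streaming(struct gguf_context * ctx, const char * fname,
                                 struct ggml_tensor ** tensors, int n_tensors) {
    const size_t align = GGUF_DEFAULT_ALIGNMENT;

    FILE * f = fopen(fname, "wb");
    if (!f) {
        return;
    }

    // pass 1: write a zero-filled placeholder where the meta data will go
    const size_t meta_size = gguf_get_meta_size(ctx);
    for (size_t i = 0; i < meta_size; ++i) {
        fputc(0, f);
    }

    // pass 1 (cont.): stream each tensor's data, padded to the gguf alignment
    for (int i = 0; i < n_tensors; ++i) {
        struct ggml_tensor * t = tensors[i];

        const size_t size     = ggml_nbytes(t);
        const size_t size_pad = size + (align - size % align) % align; // same rounding as GGML_PAD

        // record where this tensor's data comes from, so the offsets in the
        // meta data stay in sync with what is written below
        gguf_set_tensor_data(ctx, t->name, t->data, size);

        fwrite(t->data, 1, size, f);
        for (size_t j = size; j < size_pad; ++j) {
            fputc(0, f);
        }
    }

    // pass 2: rewind and overwrite the placeholder with the finalized meta data
    void * meta = malloc(meta_size);
    gguf_get_meta_data(ctx, meta);

    fseek(f, 0, SEEK_SET);
    fwrite(meta, 1, meta_size, f);

    free(meta);
    fclose(f);
}

This keeps memory usage bounded by the meta data plus one tensor at a time, which is the point of the streaming support: the full tensor payload never has to be staged in a gguf_buf before hitting disk.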