gguf : streaming support when writing files

Georgi Gerganov 2023-08-15 19:57:37 +03:00
parent 4ef5e792e3
commit f7a6aa9911
4 changed files with 233 additions and 70 deletions

examples/gguf/gguf.cpp

@@ -72,7 +72,7 @@ bool gguf_ex_write(const std::string & fname) {
         gguf_add_tensor(ctx, cur);
     }
 
-    gguf_write_to_file(ctx, fname.c_str());
+    gguf_write_to_file(ctx, fname.c_str(), false);
 
     fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
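
The only change needed at existing call sites is the new trailing bool: false writes the whole file, true stops after the meta data. A minimal sketch of both modes, assuming a ctx populated as in gguf_ex_write above (the filenames are illustrative):

    #include "ggml.h"

    // illustrative only: ctx populated as in gguf_ex_write above
    static void write_both_modes(struct gguf_context * ctx) {
        // single pass: header, KV pairs, tensor infos and tensor data
        gguf_write_to_file(ctx, "example.gguf", /*only_meta =*/ false);

        // meta data only (header, KV pairs, tensor infos and alignment padding);
        // tensor data can be streamed in separately afterwards
        gguf_write_to_file(ctx, "example-meta.gguf", /*only_meta =*/ true);
    }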

ggml.c (208 changed lines)

@@ -19123,6 +19123,22 @@ int gguf_get_n_tensors(struct gguf_context * ctx) {
     return ctx->header.n_tensors;
 }
 
+int gguf_find_tensor(struct gguf_context * ctx, const char * name) {
+    // return -1 if tensor not found
+    int tensorfound = -1;
+
+    const int n_tensors = gguf_get_n_tensors(ctx);
+
+    for (int i = 0; i < n_tensors; ++i) {
+        if (strcmp(name, gguf_get_tensor_name(ctx, i)) == 0) {
+            tensorfound = i;
+            break;
+        }
+    }
+
+    return tensorfound;
+}
+
 size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
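
gguf_find_tensor does a linear scan over the tensor infos and returns -1 when the name is missing, so callers doing repeated lookups may want to cache the index. A small usage sketch, assuming a context already loaded with gguf_init_from_file (the tensor name is hypothetical):

    #include <cstdio>
    #include "ggml.h"

    // hedged sketch: look up one tensor by name in an already-loaded context
    static void print_tensor_offset(struct gguf_context * ctx) {
        const int idx = gguf_find_tensor(ctx, "output.weight"); // hypothetical name
        if (idx >= 0) {
            printf("%s: offset = %zu\n", gguf_get_tensor_name(ctx, idx), gguf_get_tensor_offset(ctx, idx));
        }
    }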
@@ -19269,12 +19285,9 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) {
     }
 }
 
-void gguf_add_tensor_ex(
-        struct gguf_context * ctx,
-        const struct ggml_tensor * tensor,
-        enum ggml_type type,
-        const void * data,
-        size_t size) {
+void gguf_add_tensor(
+        struct gguf_context * ctx,
+        const struct ggml_tensor * tensor) {
     const int idx = ctx->header.n_tensors;
     ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info));
 
@@ -19290,10 +19303,10 @@ void gguf_add_tensor_ex(
         ctx->infos[idx].ne[i] = tensor->ne[i];
     }
 
-    ctx->infos[idx].type   = type;
+    ctx->infos[idx].type   = tensor->type;
     ctx->infos[idx].offset = 0;
-    ctx->infos[idx].data   = data;
-    ctx->infos[idx].size   = size;
+    ctx->infos[idx].data   = tensor->data;
+    ctx->infos[idx].size   = ggml_nbytes(tensor);
 
     if (ctx->header.n_tensors > 0) {
         ctx->infos[idx].offset = ctx->infos[idx - 1].offset + GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
@@ -19302,52 +19315,115 @@ void gguf_add_tensor_ex(
     ctx->header.n_tensors++;
 }
 
-void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor) {
-    gguf_add_tensor_ex(ctx, tensor, tensor->type, tensor->data, ggml_nbytes(tensor));
-}
-
-static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
-    fwrite(&val->n, sizeof(val->n), 1, file);
-    fwrite(val->data, sizeof(char), val->n, file);
-}
-
-static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
-    fwrite(val, sizeof(char), size, file);
-}
-
-void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
-    FILE * file = fopen(fname, "wb");
-    if (!file) {
-        GGML_ASSERT(false && "failed to open file for writing");
-    }
-
+void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].type = type;
+}
+
+void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size) {
+    const int idx = gguf_find_tensor(ctx, name);
+    if (idx < 0) {
+        GGML_ASSERT(false && "tensor not found");
+    }
+
+    ctx->infos[idx].data = data;
+    ctx->infos[idx].size = size;
+
+    // update offsets
+    for (uint32_t i = idx + 1; i < ctx->header.n_tensors; ++i) {
+        ctx->infos[i].offset = ctx->infos[i - 1].offset + GGML_PAD(ctx->infos[i - 1].size, ctx->alignment);
+    }
+}
+
+//static void gguf_fwrite_str(FILE * file, const struct gguf_str * val) {
+//    fwrite(&val->n, sizeof(val->n), 1, file);
+//    fwrite(val->data, sizeof(char), val->n, file);
+//}
+//
+//static void gguf_fwrite_el(FILE * file, const void * val, size_t size) {
+//    fwrite(val, sizeof(char), size, file);
+//}
+
+struct gguf_buf {
+    void * data;
+    size_t size;
+    size_t offset;
+};
+
+static struct gguf_buf gguf_buf_init(size_t size) {
+    struct gguf_buf buf = {
+        /*buf.data   =*/ size == 0 ? NULL : malloc(size),
+        /*buf.size   =*/ size,
+        /*buf.offset =*/ 0,
+    };
+
+    return buf;
+}
+
+static void gguf_buf_free(struct gguf_buf buf) {
+    if (buf.data) {
+        free(buf.data);
+    }
+}
+
+static void gguf_buf_grow(struct gguf_buf * buf, size_t size) {
+    if (buf->offset + size > buf->size) {
+        buf->size = 1.5*(buf->offset + size);
+        if (buf->data) {
+            buf->data = realloc(buf->data, buf->size);
+        }
+    }
+}
+
+static void gguf_bwrite_str(struct gguf_buf * buf, const struct gguf_str * val) {
+    gguf_buf_grow(buf, sizeof(val->n) + val->n);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, &val->n, sizeof(val->n));
+    buf->offset += sizeof(val->n);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, val->data, val->n);
+    buf->offset += val->n;
+}
+
+static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_size) {
+    gguf_buf_grow(buf, el_size);
+
+    buf->data && memcpy((char *) buf->data + buf->offset, val, el_size);
+    buf->offset += el_size;
+}
+
+static void gguf_write_to_buf(struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) {
     // write header
-    gguf_fwrite_el(file, &ctx->header.magic,     sizeof(ctx->header.magic));
-    gguf_fwrite_el(file, &ctx->header.version,   sizeof(ctx->header.version));
-    gguf_fwrite_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
-    gguf_fwrite_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
+    gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
+    gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
+    gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
+    gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
 
     // write key-value pairs
     for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
         struct gguf_kv * kv = &ctx->kv[i];
 
-        gguf_fwrite_str(file, &kv->key);
-        gguf_fwrite_el (file, &kv->type, sizeof(kv->type));
+        gguf_bwrite_str(buf, &kv->key);
+        gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
 
         switch (kv->type) {
-            case GGUF_TYPE_UINT8:   gguf_fwrite_el (file, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
-            case GGUF_TYPE_INT8:    gguf_fwrite_el (file, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
-            case GGUF_TYPE_UINT16:  gguf_fwrite_el (file, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
-            case GGUF_TYPE_INT16:   gguf_fwrite_el (file, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
-            case GGUF_TYPE_UINT32:  gguf_fwrite_el (file, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
-            case GGUF_TYPE_INT32:   gguf_fwrite_el (file, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
-            case GGUF_TYPE_FLOAT32: gguf_fwrite_el (file, &kv->value.float32, sizeof(kv->value.float32)); break;
-            case GGUF_TYPE_BOOL:    gguf_fwrite_el (file, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
-            case GGUF_TYPE_STRING:  gguf_fwrite_str(file, &kv->value.str                               ); break;
+            case GGUF_TYPE_UINT8:   gguf_bwrite_el (buf, &kv->value.uint8,   sizeof(kv->value.uint8)  ); break;
+            case GGUF_TYPE_INT8:    gguf_bwrite_el (buf, &kv->value.int8,    sizeof(kv->value.int8)   ); break;
+            case GGUF_TYPE_UINT16:  gguf_bwrite_el (buf, &kv->value.uint16,  sizeof(kv->value.uint16) ); break;
+            case GGUF_TYPE_INT16:   gguf_bwrite_el (buf, &kv->value.int16,   sizeof(kv->value.int16)  ); break;
+            case GGUF_TYPE_UINT32:  gguf_bwrite_el (buf, &kv->value.uint32,  sizeof(kv->value.uint32) ); break;
+            case GGUF_TYPE_INT32:   gguf_bwrite_el (buf, &kv->value.int32,   sizeof(kv->value.int32)  ); break;
+            case GGUF_TYPE_FLOAT32: gguf_bwrite_el (buf, &kv->value.float32, sizeof(kv->value.float32)); break;
+            case GGUF_TYPE_BOOL:    gguf_bwrite_el (buf, &kv->value.bool_,   sizeof(kv->value.bool_)  ); break;
+            case GGUF_TYPE_STRING:  gguf_bwrite_str(buf, &kv->value.str                               ); break;
             case GGUF_TYPE_ARRAY:
                 {
-                    gguf_fwrite_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type));
-                    gguf_fwrite_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
+                    gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
+                    gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n)   );
 
                     switch (kv->value.arr.type) {
                         case GGUF_TYPE_UINT8:
@@ -19359,12 +19435,12 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
                        case GGUF_TYPE_FLOAT32:
                        case GGUF_TYPE_BOOL:
                            {
-                                gguf_fwrite_el(file, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
+                                gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * GGUF_TYPE_SIZE[kv->value.arr.type]);
                            } break;
                        case GGUF_TYPE_STRING:
                            {
                                for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
-                                    gguf_fwrite_str(file, &((struct gguf_str *) kv->value.arr.data)[j]);
+                                    gguf_bwrite_str(buf, &((struct gguf_str *) kv->value.arr.data)[j]);
                                }
                            } break;
                        case GGUF_TYPE_ARRAY:
@@ -19379,28 +19455,32 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
     for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
         struct gguf_tensor_info * info = &ctx->infos[i];
 
-        gguf_fwrite_str(file, &info->name);
-        gguf_fwrite_el (file, &info->n_dims, sizeof(info->n_dims));
+        gguf_bwrite_str(buf, &info->name);
+        gguf_bwrite_el (buf, &info->n_dims, sizeof(info->n_dims));
         for (uint32_t j = 0; j < info->n_dims; ++j) {
-            gguf_fwrite_el(file, &info->ne[j], sizeof(info->ne[j]));
+            gguf_bwrite_el(buf, &info->ne[j], sizeof(info->ne[j]));
         }
-        gguf_fwrite_el (file, &info->type,   sizeof(info->type));
-        gguf_fwrite_el (file, &info->offset, sizeof(info->offset));
+        gguf_bwrite_el(buf, &info->type,   sizeof(info->type));
+        gguf_bwrite_el(buf, &info->offset, sizeof(info->offset));
     }
 
     // we require the data section to be aligned, so take into account any padding
     {
-        const size_t offset = ftell(file);
+        const size_t offset = buf->offset;
         const size_t offset_pad = GGML_PAD(offset, ctx->alignment);
 
         if (offset_pad != offset) {
             uint8_t pad = 0;
             for (size_t i = 0; i < offset_pad - offset; ++i) {
-                gguf_fwrite_el(file, &pad, sizeof(pad));
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
             }
         }
     }
 
+    if (only_meta) {
+        return;
+    }
+
     size_t offset = 0;
 
     // write tensor data
@@ -19410,12 +19490,12 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
         const size_t size     = info->size;
         const size_t size_pad = GGML_PAD(size, ctx->alignment);
 
-        gguf_fwrite_el(file, info->data, size);
+        gguf_bwrite_el(buf, info->data, size);
 
         if (size_pad != size) {
             uint8_t pad = 0;
             for (size_t j = 0; j < size_pad - size; ++j) {
-                gguf_fwrite_el(file, &pad, sizeof(pad));
+                gguf_bwrite_el(buf, &pad, sizeof(pad));
             }
         }
@@ -19423,10 +19503,44 @@ void gguf_write_to_file(struct gguf_context * ctx, const char * fname) {
         offset += size_pad;
     }
+}
+
+void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta) {
+    FILE * file = fopen(fname, "wb");
+    if (!file) {
+        GGML_ASSERT(false && "failed to open file for writing");
+    }
+
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, only_meta);
+
+    fwrite(buf.data, 1, buf.offset, file);
+
+    gguf_buf_free(buf);
 
     fclose(file);
 }
 
+size_t gguf_get_meta_size(struct gguf_context * ctx) {
+    // no allocs - only compute size
+    struct gguf_buf buf = gguf_buf_init(0);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    return buf.offset;
+}
+
+void gguf_get_meta_data(struct gguf_context * ctx, void * data) {
+    struct gguf_buf buf = gguf_buf_init(16*1024);
+
+    gguf_write_to_buf(ctx, &buf, true);
+
+    memcpy(data, buf.data, buf.offset);
+
+    gguf_buf_free(buf);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 
 int ggml_cpu_has_avx(void) {
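
A detail worth noting in gguf_bwrite_str and gguf_bwrite_el: the memcpy is guarded by `buf->data &&`, but buf->offset always advances. A buffer created with gguf_buf_init(0) therefore acts as a pure byte counter, which is exactly how gguf_get_meta_size measures the meta data without allocating. A minimal standalone sketch of the same pattern (the byte_sink and sink_write names are invented for this illustration):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // measuring sink: with data == nullptr it only counts bytes, mirroring how
    // gguf_get_meta_size() reuses the gguf_write_to_buf() path with a 0-size buffer
    struct byte_sink {
        void * data;   // nullptr => dry run, only measure
        size_t offset; // bytes written (real or simulated)
    };

    static void sink_write(byte_sink * s, const void * val, size_t size) {
        if (s->data) {
            memcpy((char *) s->data + s->offset, val, size);
        }
        s->offset += size; // advance even when only measuring
    }

    int main() {
        const uint32_t magic = 0x46554747; // the bytes "GGUF"

        byte_sink meter = { nullptr, 0 };
        sink_write(&meter, &magic, sizeof(magic));

        printf("meta pass would need %zu bytes\n", meter.offset); // prints 4
        return 0;
    }

The commit's `buf->data && memcpy(...)` is the same guard written as a short-circuit expression; an explicit if, as above, says the same thing more plainly.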

ggml.h (37 changed lines)

@@ -1712,7 +1712,6 @@ extern "C" {
     // gguf
     //
 
-    // TODO: can be removed if the API is extended for writing
     enum gguf_type {
         GGUF_TYPE_UINT8  = 0,
         GGUF_TYPE_INT8   = 1,
@@ -1739,6 +1738,7 @@ extern "C" {
     GGML_API struct gguf_context * gguf_init_empty(void);
     GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
     //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
     GGML_API void gguf_free(struct gguf_context * ctx);
 
     GGML_API int gguf_get_version (struct gguf_context * ctx);
@@ -1770,6 +1770,7 @@ extern "C" {
     GGML_API const char * gguf_get_arr_str (struct gguf_context * ctx, int key_id, int i);
 
     GGML_API int    gguf_get_n_tensors    (struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(struct gguf_context * ctx, int i);
     GGML_API char * gguf_get_tensor_name  (struct gguf_context * ctx, int i);
@@ -1789,17 +1790,35 @@ extern "C" {
     // set or add KV pairs from another context
     GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
 
+    // manage tensor info
     GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
 
-    // same as gguf_add_tensor, but allows to override tensor data
-    GGML_API void gguf_add_tensor_ex(
-            struct gguf_context * ctx,
-            const struct ggml_tensor * tensor,
-            enum ggml_type type,
-            const void * data,
-            size_t size);
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(f, ...);
+    //   void * data = gguf_meta_get_meta_data(ctx);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(f, data, gguf_get_meta_size(ctx));
+    //   free(data);
+    //   fclose(f);
+    //
 
-    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname);
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(struct gguf_context * ctx, void * data);
 
     //
     // system info
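
The second usage comment above appears to carry two slips from the commit itself: `gguf_meta_get_meta_data` should presumably be `gguf_get_meta_data` (declared just below it), and fwrite takes the FILE * last, not first. A corrected sketch of the intended two-pass flow, assuming a populated ctx and a hypothetical tensor_data/tensor_size payload (error handling omitted):

    #include <cstdio>
    #include <cstdlib>
    #include "ggml.h"

    static void write_two_pass(struct gguf_context * ctx, const char * fname,
                               const void * tensor_data, size_t tensor_size) {
        FILE * f = fopen(fname, "wb");

        // pass 1: skip over the (not yet final) meta data and stream the tensor data
        const size_t meta_size = gguf_get_meta_size(ctx);
        fseek(f, (long) meta_size, SEEK_SET);
        fwrite(tensor_data, 1, tensor_size, f);

        // pass 2: render the meta data and backpatch it at the start of the file
        void * meta = malloc(meta_size);
        gguf_get_meta_data(ctx, meta);
        fseek(f, 0, SEEK_SET);
        fwrite(meta, 1, meta_size, f);

        free(meta);
        fclose(f);
    }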

llama.cpp

@@ -83,6 +83,13 @@ static std::string to_string(const T & val) {
     return ss.str();
 }
 
+static void zeros(std::ofstream & file, size_t n) {
+    char zero = 0;
+    for (size_t i = 0; i < n; ++i) {
+        file.write(&zero, 1);
+    }
+}
+
 #if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
 #include "ggml-alloc.h"
 #define LLAMA_USE_ALLOCATOR
@@ -3049,7 +3056,6 @@ static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, std::
     for (auto & worker : workers) {
         worker.join();
     }
-
 }
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
@@ -3087,6 +3093,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
 
+    const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
     // copy the KV pairs from the input file
@@ -3125,7 +3132,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<uint8_t> read_data;
     std::vector<uint8_t> work;
-    std::vector<std::vector<uint8_t>> work_map(model_loader->tensors_map.tensors.size());
+
+    for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
+        gguf_add_tensor(ctx_out, tensor.ggml_tensor);
+    }
+
+    std::ofstream fout(fname_out, std::ios::binary);
+
+    const size_t meta_size = gguf_get_meta_size(ctx_out);
+
+    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+
+    // placeholder for the meta data
+    ::zeros(fout, meta_size);
 
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
         read_data.resize(tensor.size);
@@ -3286,13 +3304,25 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         total_size_org += tensor.size;
         total_size_new += new_size;
 
-        // TODO: temp fix until we have stream support in gguf
-        work_map[idx - 1] = std::vector<uint8_t>((char *) new_data, (char *) new_data + new_size);
+        // update the gguf meta data as we go
+        gguf_set_tensor_type(ctx_out, tensor.name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_out, tensor.name.c_str(), new_data, new_size);
 
-        gguf_add_tensor_ex(ctx_out, tensor.ggml_tensor, new_type, work_map[idx - 1].data(), new_size);
+        // write tensor data + padding
+        fout.write((const char *) new_data, new_size);
+        zeros(fout, GGML_PAD(new_size, align) - new_size);
     }
 
-    gguf_write_to_file(ctx_out, fname_out.c_str());
+    // go back to beginning of file and write the updated meta data
+    {
+        fout.seekp(0);
+        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
+        gguf_get_meta_data(ctx_out, data.data());
+        fout.write((const char *) data.data(), data.size());
+    }
+
+    fout.close();
+
     gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
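
Taken together, the llama.cpp changes implement the placeholder-then-backpatch pattern described in the ggml.h comment, with std::ofstream instead of FILE *. A condensed, hedged sketch of the write order (quantized_tensor is an invented container standing in for the per-tensor results of the loop above):

    #include <cstdint>
    #include <fstream>
    #include <string>
    #include <vector>
    #include "ggml.h"

    // same role as the zeros() helper added above
    static void zeros(std::ofstream & file, size_t n) {
        char zero = 0;
        for (size_t i = 0; i < n; ++i) {
            file.write(&zero, 1);
        }
    }

    // invented container for the results of the quantization loop
    struct quantized_tensor {
        std::string          name;
        enum ggml_type       new_type;
        std::vector<uint8_t> data;
    };

    static void write_quantized(struct gguf_context * ctx_out, const std::string & fname_out,
                                const std::vector<quantized_tensor> & tensors) {
        const size_t align = GGUF_DEFAULT_ALIGNMENT;

        std::ofstream fout(fname_out, std::ios::binary);

        // 1) reserve a zeroed placeholder where the meta data will go
        zeros(fout, gguf_get_meta_size(ctx_out));

        // 2) stream tensor data + padding, keeping ctx_out in sync so the
        //    tensor offsets stored in the meta data stay correct
        for (const auto & t : tensors) {
            gguf_set_tensor_type(ctx_out, t.name.c_str(), t.new_type);
            gguf_set_tensor_data(ctx_out, t.name.c_str(), t.data.data(), t.data.size());

            fout.write((const char *) t.data.data(), t.data.size());
            zeros(fout, GGML_PAD(t.data.size(), align) - t.data.size());
        }

        // 3) the meta data now reflects the final types/sizes/offsets: backpatch it
        fout.seekp(0);
        std::vector<uint8_t> meta(gguf_get_meta_size(ctx_out));
        gguf_get_meta_data(ctx_out, meta.data());
        fout.write((const char *) meta.data(), meta.size());
    }

The backpatch is safe because the meta data size does not depend on the tensor payloads: gguf_set_tensor_type and gguf_set_tensor_data only change the values of fixed-size fields (type, size, offsets), so the placeholder reserved in step 1 is exactly the right size.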