diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 3fb680360..34f6e6ff0 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2387,7 +2387,7 @@ extern "C" { GGML_API struct gguf_context * gguf_init_empty(void); GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params); - //GGML_API struct gguf_context * gguf_init_from_buffer(..); + GGML_API struct gguf_context * gguf_init_from_buffer(const char * buffer, size_t size, struct gguf_init_params params); GGML_API void gguf_free(struct gguf_context * ctx); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 6e2ebf283..8f27fd0f3 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -22015,6 +22015,13 @@ struct gguf_context { void * data; }; +struct gguf_src { + FILE * file; + // for reading gguf from a buffer instead of a file + const char * buffer; + size_t size; +}; + static size_t gguf_type_size(enum gguf_type type) { GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT); return GGUF_TYPE_SIZE[type]; @@ -22034,19 +22041,25 @@ static void gguf_tensor_info_sanitize(struct gguf_tensor_info * info) { GGML_ASSERT(INT64_MAX/info->ne[3] > info->ne[0]*info->ne[1]*info->ne[2]); } -static bool gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) { - const size_t n = fread(dst, 1, size, file); +static bool gguf_fread_el(struct gguf_src * src, void * dst, size_t size, size_t * offset) { + size_t n; + if (src->file) { + n = fread(dst, 1, size, src->file); + } else { + n = MIN(src->size - *offset, size); + memcpy(dst, src->buffer + *offset, n); + } *offset += n; return n == size; } -static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { +static bool gguf_fread_str(struct gguf_src * src, struct gguf_str * p, size_t * offset) { p->n = 0; p->data = NULL; bool ok = true; - ok = ok && gguf_fread_el(file, &p->n, sizeof(p->n), offset); + ok = ok && gguf_fread_el(src, &p->n, sizeof(p->n), offset); // early exit if string length is invalid, prevents from integer overflow if (p->n == SIZE_MAX) { @@ -22056,7 +22069,7 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { p->data = GGML_CALLOC(p->n + 1, 1); - ok = ok && gguf_fread_el(file, p->data, p->n, offset); + ok = ok && gguf_fread_el(src, p->data, p->n, offset); return ok; } @@ -22107,13 +22120,7 @@ struct gguf_context * gguf_init_empty(void) { return ctx; } -struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { - FILE * file = ggml_fopen(fname, "rb"); - if (!file) { - fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno)); - return NULL; - } - +static struct gguf_context * gguf_init_internal(struct gguf_src * src, struct gguf_init_params params) { // offset from start of file size_t offset = 0; @@ -22121,12 +22128,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // check the magic before making allocations { - gguf_fread_el(file, &magic, sizeof(magic), &offset); + gguf_fread_el(src, &magic, sizeof(magic), &offset); for (uint32_t i = 0; i < sizeof(magic); i++) { if (magic[i] != GGUF_MAGIC[i]) { fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]); - fclose(file); + if (src->file) fclose(src->file); return NULL; } } @@ -22144,13 +22151,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ctx->infos = NULL; ctx->data = NULL; - ok = ok && gguf_fread_el(file, &ctx->header.version, sizeof(ctx->header.version), &offset); - ok = ok && gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); - ok = ok && gguf_fread_el(file, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); + ok = ok && gguf_fread_el(src, &ctx->header.version, sizeof(ctx->header.version), &offset); + ok = ok && gguf_fread_el(src, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset); + ok = ok && gguf_fread_el(src, &ctx->header.n_kv, sizeof(ctx->header.n_kv), &offset); if (ctx->header.version == 1) { fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__); - fclose(file); + if (src->file) fclose(src->file); gguf_free(ctx); return NULL; } @@ -22163,7 +22170,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (!ok) { fprintf(stderr, "%s: failed to read header\n", __func__); - fclose(file); + if (src->file) fclose(src->file); gguf_free(ctx); return NULL; } @@ -22182,28 +22189,28 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p //fprintf(stderr, "%s: reading kv %d\n", __func__, i); - ok = ok && gguf_fread_str(file, &kv->key, &offset); - ok = ok && gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset); + ok = ok && gguf_fread_str(src, &kv->key, &offset); + ok = ok && gguf_fread_el (src, &kv->type, sizeof(kv->type), &offset); //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data); switch (kv->type) { - case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (file, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break; - case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (file, &kv->value.int8, sizeof(kv->value.int8), &offset); break; - case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (file, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break; - case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (file, &kv->value.int16, sizeof(kv->value.int16), &offset); break; - case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (file, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break; - case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (file, &kv->value.int32, sizeof(kv->value.int32), &offset); break; - case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break; - case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (file, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break; - case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (file, &kv->value.int64, sizeof(kv->value.int64), &offset); break; - case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break; - case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (file, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break; - case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(file, &kv->value.str, &offset); break; + case GGUF_TYPE_UINT8: ok = ok && gguf_fread_el (src, &kv->value.uint8, sizeof(kv->value.uint8), &offset); break; + case GGUF_TYPE_INT8: ok = ok && gguf_fread_el (src, &kv->value.int8, sizeof(kv->value.int8), &offset); break; + case GGUF_TYPE_UINT16: ok = ok && gguf_fread_el (src, &kv->value.uint16, sizeof(kv->value.uint16), &offset); break; + case GGUF_TYPE_INT16: ok = ok && gguf_fread_el (src, &kv->value.int16, sizeof(kv->value.int16), &offset); break; + case GGUF_TYPE_UINT32: ok = ok && gguf_fread_el (src, &kv->value.uint32, sizeof(kv->value.uint32), &offset); break; + case GGUF_TYPE_INT32: ok = ok && gguf_fread_el (src, &kv->value.int32, sizeof(kv->value.int32), &offset); break; + case GGUF_TYPE_FLOAT32: ok = ok && gguf_fread_el (src, &kv->value.float32, sizeof(kv->value.float32), &offset); break; + case GGUF_TYPE_UINT64: ok = ok && gguf_fread_el (src, &kv->value.uint64, sizeof(kv->value.uint64), &offset); break; + case GGUF_TYPE_INT64: ok = ok && gguf_fread_el (src, &kv->value.int64, sizeof(kv->value.int64), &offset); break; + case GGUF_TYPE_FLOAT64: ok = ok && gguf_fread_el (src, &kv->value.float64, sizeof(kv->value.float64), &offset); break; + case GGUF_TYPE_BOOL: ok = ok && gguf_fread_el (src, &kv->value.bool_, sizeof(kv->value.bool_), &offset); break; + case GGUF_TYPE_STRING: ok = ok && gguf_fread_str(src, &kv->value.str, &offset); break; case GGUF_TYPE_ARRAY: { - ok = ok && gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); - ok = ok && gguf_fread_el(file, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); + ok = ok && gguf_fread_el(src, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset); + ok = ok && gguf_fread_el(src, &kv->value.arr.n, sizeof(kv->value.arr.n), &offset); switch (kv->value.arr.type) { case GGUF_TYPE_UINT8: @@ -22221,21 +22228,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // prevent from integer overflow in the malloc below if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) { fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n); - fclose(file); + if (src->file) fclose(src->file); gguf_free(ctx); return NULL; } kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type)); - ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset); + ok = ok && gguf_fread_el(src, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset); } break; case GGUF_TYPE_STRING: { // prevent from integer overflow in the malloc below if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) { fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n); - fclose(file); + if (src->file) fclose(src->file); gguf_free(ctx); return NULL; } @@ -22243,7 +22250,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str)); for (uint64_t j = 0; j < kv->value.arr.n; ++j) { - ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); + ok = ok && gguf_fread_str(src, &((struct gguf_str *) kv->value.arr.data)[j], &offset); } } break; case GGUF_TYPE_ARRAY: @@ -22262,7 +22269,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (!ok) { fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); - fclose(file); + if (src->file) fclose(src->file); gguf_free(ctx); return NULL; } @@ -22279,17 +22286,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p info->ne[j] = 1; } - ok = ok && gguf_fread_str(file, &info->name, &offset); - ok = ok && gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims), &offset); + ok = ok && gguf_fread_str(src, &info->name, &offset); + ok = ok && gguf_fread_el (src, &info->n_dims, sizeof(info->n_dims), &offset); ok = ok && (info->n_dims <= GGML_MAX_DIMS); for (uint32_t j = 0; j < info->n_dims; ++j) { - ok = ok && gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset); + ok = ok && gguf_fread_el(src, &info->ne[j], sizeof(info->ne[j]), &offset); } - ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); - ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); + ok = ok && gguf_fread_el (src, &info->type, sizeof(info->type), &offset); + ok = ok && gguf_fread_el (src, &info->offset, sizeof(info->offset), &offset); // TODO: return an error instead of crashing with GGML_ASSERT gguf_tensor_info_sanitize(info); @@ -22304,7 +22311,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (!ok) { fprintf(stderr, "%s: failed to read tensor info\n", __func__); - fclose(file); + if (src->file) fclose(src->file); gguf_free(ctx); return NULL; } @@ -22324,7 +22331,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (offset_pad != 0) { offset += ctx->alignment - offset_pad; - fseek(file, offset, SEEK_SET); + if (src->file) fseek(src->file, offset, SEEK_SET); } } @@ -22346,7 +22353,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) { fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n", __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type)); - fclose(file); + if (src->file) fclose(src->file); gguf_free(ctx); return NULL; } @@ -22378,7 +22385,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p *params.ctx = ggml_init(pdata); if (*params.ctx == NULL) { fprintf(stderr, "%s: failed to initialize context\n", __func__); - fclose(file); + if (src->file) fclose(src->file); gguf_free(ctx); return NULL; } @@ -22393,11 +22400,11 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && data != NULL; // read the binary blob with the tensor data - ok = ok && gguf_fread_el(file, data->data, ctx->size, &offset); + ok = ok && gguf_fread_el(src, data->data, ctx->size, &offset); if (!ok) { fprintf(stderr, "%s: failed to read tensor data\n", __func__); - fclose(file); + if (src->file) fclose(src->file); ggml_free(ctx_data); gguf_free(ctx); return NULL; @@ -22436,7 +22443,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (!ok) { fprintf(stderr, "%s: failed to read the tensor data\n", __func__); - fclose(file); + if (src->file) fclose(src->file); ggml_free(ctx_data); gguf_free(ctx); return NULL; @@ -22445,11 +22452,29 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ggml_set_no_alloc(ctx_data, params.no_alloc); } - fclose(file); + if (src->file) fclose(src->file); return ctx; } +struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { + struct gguf_src src = {ggml_fopen(fname, "rb"), NULL, 0}; + if (!src.file) { + fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno)); + return NULL; + } + return gguf_init_internal(&src, params); +} + +struct gguf_context * gguf_init_from_buffer(const char * buffer, size_t size, struct gguf_init_params params) { + if (!buffer) { + fprintf(stderr, "%s: buffer cannot be null\n", __func__); + return NULL; + } + struct gguf_src src = {NULL, buffer, size}; + return gguf_init_internal(&src, params); +} + void gguf_free(struct gguf_context * ctx) { if (ctx == NULL) { return; diff --git a/include/llama.h b/include/llama.h index bfc37e88b..3d3d4f67b 100644 --- a/include/llama.h +++ b/include/llama.h @@ -416,6 +416,12 @@ extern "C" { // lora adapter struct llama_lora_adapter; + // to be used by llama_load_model_from_buffers + struct llama_model_shard_buffer { + const char * data; + size_t size; + }; + // Helpers for getting default parameters LLAMA_API struct llama_model_params llama_model_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void); @@ -441,7 +447,12 @@ extern "C" { LLAMA_API struct llama_model * llama_load_model_from_file( const char * path_model, - struct llama_model_params params); + struct llama_model_params params); + + LLAMA_API struct llama_model * llama_load_model_from_buffers( + struct llama_model_shard_buffer * shards, + size_t n_shards, + struct llama_model_params params); LLAMA_API void llama_free_model(struct llama_model * model); diff --git a/src/llama.cpp b/src/llama.cpp index c3669eb28..d958d6f82 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1696,9 +1696,14 @@ public: } #else // use FILE * so we don't have to re-open the file to mmap - FILE * fp; + FILE * fp = nullptr; size_t size; + // when a buffer is used instead of a real file, we store the pointer here + const char * buffer = nullptr; + // curr is used as replacement for tell() when file is loaded from buffer + size_t curr = 0; + llama_file(const char * fname, const char * mode) { fp = ggml_fopen(fname, mode); if (fp == NULL) { @@ -1709,7 +1714,12 @@ public: seek(0, SEEK_SET); } + llama_file(const char * buffer, size_t size) : size(size), buffer(buffer) {} + size_t tell() const { + if (buffer) { + return curr; + } #ifdef _WIN32 __int64 ret = _ftelli64(fp); #else @@ -1722,7 +1732,17 @@ public: return (size_t) ret; } - void seek(size_t offset, int whence) const { + void seek(size_t offset, int whence) { + if (buffer) { + if (whence == SEEK_END) { + curr = size; + } else if (whence == SEEK_SET) { + curr = offset; + } else { + throw std::runtime_error(format("invalid whence: %d", whence)); + } + return; + } #ifdef _WIN32 int ret = _fseeki64(fp, (__int64) offset, whence); #else @@ -1737,6 +1757,13 @@ public: if (len == 0) { return; } + if (buffer) { + if (curr + len > size) { + throw std::runtime_error("unexpectedly reached end of buffer"); + } + memcpy(ptr, buffer + curr, len); + return; + } errno = 0; std::size_t ret = std::fread(ptr, len, 1, fp); if (ferror(fp)) { @@ -1757,6 +1784,9 @@ public: if (len == 0) { return; } + if (buffer) { + throw std::runtime_error("cannot write to read-only buffer"); + } errno = 0; size_t ret = std::fwrite(ptr, len, 1, fp); if (ret != 1) { @@ -1777,9 +1807,15 @@ public: }; using llama_files = std::vector>; +struct llama_shard_src { + std::string fname; + std::vector buffers; +}; + struct llama_mmap { void * addr; size_t size; + bool file_is_buffer = false; llama_mmap(const llama_mmap &) = delete; @@ -1790,6 +1826,11 @@ struct llama_mmap { std::vector> mapped_fragments; llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { + if (file->buffer) { + // in-memory buffer doesn't need to be mapped + file_is_buffer = true; + return; + } size = file->size; int fd = fileno(file->fp); int flags = MAP_SHARED; @@ -1844,6 +1885,10 @@ struct llama_mmap { // partially unmap the file in the range [first, last) void unmap_fragment(size_t first, size_t last) { + if (file_is_buffer) { + // in-memory buffer doesn't need to be unmapped + return; + } // note: this function must not be called multiple times with overlapping ranges // otherwise, there is a risk of invalidating addresses that have been repurposed for other mappings int page_size = sysconf(_SC_PAGESIZE); @@ -1889,6 +1934,9 @@ struct llama_mmap { } ~llama_mmap() { + if (file_is_buffer) { + return; + } for (const auto & frag : mapped_fragments) { if (munmap((char *) addr + frag.first, frag.second - frag.first)) { LLAMA_LOG_WARN("warning: munmap failed: %s\n", strerror(errno)); @@ -1901,6 +1949,12 @@ struct llama_mmap { llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1, bool numa = false) { GGML_UNUSED(numa); + if (file->buffer) { + // in-memory buffer doesn't need to be mapped + file_is_buffer = true; + return; + } + size = file->size; HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); @@ -1952,6 +2006,9 @@ struct llama_mmap { } ~llama_mmap() { + if (file->buffer) { + return; + } if (!UnmapViewOfFile(addr)) { LLAMA_LOG_WARN("warning: UnmapViewOfFile failed: %s\n", llama_format_win_err(GetLastError()).c_str()); @@ -4292,7 +4349,7 @@ struct llama_model_loader { std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); - llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) { + llama_model_loader(const llama_shard_src & src, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -4304,21 +4361,32 @@ struct llama_model_loader { } } + bool file_is_buffer = src.fname.empty(); struct ggml_context * ctx = NULL; struct gguf_init_params params = { /*.no_alloc = */ true, /*.ctx = */ &ctx, }; - meta = gguf_init_from_file(fname.c_str(), params); + if (file_is_buffer) { + if (src.buffers.empty()) { + throw std::runtime_error("list of shard buffers must not be empty"); + } + meta = gguf_init_from_buffer(src.buffers[0]->data, src.buffers[0]->size, params); + } else { + meta = gguf_init_from_file(src.fname.c_str(), params); + } if (!meta) { - throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); + throw std::runtime_error(format( + "%s: failed to load model from %s\n", __func__, file_is_buffer ? "buffer" : src.fname.c_str())); } get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - files.emplace_back(new llama_file(fname.c_str(), "rb")); + files.emplace_back(file_is_buffer + ? new llama_file(src.buffers[0]->data, src.buffers[0]->size) + : new llama_file(src.fname.c_str(), "rb")); contexts.emplace_back(ctx); // Save tensors data offset of the main file. @@ -4338,9 +4406,16 @@ struct llama_model_loader { throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx)); } + if (n_split < src.buffers.size()) { + throw std::runtime_error(format("expecting %d buffers, but only have %d", n_split, (int) src.buffers.size())); + } + char split_prefix[PATH_MAX] = {0}; - if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) { - throw std::runtime_error(format("invalid split file: %s", fname.c_str())); + if (!file_is_buffer) { + int ret = llama_split_prefix(split_prefix, sizeof(split_prefix), src.fname.c_str(), idx, n_split); + if (!ret) { + throw std::runtime_error(format("invalid split file: %s", src.fname.c_str())); + } } if (trace > 0) { @@ -4349,18 +4424,28 @@ struct llama_model_loader { char split_path[PATH_MAX] = {0}; for (idx = 1; idx < n_split; idx++) { - llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); + if (file_is_buffer) { + if (idx >= src.buffers.size()) { + throw std::runtime_error(format("missing buffer for shard number %d", idx+1)); + } + } else { + llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); + } struct gguf_init_params split_params = { /*.no_alloc = */ true, /*.ctx = */ &ctx, }; - struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params); + struct gguf_context * ctx_gguf = file_is_buffer + ? gguf_init_from_buffer(src.buffers[idx]->data, src.buffers[idx]->size, split_params) + : gguf_init_from_file(split_path, split_params); if (!ctx_gguf) { throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path)); } - files.emplace_back(new llama_file(split_path, "rb")); + files.emplace_back(file_is_buffer + ? new llama_file(src.buffers[idx]->data, src.buffers[idx]->size) + : new llama_file(split_path, "rb")); contexts.emplace_back(ctx); // Save tensors data offset info of the shard. @@ -4403,7 +4488,7 @@ struct llama_model_loader { } LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", - __func__, n_kv, n_tensors, fname.c_str(), llama_file_version_name(fver)); + __func__, n_kv, n_tensors, file_is_buffer ? "buffer" : src.fname.c_str(), llama_file_version_name(fver)); // determine file type based on the number of tensors for each quantization and print meta data // TODO: make optional @@ -8584,9 +8669,9 @@ static bool llm_load_tensors( } // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback -static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { +static int llama_model_load_internal(const llama_shard_src & src, llama_model & model, llama_model_params & params) { try { - llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides); + llama_model_loader ml(src, params.use_mmap, params.check_tensors, params.kv_overrides); model.hparams.vocab_only = params.vocab_only; @@ -17274,7 +17359,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s auto v = (std::vector*)params->kv_overrides; kv_overrides = v->data(); } - llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides); + llama_shard_src src; + src.fname = fname_inp; + llama_model_loader ml(src, use_mmap, /*check_tensors*/ true, kv_overrides); ml.init_mappings(false); // no prefetching llama_model model; @@ -17996,9 +18083,9 @@ int64_t llama_time_us(void) { return ggml_time_us(); } -struct llama_model * llama_load_model_from_file( - const char * path_model, - struct llama_model_params params) { +static struct llama_model * llama_load_model_with_params( + const llama_shard_src & src, + struct llama_model_params params) { ggml_time_init(); llama_model * model = new llama_model; @@ -18030,7 +18117,7 @@ struct llama_model * llama_load_model_from_file( } model->rpc_servers.push_back(servers); } - int status = llama_model_load(path_model, *model, params); + int status = llama_model_load_internal(src, *model, params); GGML_ASSERT(status <= 0); if (status < 0) { if (status == -1) { @@ -18045,6 +18132,26 @@ struct llama_model * llama_load_model_from_file( return model; } +struct llama_model * llama_load_model_from_file( + const char * path_model, + struct llama_model_params params) { + llama_shard_src src; + src.fname = path_model; + return llama_load_model_with_params(src, params); +} + +struct llama_model * llama_load_model_from_buffers( + struct llama_model_shard_buffer * shards, + size_t n_shards, + struct llama_model_params params) { + llama_shard_src src; + src.buffers.reserve(n_shards); + for (size_t i = 0; i < n_shards; i++) { + src.buffers[i] = &shards[i]; + } + return llama_load_model_with_params(src, params); +} + void llama_free_model(struct llama_model * model) { delete model; }