From 420744467f7fd736031d10e2a756cf6a36543537 Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Sat, 14 Oct 2023 14:38:28 -0400
Subject: [PATCH] MPT : clone wte to output at load time

---
 convert-mpt-hf-to-gguf.py |  5 -----
 ggml.c                    |  9 +++++++--
 ggml.h                    |  2 ++
 llama.cpp                 | 29 +++++++++++++++++++++++++--
 4 files changed, 36 insertions(+), 9 deletions(-)

diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py
index 73a4932f7..5121b707b 100755
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@@ -197,11 +197,6 @@ for part_name in part_names:
 
     gguf_writer.add_tensor(new_name, data)
 
-    # note: MPT output is tied to (same as) wte in original model;
-    # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
-    if new_name == "token_embd.weight":
-        gguf_writer.add_tensor("output.weight", data)
-
 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
diff --git a/ggml.c b/ggml.c
index 630deb49d..7b6c7cb41 100644
--- a/ggml.c
+++ b/ggml.c
@@ -21167,10 +21167,11 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         // the ggml_tensor structs to the appropriate locations in the binary blob
 
         // compute the exact size needed for the new ggml_context
+        int n_tensors = ctx->header.n_tensors + params.extra_tensors;
         const size_t mem_size =
             params.no_alloc ?
-            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
-            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+            (n_tensors    )*ggml_tensor_overhead() :
+            (n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
 
         struct ggml_init_params pdata = {
             .mem_size   = mem_size,
@@ -21454,6 +21455,10 @@ size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
 
+void gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset) {
+    ctx->infos[i].offset = offset;
+}
+
 char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }
diff --git a/ggml.h b/ggml.h
index 6e35888e9..acf0863bd 100644
--- a/ggml.h
+++ b/ggml.h
@@ -1964,6 +1964,7 @@ extern "C" {
 
         // if not NULL, create a ggml_context and allocate the tensor data in it
         struct ggml_context ** ctx;
+        int extra_tensors;
     };
 
     GGML_API struct gguf_context * gguf_init_empty(void);
@@ -2006,6 +2007,7 @@ extern "C" {
     GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
     GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+    GGML_API void   gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset);
     GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
 
     // overrides existing values or adds a new one
diff --git a/llama.cpp b/llama.cpp
index 464b60690..189d1e036 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1621,8 +1621,9 @@ struct llama_model_loader {
 
     llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_meta,
+            /*.no_alloc      = */ true,
+            /*.ctx           = */ &ctx_meta,
+            /*.extra_tensors = */ 1,
         };
 
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
@@ -1912,6 +1913,25 @@ struct llama_model_loader {
             done_size += ggml_nbytes(cur);
         }
     }
+
+    // must be called before calc_sizes
+    void clone_tensor(const char * src_name, const char * dst_name) {
+        int src_idx = gguf_find_tensor(ctx_gguf, src_name);
+        GGML_ASSERT(src_idx >= 0);
+
+        struct ggml_tensor * src = ggml_get_tensor(ctx_meta, src_name);
+        size_t src_offset = gguf_get_tensor_offset(ctx_gguf, src_idx);
+
+        struct ggml_tensor * cur = ggml_new_tensor(ctx_meta, src->type, src->n_dims, src->ne);
+        GGML_ASSERT(cur);
+
+        ggml_set_name(cur, dst_name);
+        gguf_add_tensor(ctx_gguf, cur);
+        gguf_set_tensor_offset(ctx_gguf, n_tensors, src_offset);
+        n_tensors++;
+        n_elements += ggml_nelements(cur);
+        n_bytes    += ggml_nbytes(cur);
+    }
 };
 
 //
@@ -2304,6 +2324,11 @@ static void llm_load_tensors(
 
     model.n_gpu_layers = n_gpu_layers;
 
+    // MPT output is tied to (same as) wte in original model
+    if (model.arch == LLM_ARCH_MPT) {
+        ml.clone_tensor("token_embd.weight", "output.weight");
+    }
+
     size_t ctx_size;
     size_t mmapped_size;
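
Note (not part of the patch): a minimal caller-side sketch of how the new extra_tensors field and gguf_set_tensor_offset() might be used to alias one tensor's data under a second name, mirroring what llama_model_loader::clone_tensor() does above. The function name, the file name argument, and the error handling here are illustrative assumptions, not code from the patch.

#include "ggml.h"

// Sketch only: reserve metadata room for one extra tensor, then register
// "output.weight" as a second name for the data of "token_embd.weight".
static struct gguf_context * load_with_tied_output(const char * fname, struct ggml_context ** ctx_meta) {
    struct gguf_init_params params = {
        /*.no_alloc      = */ true,
        /*.ctx           = */ ctx_meta,
        /*.extra_tensors = */ 1,   // room for the cloned tensor struct
    };

    struct gguf_context * ctx_gguf = gguf_init_from_file(fname, params);
    if (!ctx_gguf) {
        return NULL;
    }

    // look up the source tensor and create a same-shaped clone under a new name
    const int src_idx = gguf_find_tensor(ctx_gguf, "token_embd.weight");
    struct ggml_tensor * src = ggml_get_tensor(*ctx_meta, "token_embd.weight");
    struct ggml_tensor * dst = ggml_new_tensor(*ctx_meta, src->type, src->n_dims, src->ne);

    ggml_set_name(dst, "output.weight");
    gguf_add_tensor(ctx_gguf, dst);

    // point the clone at the same file offset as the source tensor
    gguf_set_tensor_offset(ctx_gguf, gguf_get_n_tensors(ctx_gguf) - 1, gguf_get_tensor_offset(ctx_gguf, src_idx));

    return ctx_gguf;
}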