MPT : clone wte to output at load time

Cebtenzzre 2023-10-14 14:38:28 -04:00
parent b577e6374e
commit 420744467f
4 changed files with 36 additions and 9 deletions

convert-mpt-hf-to-gguf.py

@@ -197,11 +197,6 @@ for part_name in part_names:
 
         gguf_writer.add_tensor(new_name, data)
 
-    # note: MPT output is tied to (same as) wte in original model;
-    # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
-    if new_name == "token_embd.weight":
-        gguf_writer.add_tensor("output.weight", data)
-
 print("gguf: write header")
 gguf_writer.write_header_to_file()
 print("gguf: write metadata")
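After this change the conversion script writes the embedding matrix only once: an MPT GGUF file contains token_embd.weight and no separate output.weight entry, and the alias is recreated at load time by the ggml and llama.cpp changes below. As a minimal sketch (not part of this commit), one could confirm this with the gguf API; the file name mpt.gguf is a placeholder:

// sketch: list the tensors of a converted MPT GGUF and check that
// "output.weight" is absent ("mpt.gguf" is a placeholder file name)
#include <stdio.h>
#include "ggml.h"

int main(void) {
    struct ggml_context * ctx_meta = NULL;
    struct gguf_init_params params = {
        /*.no_alloc      = */ true,
        /*.ctx           = */ &ctx_meta,
        /*.extra_tensors = */ 0,
    };
    struct gguf_context * ctx = gguf_init_from_file("mpt.gguf", params);
    if (!ctx) {
        return 1;
    }
    for (int i = 0; i < gguf_get_n_tensors(ctx); i++) {
        printf("%s\n", gguf_get_tensor_name(ctx, i));
    }
    // expected: -1, i.e. no such tensor in the file after this change
    printf("output.weight index: %d\n", gguf_find_tensor(ctx, "output.weight"));
    gguf_free(ctx);
    ggml_free(ctx_meta);
    return 0;
}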

ggml.c (9 changed lines)

@@ -21167,10 +21167,11 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         // the ggml_tensor structs to the appropriate locations in the binary blob
 
         // compute the exact size needed for the new ggml_context
+        int n_tensors = ctx->header.n_tensors + params.extra_tensors;
         const size_t mem_size =
             params.no_alloc ?
-            (ctx->header.n_tensors    )*ggml_tensor_overhead() :
-            (ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
+            (n_tensors    )*ggml_tensor_overhead() :
+            (n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
 
         struct ggml_init_params pdata = {
             .mem_size   = mem_size,
@@ -21454,6 +21455,10 @@ size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].offset;
 }
 
+void gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset) {
+    ctx->infos[i].offset = offset;
+}
+
 char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
     return ctx->infos[i].name.data;
 }

ggml.h (2 changed lines)

@@ -1964,6 +1964,7 @@ extern "C" {
 
         // if not NULL, create a ggml_context and allocate the tensor data in it
         struct ggml_context ** ctx;
+        int extra_tensors;
     };
 
     GGML_API struct gguf_context * gguf_init_empty(void);
@@ -2006,6 +2007,7 @@ extern "C" {
     GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
     GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+    GGML_API void   gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset);
    GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
 
     // overrides existing values or adds a new one
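Together these two declarations are the API surface that load-time cloning relies on: gguf_init_params.extra_tensors reserves room in the metadata context for tensors appended after the file has been parsed, and gguf_set_tensor_offset() lets the caller point such an appended tensor at data already present in the file. A rough sketch of that pattern outside the loader, mirroring the clone_tensor() helper added below; the file name and tensor names are placeholders:

// sketch: open a GGUF with one reserved tensor slot, then register
// "output.weight" as an alias of "token_embd.weight" backed by the
// same data offset
#include "ggml.h"

static struct gguf_context * open_with_alias(const char * fname) {
    struct ggml_context * ctx_meta = NULL;
    struct gguf_init_params params = {
        /*.no_alloc      = */ true,
        /*.ctx           = */ &ctx_meta,
        /*.extra_tensors = */ 1,   // room for the clone
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(fname, params);
    GGML_ASSERT(ctx_gguf);

    const int src_idx = gguf_find_tensor(ctx_gguf, "token_embd.weight");
    GGML_ASSERT(src_idx >= 0);

    // create a metadata-only tensor with the same type and shape
    struct ggml_tensor * src = ggml_get_tensor(ctx_meta, "token_embd.weight");
    struct ggml_tensor * dst = ggml_new_tensor(ctx_meta, src->type, src->n_dims, src->ne);
    ggml_set_name(dst, "output.weight");

    // append it to the gguf context and reuse the source's file offset
    gguf_add_tensor(ctx_gguf, dst);
    const int dst_idx = gguf_get_n_tensors(ctx_gguf) - 1;
    gguf_set_tensor_offset(ctx_gguf, dst_idx, gguf_get_tensor_offset(ctx_gguf, src_idx));

    return ctx_gguf;
}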

llama.cpp

@@ -1621,8 +1621,9 @@ struct llama_model_loader {
 
    llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
        struct gguf_init_params params = {
-            /*.no_alloc = */ true,
-            /*.ctx      = */ &ctx_meta,
+            /*.no_alloc      = */ true,
+            /*.ctx           = */ &ctx_meta,
+            /*.extra_tensors = */ 1,
        };
 
        ctx_gguf = gguf_init_from_file(fname.c_str(), params);
@@ -1912,6 +1913,25 @@ struct llama_model_loader {
            done_size += ggml_nbytes(cur);
        }
    }
+
+    // must be called before calc_sizes
+    void clone_tensor(const char * src_name, const char * dst_name) {
+        int src_idx = gguf_find_tensor(ctx_gguf, src_name);
+        GGML_ASSERT(src_idx >= 0);
+
+        struct ggml_tensor * src = ggml_get_tensor(ctx_meta, src_name);
+        size_t src_offset = gguf_get_tensor_offset(ctx_gguf, src_idx);
+
+        struct ggml_tensor * cur = ggml_new_tensor(ctx_meta, src->type, src->n_dims, src->ne);
+        GGML_ASSERT(cur);
+
+        ggml_set_name(cur, dst_name);
+        gguf_add_tensor(ctx_gguf, cur);
+        gguf_set_tensor_offset(ctx_gguf, n_tensors, src_offset);
+        n_tensors++;
+        n_elements += ggml_nelements(cur);
+        n_bytes    += ggml_nbytes(cur);
+    }
};
 
//
@@ -2304,6 +2324,11 @@ static void llm_load_tensors(
 
    model.n_gpu_layers = n_gpu_layers;
 
+    // MPT output is tied to (same as) wte in original model
+    if (model.arch == LLM_ARCH_MPT) {
+        ml.clone_tensor("token_embd.weight", "output.weight");
+    }
+
    size_t ctx_size;
    size_t mmapped_size;
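The net effect for MPT: llm_load_tensors() asks the loader to clone token_embd.weight into output.weight before sizes are calculated, so the rest of the loading code sees two tensor entries backed by the same region of the file. A small sanity check one could run against the loader's gguf context after the clone; ctx_gguf is assumed to be that context:

// sketch: after clone_tensor() has run, both names resolve and map to
// the same data offset in the file (ctx_gguf is the loader's gguf context)
static void check_alias(const struct gguf_context * ctx_gguf) {
    const int embd_idx = gguf_find_tensor(ctx_gguf, "token_embd.weight");
    const int out_idx  = gguf_find_tensor(ctx_gguf, "output.weight");
    GGML_ASSERT(embd_idx >= 0 && out_idx >= 0);
    GGML_ASSERT(gguf_get_tensor_offset(ctx_gguf, embd_idx) ==
                gguf_get_tensor_offset(ctx_gguf, out_idx));
}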