MPT : clone wte to output at load time
This commit is contained in:
parent
b577e6374e
commit
420744467f
4 changed files with 36 additions and 9 deletions
|
@@ -197,11 +197,6 @@ for part_name in part_names:
|
|||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
# note: MPT output is tied to (same as) wte in original model;
|
||||
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
|
||||
if new_name == "token_embd.weight":
|
||||
gguf_writer.add_tensor("output.weight", data)
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
|
|
9
ggml.c
9
ggml.c
|
@@ -21167,10 +21167,11 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
// the ggml_tensor structs to the appropriate locations in the binary blob
|
||||
|
||||
// compute the exact size needed for the new ggml_context
|
||||
int n_tensors = ctx->header.n_tensors + params.extra_tensors;
|
||||
const size_t mem_size =
|
||||
params.no_alloc ?
|
||||
(ctx->header.n_tensors )*ggml_tensor_overhead() :
|
||||
(ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
|
||||
(n_tensors )*ggml_tensor_overhead() :
|
||||
(n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
|
||||
|
||||
struct ggml_init_params pdata = {
|
||||
.mem_size = mem_size,
|
||||
|
@@ -21454,6 +21455,10 @@ size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
|
|||
return ctx->infos[i].offset;
|
||||
}
|
||||
|
||||
void gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset) {
|
||||
ctx->infos[i].offset = offset;
|
||||
}
|
||||
|
||||
char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
|
||||
return ctx->infos[i].name.data;
|
||||
}
|
||||
|
|
2
ggml.h
2
ggml.h
|
@@ -1964,6 +1964,7 @@ extern "C" {
|
|||
|
||||
// if not NULL, create a ggml_context and allocate the tensor data in it
|
||||
struct ggml_context ** ctx;
|
||||
int extra_tensors;
|
||||
};
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||
|
@@ -2006,6 +2007,1 @@ extern "C" {
|
|||
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
||||
GGML_API void gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset);
|
||||
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
||||
|
||||
// overrides existing values or adds a new one
|
||||
|
|
25
llama.cpp
25
llama.cpp
|
@@ -1623,6 +1623,7 @@ struct llama_model_loader {
|
|||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx_meta,
|
||||
/*.extra_tensors = */ 1,
|
||||
};
|
||||
|
||||
ctx_gguf = gguf_init_from_file(fname.c_str(), params);
|
||||
|
@@ -1912,6 +1913,25 @@ struct llama_model_loader {
|
|||
done_size += ggml_nbytes(cur);
|
||||
}
|
||||
}
|
||||
|
||||
// must be called before calc_sizes
|
||||
void clone_tensor(const char * src_name, const char * dst_name) {
|
||||
int src_idx = gguf_find_tensor(ctx_gguf, src_name);
|
||||
GGML_ASSERT(src_idx >= 0);
|
||||
|
||||
struct ggml_tensor * src = ggml_get_tensor(ctx_meta, src_name);
|
||||
size_t src_offset = gguf_get_tensor_offset(ctx_gguf, src_idx);
|
||||
|
||||
struct ggml_tensor * cur = ggml_new_tensor(ctx_meta, src->type, src->n_dims, src->ne);
|
||||
GGML_ASSERT(cur);
|
||||
|
||||
ggml_set_name(cur, dst_name);
|
||||
gguf_add_tensor(ctx_gguf, cur);
|
||||
gguf_set_tensor_offset(ctx_gguf, n_tensors, src_offset);
|
||||
n_tensors++;
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
|
@@ -2304,6 +2324,11 @@ static void llm_load_tensors(
|
|||
|
||||
model.n_gpu_layers = n_gpu_layers;
|
||||
|
||||
// MPT output is tied to (same as) wte in original model
|
||||
if (model.arch == LLM_ARCH_MPT) {
|
||||
ml.clone_tensor("token_embd.weight", "output.weight");
|
||||
}
|
||||
|
||||
size_t ctx_size;
|
||||
size_t mmapped_size;
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue