MPT : clone wte to output at load time
This commit is contained in:
parent
b577e6374e
commit
420744467f
4 changed files with 36 additions and 9 deletions
|
@@ -197,11 +197,6 @@ for part_name in part_names:
|
|||
|
||||
gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
# note: MPT output is tied to (same as) wte in original model;
|
||||
# for easier implementation in llama.cpp it's duplicated in GGUF, though :/
|
||||
if new_name == "token_embd.weight":
|
||||
gguf_writer.add_tensor("output.weight", data)
|
||||
|
||||
print("gguf: write header")
|
||||
gguf_writer.write_header_to_file()
|
||||
print("gguf: write metadata")
|
||||
|
|
9
ggml.c
9
ggml.c
|
@@ -21167,10 +21167,11 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||
// the ggml_tensor structs to the appropriate locations in the binary blob
|
||||
|
||||
// compute the exact size needed for the new ggml_context
|
||||
int n_tensors = ctx->header.n_tensors + params.extra_tensors;
|
||||
const size_t mem_size =
|
||||
params.no_alloc ?
|
||||
(ctx->header.n_tensors )*ggml_tensor_overhead() :
|
||||
(ctx->header.n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
|
||||
(n_tensors )*ggml_tensor_overhead() :
|
||||
(n_tensors + 1)*ggml_tensor_overhead() + ctx->size;
|
||||
|
||||
struct ggml_init_params pdata = {
|
||||
.mem_size = mem_size,
|
||||
|
@@ -21454,6 +21455,10 @@ size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i) {
|
|||
return ctx->infos[i].offset;
|
||||
}
|
||||
|
||||
void gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset) {
|
||||
ctx->infos[i].offset = offset;
|
||||
}
|
||||
|
||||
char * gguf_get_tensor_name(const struct gguf_context * ctx, int i) {
|
||||
return ctx->infos[i].name.data;
|
||||
}
|
||||
|
|
2
ggml.h
2
ggml.h
|
@@ -1964,6 +1964,7 @@ extern "C" {
|
|||
|
||||
// if not NULL, create a ggml_context and allocate the tensor data in it
|
||||
struct ggml_context ** ctx;
|
||||
int extra_tensors;
|
||||
};
|
||||
|
||||
GGML_API struct gguf_context * gguf_init_empty(void);
|
||||
|
@@ -2006,6 +2007,1 @@ extern "C" {
|
|||
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
||||
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
|
||||
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
|
||||
GGML_API void gguf_set_tensor_offset(const struct gguf_context * ctx, int i, size_t offset);
|
||||
GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
|
||||
|
||||
// overrides existing values or adds a new one
|
||||
|
|
25
llama.cpp
25
llama.cpp
|
@@ -1623,6 +1623,7 @@ struct llama_model_loader {
|
|||
struct gguf_init_params params = {
|
||||
/*.no_alloc = */ true,
|
||||
/*.ctx = */ &ctx_meta,
|
||||
/*.extra_tensors = */ 1,
|
||||
};
|
||||
|
||||
ctx_gguf = gguf_init_from_file(fname.c_str(), params);
|
||||
|
@@ -1912,6 +1913,25 @@ struct llama_model_loader {
|
|||
done_size += ggml_nbytes(cur);
|
||||
}
|
||||
}
|
||||
|
||||
// must be called before calc_sizes
|
||||
void clone_tensor(const char * src_name, const char * dst_name) {
|
||||
int src_idx = gguf_find_tensor(ctx_gguf, src_name);
|
||||
GGML_ASSERT(src_idx >= 0);
|
||||
|
||||
struct ggml_tensor * src = ggml_get_tensor(ctx_meta, src_name);
|
||||
size_t src_offset = gguf_get_tensor_offset(ctx_gguf, src_idx);
|
||||
|
||||
struct ggml_tensor * cur = ggml_new_tensor(ctx_meta, src->type, src->n_dims, src->ne);
|
||||
GGML_ASSERT(cur);
|
||||
|
||||
ggml_set_name(cur, dst_name);
|
||||
gguf_add_tensor(ctx_gguf, cur);
|
||||
gguf_set_tensor_offset(ctx_gguf, n_tensors, src_offset);
|
||||
n_tensors++;
|
||||
n_elements += ggml_nelements(cur);
|
||||
n_bytes += ggml_nbytes(cur);
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
|
@@ -2304,6 +2324,11 @@ static void llm_load_tensors(
|
|||
|
||||
model.n_gpu_layers = n_gpu_layers;
|
||||
|
||||
// MPT output is tied to (same as) wte in original model
|
||||
if (model.arch == LLM_ARCH_MPT) {
|
||||
ml.clone_tensor("token_embd.weight", "output.weight");
|
||||
}
|
||||
|
||||
size_t ctx_size;
|
||||
size_t mmapped_size;
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue