From 549fe807fddb10c7d898aea8c1a0b72f59c2d9ab Mon Sep 17 00:00:00 2001
From: Jared Van Bortel <jared@nomic.ai>
Date: Thu, 22 Feb 2024 16:02:39 -0500
Subject: [PATCH] mpt : do not duplicate token_embd.weight on disk

Previous attempt was #3626
---
 convert-hf-to-gguf.py | 5 -----
 llama.cpp             | 5 ++++-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 8630bbf29..18a720874 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -618,11 +618,6 @@ class MPTModel(Model):
 
             self.gguf_writer.add_tensor(new_name, data)
 
-            # note: MPT output is tied to (same as) wte in original model;
-            # for easier implementation in llama.cpp it's duplicated in GGUF, though :/
-            if new_name == "token_embd.weight":
-                self.gguf_writer.add_tensor("output.weight", data)
-
 
 class OrionModel(Model):
     def set_vocab(self):
diff --git a/llama.cpp b/llama.cpp
index 40dda265c..3b1eb6398 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -4056,7 +4056,10 @@ static bool llm_load_tensors(
                         model.output_norm   = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                         model.output_norm_b = ml.create_tensor(ctx_output,       tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);
 
-                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab});
+                        // same as tok_embd, duplicated to allow offloading
+                        model.output        = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD,  "weight"), {n_embd, n_vocab});
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
                     }
 
                     for (int i = 0; i < n_layer; ++i) {