From 673ae1a17e6251984d32a055e89f649e9e0cf7bc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 17 Aug 2023 16:52:25 +0300
Subject: [PATCH] convert-new.py : convert script now works

---
 convert-llama-7b-pth-to-gguf.py |  2 +-
 convert-new.py                  | 41 +++++++++++++++++++++++++--------
 gguf.py                         | 33 +++++++-------------------
 llama.cpp                       |  4 ++++
 4 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py
index c4e425ee3..9afea8a7e 100644
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -298,7 +298,7 @@ for part_name in part_names:
         print( name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))
 
-        gguf_writer.write_tensor_to_file(data)
+        gguf_writer.write_tensor_data(data)
 
 
 gguf_writer.close()
 
diff --git a/convert-new.py b/convert-new.py
index b263e4400..4aaaa60d4 100755
--- a/convert-new.py
+++ b/convert-new.py
@@ -669,7 +669,6 @@ def lazy_load_file(path: Path) -> ModelPlus:
 In = TypeVar('In')
 Out = TypeVar('Out')
 
-
 def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
     '''Parallel map, but with backpressure.  If the caller doesn't call `next`
     fast enough, this will stop calling `func` at some point rather than
@@ -734,19 +733,35 @@ class OutputFile:
 
         # TODO: added / special tokens
 
+    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
+        n_elements = 1
+        for dim in tensor.shape:
+            n_elements *= dim
+        data_type = DATA_TYPE_TO_NUMPY[tensor.data_type]
+        data_nbytes = n_elements * data_type.itemsize
+        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes)
+
     def write_meta(self) -> None:
         self.gguf.write_header_to_file()
         self.gguf.write_kv_data_to_file()
 
+    def write_tensor_info(self) -> None:
+        self.gguf.write_ti_data_to_file()
+
     def close(self) -> None:
         self.gguf.close()
 
     @staticmethod
     def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab) -> None:
+        check_vocab_size(params, vocab)
+
         of = OutputFile(fname_out)
+
+        # meta data
         of.add_meta_arch(params)
         of.add_meta_vocab(vocab)
         of.write_meta()
+
         of.close()
 
     @staticmethod
@@ -754,22 +769,31 @@ class OutputFile:
         check_vocab_size(params, vocab)
 
         of = OutputFile(fname_out)
+
+        # meta data
         of.add_meta_arch(params)
         of.add_meta_vocab(vocab)
 
+        # tensor info
+        for name, lazy_tensor in model.items():
+            of.add_tensor_info(name, lazy_tensor)
+
+        of.write_meta()
+        of.write_tensor_info()
+
         def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
             name, lazy_tensor = item
             return lazy_tensor.load().to_ggml().ndarray
 
+        # tensor data
         ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
         for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
             size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
             padi = len(str(len(model)))
             print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
-            #of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
-            ndarray.tofile(of.fout)
-        of.fout.close()
+            of.gguf.write_tensor_data(ndarray)
+        of.close()
 
 
 def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
     wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
@@ -783,6 +807,9 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
 
     raise Exception(f"Unexpected combination of types: {name_to_type}")
 
+def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
+    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
+            for (name, tensor) in model.items()}
 
 def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     tmap = gguf.get_tensor_name_map(ARCH, params.n_layer)
@@ -808,12 +835,6 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
 
     return out
 
-
-def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
-    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
-            for (name, tensor) in model.items()}
-
-
 def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
     '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
     the nth path in the model.
diff --git a/gguf.py b/gguf.py
index 3c1964d81..55f45716a 100644
--- a/gguf.py
+++ b/gguf.py
@@ -70,23 +70,6 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
 # recommended mapping of model tensor names for storage in gguf
 #
-#LLAMA_TOKEN_EMBD    = "token_embd"
-#LLAMA_OUTPUT_NORM   = "output_norm"
-#LLAMA_OUTPUT        = "output"
-#LLAMA_ATTN_NORM     = "blk.{bid}.attn_norm"
-#LLAMA_ATTN_Q        = "blk.{bid}.attn_q"
-#LLAMA_ATTN_K        = "blk.{bid}.attn_k"
-#LLAMA_ATTN_V        = "blk.{bid}.attn_v"
-#LLAMA_ATTN_OUTPUT   = "blk.{bid}.attn_output"
-#LLAMA_FFN_NORM      = "blk.{bid}.ffn_norm"
-#LLAMA_FFN_GATE      = "blk.{bid}.ffn_gate"
-#LLAMA_FFN_DOWN      = "blk.{bid}.ffn_down"
-#LLAMA_FFN_UP        = "blk.{bid}.ffn_up"
-#
-#GPT_POS_EMBD        = "pos_embd"
-#
-#FALCON_ATTN_NORM_2  = "blk.{bid}.attn_norm_2"
-
 
 class MODEL_ARCH(IntEnum):
     LLAMA   = auto()
     FALCON  = auto()
@@ -117,15 +100,15 @@ class MODEL_TENSOR(IntEnum):
 MODEL_ARCH_NAMES = {
     MODEL_ARCH.LLAMA   : "llama",
     MODEL_ARCH.FALCON  : "falcon",
-    MODEL_ARCH.GPT2    : "gpt-2",
-    MODEL_ARCH.GPTJ    : "gpt-j",
-    MODEL_ARCH.GPTNEOX : "gpt-neox",
+    MODEL_ARCH.GPT2    : "gpt2",
+    MODEL_ARCH.GPTJ    : "gptj",
+    MODEL_ARCH.GPTNEOX : "gptneox",
     MODEL_ARCH.MPT     : "mpt",
 }
 
 MODEL_TENSOR_NAMES = {
     MODEL_ARCH.LLAMA : {
-        MODEL_TENSOR.TOKEN_EMBD  : "tok_embd",
+        MODEL_TENSOR.TOKEN_EMBD  : "token_embd",
         MODEL_TENSOR.OUTPUT_NORM : "output_norm",
         MODEL_TENSOR.OUTPUT      : "output",
         MODEL_TENSOR.ROPE_FREQS  : "rope_freqs",
@@ -141,7 +124,7 @@ MODEL_TENSOR_NAMES = {
         MODEL_TENSOR.FFN_UP      : "blk.{bid}.ffn_up",
     },
     MODEL_ARCH.FALCON : {
-        MODEL_TENSOR.TOKEN_EMBD  : "tok_embd",
+        MODEL_TENSOR.TOKEN_EMBD  : "token_embd",
         MODEL_TENSOR.OUTPUT_NORM : "output_norm",
         MODEL_TENSOR.OUTPUT      : "output",
         MODEL_TENSOR.ATTN_NORM   : "blk.{bid}.attn_norm",
@@ -495,7 +478,7 @@ class GGUFWriter:
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1
 
-    def write_tensor_to_file(self, tensor: np.ndarray):
+    def write_tensor_data(self, tensor: np.ndarray):
         pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
         if pad != 0:
             self.fout.write(bytes([0] * pad))
@@ -650,7 +633,7 @@ if __name__ == "__main__":
     gguf_writer.write_header_to_file()
     gguf_writer.write_kv_data_to_file()
     gguf_writer.write_ti_data_to_file()
-    gguf_writer.write_tensor_to_file(tensor1)
-    gguf_writer.write_tensor_to_file(tensor2)
+    gguf_writer.write_tensor_data(tensor1)
+    gguf_writer.write_tensor_data(tensor2)
 
     gguf_writer.close()
diff --git a/llama.cpp b/llama.cpp
index fd3690432..8caa52b60 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1115,6 +1115,10 @@ struct llama_model_loader {
     struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend backend) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
+        if (cur == NULL) {
+            throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
+        }
+
         {
             bool is_ok = true;
             for (size_t i = 0; i < ne.size(); ++i) {