From 673ae1a17e6251984d32a055e89f649e9e0cf7bc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Thu, 17 Aug 2023 16:52:25 +0300
Subject: [PATCH] convert-new.py : convert script now works

---
 convert-llama-7b-pth-to-gguf.py |  2 +-
 convert-new.py                  | 41 +++++++++++++++++++++++++--------
 gguf.py                         | 33 +++++++-------------------
 llama.cpp                       |  4 ++++
 4 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py
index c4e425ee3..9afea8a7e 100644
--- a/convert-llama-7b-pth-to-gguf.py
+++ b/convert-llama-7b-pth-to-gguf.py
@@ -298,7 +298,7 @@ for part_name in part_names:
         print( name + ", shape " + str(len(data.shape)) + ", " + str(old_dtype) + " --> " + str(data.dtype))
 
-        gguf_writer.write_tensor_to_file(data)
+        gguf_writer.write_tensor_data(data)
 
 
 gguf_writer.close()
 
diff --git a/convert-new.py b/convert-new.py
index b263e4400..4aaaa60d4 100755
--- a/convert-new.py
+++ b/convert-new.py
@@ -669,7 +669,6 @@ def lazy_load_file(path: Path) -> ModelPlus:
 In = TypeVar('In')
 Out = TypeVar('Out')
 
-
 def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int) -> Iterable[Out]:
     '''Parallel map, but with backpressure.  If the caller doesn't call `next`
     fast enough, this will stop calling `func` at some point rather than
@@ -734,19 +733,35 @@ class OutputFile:
 
         # TODO: added / special tokens
 
+    def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
+        n_elements = 1
+        for dim in tensor.shape:
+            n_elements *= dim
+        data_type = DATA_TYPE_TO_NUMPY[tensor.data_type]
+        data_nbytes = n_elements * data_type.itemsize
+        self.gguf.add_tensor_info(name, tensor.shape, data_type, data_nbytes)
+
     def write_meta(self) -> None:
         self.gguf.write_header_to_file()
         self.gguf.write_kv_data_to_file()
 
+    def write_tensor_info(self) -> None:
+        self.gguf.write_ti_data_to_file()
+
     def close(self) -> None:
         self.gguf.close()
 
     @staticmethod
     def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab) -> None:
+        check_vocab_size(params, vocab)
+
         of = OutputFile(fname_out)
+
+        # meta data
         of.add_meta_arch(params)
         of.add_meta_vocab(vocab)
         of.write_meta()
+
         of.close()
 
     @staticmethod
@@ -754,22 +769,31 @@ class OutputFile:
         check_vocab_size(params, vocab)
 
         of = OutputFile(fname_out)
+
+        # meta data
         of.add_meta_arch(params)
         of.add_meta_vocab(vocab)
 
+        # tensor info
+        for name, lazy_tensor in model.items():
+            of.add_tensor_info(name, lazy_tensor)
+
+        of.write_meta()
+        of.write_tensor_info()
+
         def do_item(item: Tuple[str, LazyTensor]) -> NDArray:
             name, lazy_tensor = item
             return lazy_tensor.load().to_ggml().ndarray
 
+        # tensor data
         ndarrays = bounded_parallel_map(do_item, model.items(), concurrency=8)
         for i, ((name, lazy_tensor), ndarray) in enumerate(zip(model.items(), ndarrays)):
             size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
             padi = len(str(len(model)))
             print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type}")
-            #of.write_tensor_header(name, lazy_tensor.shape, lazy_tensor.data_type)
-            ndarray.tofile(of.fout)
-        of.fout.close()
+            of.gguf.write_tensor_data(ndarray)
+        of.close()
 
 
 def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
     wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
@@ -783,6 +807,9 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi
 
     raise Exception(f"Unexpected combination of types: {name_to_type}")
 
+def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
+    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
+            for (name, tensor) in model.items()}
 
 def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
     tmap = gguf.get_tensor_name_map(ARCH, params.n_layer)
@@ -808,12 +835,6 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
 
     return out
 
-
-def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
-    return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
-            for (name, tensor) in model.items()}
-
-
 def nth_multifile_path(path: Path, n: int) -> Optional[Path]:
     '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
     the nth path in the model.
diff --git a/gguf.py b/gguf.py
index 3c1964d81..55f45716a 100644
--- a/gguf.py
+++ b/gguf.py
@@ -70,23 +70,6 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
 # recommended mapping of model tensor names for storage in gguf
 #
-#LLAMA_TOKEN_EMBD    = "token_embd"
-#LLAMA_OUTPUT_NORM   = "output_norm"
-#LLAMA_OUTPUT        = "output"
-#LLAMA_ATTN_NORM     = "blk.{bid}.attn_norm"
-#LLAMA_ATTN_Q        = "blk.{bid}.attn_q"
-#LLAMA_ATTN_K        = "blk.{bid}.attn_k"
-#LLAMA_ATTN_V        = "blk.{bid}.attn_v"
-#LLAMA_ATTN_OUTPUT   = "blk.{bid}.attn_output"
-#LLAMA_FFN_NORM      = "blk.{bid}.ffn_norm"
-#LLAMA_FFN_GATE      = "blk.{bid}.ffn_gate"
-#LLAMA_FFN_DOWN      = "blk.{bid}.ffn_down"
-#LLAMA_FFN_UP        = "blk.{bid}.ffn_up"
-#
-#GPT_POS_EMBD        = "pos_embd"
-#
-#FALCON_ATTN_NORM_2  = "blk.{bid}.attn_norm_2"
-
 
 class MODEL_ARCH(IntEnum):
     LLAMA   = auto()
     FALCON  = auto()
@@ -117,15 +100,15 @@ class MODEL_TENSOR(IntEnum):
 MODEL_ARCH_NAMES = {
     MODEL_ARCH.LLAMA   : "llama",
     MODEL_ARCH.FALCON  : "falcon",
-    MODEL_ARCH.GPT2    : "gpt-2",
-    MODEL_ARCH.GPTJ    : "gpt-j",
-    MODEL_ARCH.GPTNEOX : "gpt-neox",
+    MODEL_ARCH.GPT2    : "gpt2",
+    MODEL_ARCH.GPTJ    : "gptj",
+    MODEL_ARCH.GPTNEOX : "gptneox",
     MODEL_ARCH.MPT     : "mpt",
 }
 
 MODEL_TENSOR_NAMES = {
     MODEL_ARCH.LLAMA : {
-        MODEL_TENSOR.TOKEN_EMBD  : "tok_embd",
+        MODEL_TENSOR.TOKEN_EMBD  : "token_embd",
         MODEL_TENSOR.OUTPUT_NORM : "output_norm",
         MODEL_TENSOR.OUTPUT      : "output",
         MODEL_TENSOR.ROPE_FREQS  : "rope_freqs",
@@ -141,7 +124,7 @@ MODEL_TENSOR_NAMES = {
         MODEL_TENSOR.FFN_UP      : "blk.{bid}.ffn_up",
     },
     MODEL_ARCH.FALCON : {
-        MODEL_TENSOR.TOKEN_EMBD  : "tok_embd",
+        MODEL_TENSOR.TOKEN_EMBD  : "token_embd",
         MODEL_TENSOR.OUTPUT_NORM : "output_norm",
         MODEL_TENSOR.OUTPUT      : "output",
         MODEL_TENSOR.ATTN_NORM   : "blk.{bid}.attn_norm",
@@ -495,7 +478,7 @@ class GGUFWriter:
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1
 
-    def write_tensor_to_file(self, tensor: np.ndarray):
+    def write_tensor_data(self, tensor: np.ndarray):
         pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell()
         if pad != 0:
             self.fout.write(bytes([0] * pad))
@@ -650,7 +633,7 @@ if __name__ == "__main__":
     gguf_writer.write_header_to_file()
     gguf_writer.write_kv_data_to_file()
     gguf_writer.write_ti_data_to_file()
-    gguf_writer.write_tensor_to_file(tensor1)
-    gguf_writer.write_tensor_to_file(tensor2)
+    gguf_writer.write_tensor_data(tensor1)
+    gguf_writer.write_tensor_data(tensor2)
 
     gguf_writer.close()
diff --git a/llama.cpp b/llama.cpp
index fd3690432..8caa52b60 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1115,6 +1115,10 @@ struct llama_model_loader {
     struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend backend) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
+        if (cur == NULL) {
+            throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
+        }
+
         {
             bool is_ok = true;
             for (size_t i = 0; i < ne.size(); ++i) {