From 382c0c6100df54df2b5f8e08a7bbd2396c53d223 Mon Sep 17 00:00:00 2001
From: Geeks-sid
Date: Wed, 29 Mar 2023 19:16:32 -0400
Subject: [PATCH] blacked unversioned-ggml-to-ggml

---
 convert-unversioned-ggml-to-ggml.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/convert-unversioned-ggml-to-ggml.py b/convert-unversioned-ggml-to-ggml.py
index 33b6243bd..1c1b13374 100644
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@@ -10,37 +10,43 @@ from sentencepiece import SentencePieceProcessor
 
 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
-    parser.add_argument('dir_model', help='directory containing ggml .bin files')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    parser = argparse.ArgumentParser(
+        description="Upgrade old ggml model files to the current format"
+    )
+    parser.add_argument("dir_model", help="directory containing ggml .bin files")
+    parser.add_argument("tokenizer_model", help="path to LLaMA tokenizer.model file")
     return parser.parse_args()
 
+
 def read_header(f_in):
     struct_fmt = "i" * (3 + len(HPARAMS))
     struct_size = struct.calcsize(struct_fmt)
     buf = f_in.read(struct_size)
     return struct.unpack(struct_fmt, buf)
 
+
 def write_header(f_out, header):
     (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
 
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
+    if magic != 0x67676D6C:
+        raise Exception("Invalid file magic. Must be an old style ggml file.")
 
     values = [
-        0x67676d66, # magic: ggml in hex
-        1, # file version
+        0x67676D66,  # magic: ggml in hex
+        1,  # file version
         vocab_size,
         dim,
         multiple_of,
         n_heads,
         n_layers,
         rot,
-        ftype
+        ftype,
     ]
     f_out.write(struct.pack("i" * len(values), *values))
 
+
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
@@ -60,12 +66,14 @@ def write_tokens(fout, tokenizer):
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(i)))
 
+
 def read_tokens(f_in, tokenizer):
     for i in range(tokenizer.vocab_size()):
         len_b = f_in.read(4)
         (length,) = struct.unpack("i", len_b)
         f_in.read(length)
 
+
 def copy_all_data(f_out, f_in):
     while True:
         buf = f_in.read(1024 * 1024)
@@ -73,9 +81,10 @@ def copy_all_data(f_out, f_in):
             break
         f_out.write(buf)
 
+
 def convert_one_file(path_in, tokenizer):
     path_tmp = f"{path_in}.tmp"
-    path_orig= f"{path_in}.orig"
+    path_orig = f"{path_in}.orig"
     print(f"converting {path_in}")
     with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
         write_header(f_out, read_header(f_in))
@@ -85,6 +94,7 @@ def convert_one_file(path_in, tokenizer):
     os.rename(path_in, path_orig)
     os.rename(path_tmp, path_in)
 
+
 def main():
     args = parse_args()
     files = []
@@ -96,5 +106,6 @@ def main():
     for file in files:
         convert_one_file(file, tokenizer)
 
+
 if __name__ == "__main__":
     main()
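-- 
For reference, this is a pure black reformatting: behavior and usage are
unchanged. Based on the argparse arguments visible in the hunks, the script
is invoked along these lines (the model paths below are placeholders, not
taken from the patch):

    python convert-unversioned-ggml-to-ggml.py models/7B models/tokenizer.model

As convert_one_file shows, each old-style ggml .bin file is rewritten in
place, with the original kept as a backup at <file>.orig.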