diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index dcef2f6a3..e61dcc941 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -1,4 +1,4 @@
-# Convert a LLaMA model checkpoint to a ggjt compatible file
+# Convert a LLaMA model checkpoint to a ggml compatible file
 #
 # Load the model using Torch
 # Iterate over all variables and write them to a binary file.
@@ -52,8 +52,8 @@
 }
 
 GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4 + QK//2,
-    GGML_TYPE_Q4_1: 4*2 + QK//2,
+    GGML_TYPE_Q4_0: 4 + QK/2,
+    GGML_TYPE_Q4_1: 4*2 + QK/2,
     GGML_TYPE_I8: 1,
     GGML_TYPE_I16: 2,
     GGML_TYPE_I32: 4,
@@ -245,9 +245,11 @@ def main():
         fname_model = f"{dir_model}/consolidated.00.pth"
         fname_out = f"{dir_model}/ggml-vocab.bin"
         print(f"Extracting only the vocab from '{fname_model}'\n")
+        model = torch.load(fname_model, map_location="cpu")
         with open(fname_out, "wb") as fout:
             write_header(fout, hparams, ftype)
             write_tokens(fout, tokenizer)
+        del model
         print(f"Done. Output file: {fname_out}\n")
 
         return
diff --git a/llama.cpp b/llama.cpp
index 878907185..3de1b3a7c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -347,15 +347,14 @@ static void munmap_file(void * addr, size_t length) {
 #endif
 }
 
-static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
+static bool report_bad_magic(const char *path) {
     fprintf(stderr,
-            "%s: invalid model file (bad magic [got %#x want %#x])\n"
-            "\tyou most likely need to regenerate your ggml files\n"
-            "\tthe benefit is you'll get 10-100x faster load times\n"
-            "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
-            "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
-            "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
-            path, got, want);
+            "%s: invalid model file (bad magic)\n"
+            "you most likely need to regenerate your ggml files\n"
+            "the benefit is you'll get 10-100x faster load times\n"
+            "see https://github.com/ggerganov/llama.cpp/issues/91\n"
+            "use convert-pth-to-ggml.py on your llama model files\n",
+            path);
     return false;
 }
 
@@ -398,7 +397,7 @@ static bool llama_model_load(
         return false;
     }
     if (magic != LLAMA_FILE_MAGIC) {
-        return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
+        return report_bad_magic(fname.c_str());
     }
 
     uint32_t format_version;
@@ -1313,7 +1312,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s
         return false;
    }
     if (magic != LLAMA_FILE_MAGIC) {
-        return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
+        return report_bad_magic(fname_inp.c_str());
     }
 
     fout.write((char *) &magic, sizeof(magic));
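For reference, the check that report_bad_magic() guards reduces to reading the first four bytes of the model file and comparing them against a known magic. Below is a minimal Python sketch of that check; the 'ggmf' and 'ggjt' values match the constants used by the migration script removed further down, while the unversioned 'ggml' value is an assumption added here for completeness, and the byte order mirrors the script's own struct packing.

import struct
import sys

MAGIC_GGML = 0x67676d6c  # unversioned files (assumed value, not taken from this diff)
MAGIC_GGMF = 0x67676d66  # versioned format prior to 2023-03-30
MAGIC_GGJT = 0x67676a74  # mmap-friendly format introduced by PR 613

def read_magic(path):
    # the converter scripts write the header with native (little-endian) struct packing
    with open(path, "rb") as f:
        (magic,) = struct.unpack("<I", f.read(4))
    return magic

if __name__ == "__main__":
    magic = read_magic(sys.argv[1])
    names = {MAGIC_GGML: "ggml", MAGIC_GGMF: "ggmf", MAGIC_GGJT: "ggjt"}
    print(names.get(magic, f"unknown magic {magic:#x}"))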
diff --git a/migrate-ggml-2023-03-30-pr613.py b/migrate-ggml-2023-03-30-pr613.py
deleted file mode 100644
index b6ef2476e..000000000
--- a/migrate-ggml-2023-03-30-pr613.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
-#
-# We caused a breaking change to the file format on 2023-03-30 in:
-#     https://github.com/ggerganov/llama.cpp/pull/613
-#
-# (1) If you still have the Meta LLaMA .pth files, then close this
-#     file now; you can just run `convert-pth-to-ggml.py` again to
-#     migrate to the new format. The tool is easier to use too. It
-#     isn't necessary anymore to manage split output files because
-#     the new format always combines things into a single file.
-#
-# (2) If you deleted the Meta LLaMA .pth files due to save on disk
-#     space, then this tool is intended to help you. Please check
-#     out the instructions below.
-#
-# USAGE
-#
-#   python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
-#
-# PREREQUISITES
-#
-#   pip install numpy
-#   cd llama.cpp
-#   make -j4
-#
-# EXAMPLE (7B MODEL)
-#
-#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
-#   python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
-#
-#   # check that it works
-#   ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
-#
-#   # you can delete the old files
-#   rm -f models/7B/ggml-model-f16.bin
-#   mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
-#
-# EXAMPLE (13B MODEL)
-#
-#   # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
-#   python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
-#
-#   # check that it works
-#   ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
-#
-#   # you can delete the old files
-#   rm -f models/13B/ggml-model-f16.bin*
-#   mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
-#
-
-import argparse
-import os
-import sys
-import json
-import struct
-import numpy as np
-
-QK = 32
-
-GGML_TYPE_Q4_0 = 0
-GGML_TYPE_Q4_1 = 1
-GGML_TYPE_I8 = 2
-GGML_TYPE_I16 = 3
-GGML_TYPE_I32 = 4
-GGML_TYPE_F16 = 5
-GGML_TYPE_F32 = 6
-
-WTYPE_NAMES = {
-    0: "F32",
-    1: "F16",
-    2: "Q4_0",
-    3: "Q4_1",
-}
-
-WTYPES = {
-    0: GGML_TYPE_F32,
-    1: GGML_TYPE_F16,
-    2: GGML_TYPE_Q4_0,
-    3: GGML_TYPE_Q4_1,
-}
-
-GGML_BLCK_SIZE = {
-    GGML_TYPE_Q4_0: QK,
-    GGML_TYPE_Q4_1: QK,
-    GGML_TYPE_I8: 1,
-    GGML_TYPE_I16: 1,
-    GGML_TYPE_I32: 1,
-    GGML_TYPE_F16: 1,
-    GGML_TYPE_F32: 1,
-}
-
-GGML_TYPE_SIZE = {
-    GGML_TYPE_Q4_0: 4 + QK//2,
-    GGML_TYPE_Q4_1: 4*2 + QK//2,
-    GGML_TYPE_I8: 1,
-    GGML_TYPE_I16: 2,
-    GGML_TYPE_I32: 4,
-    GGML_TYPE_F16: 2,
-    GGML_TYPE_F32: 4,
-}
-
-HPARAMS = [
-    'magic',   # int32
-    'version', # int32
-    'n_vocab', # int32
-    'n_embd',  # int32
-    'n_mult',  # int32
-    'n_head',  # int32
-    'n_layer', # int32
-    'n_rot',   # int32
-    'f16',     # int32
-]
-
-def read_hparams(fin):
-    struct_fmt = "i" * len(HPARAMS)
-    struct_size = struct.calcsize(struct_fmt)
-    buf = fin.read(struct_size)
-    ints = struct.unpack(struct_fmt, buf)
-    hparams = dict(zip(HPARAMS, ints))
-    return hparams
-
-def write_hparams(fout, hparams):
-    struct_fmt = "i" * len(HPARAMS)
-    struct_size = struct.calcsize(struct_fmt)
-    ints = [hparams[h] for h in HPARAMS]
-    fout.write(struct.pack(struct_fmt, *ints))
-
-def read_tokens(fin, hparams):
-    tokens = []
-    for i in range(hparams['n_vocab']):
-        len_b = fin.read(4)
-        (length,) = struct.unpack("i", len_b)
-        word = fin.read(length)
-        score_b = fin.read(4)
-        (score,) = struct.unpack("f", score_b)
-        tokens.append((word, score))
-    return tokens
-
-def write_tokens(fout, tokens):
-    for word, score in tokens:
-        fout.write(struct.pack("i", len(word)))
-        fout.write(word)
-        fout.write(struct.pack("f", score))
-
-def ggml_nelements(shape):
-    r = 1
-    for i in shape:
-        r *= i
-    return r
-
-def ggml_nbytes(shape, ftype):
-    x = ggml_nelements(shape)
-    t = WTYPES[ftype]
-    x *= GGML_TYPE_SIZE[t]
-    x //= GGML_BLCK_SIZE[t]
-    return x
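For reference, a short worked example of the byte accounting behind GGML_TYPE_SIZE, GGML_BLCK_SIZE and ggml_nbytes() for Q4_0 data: each block of QK = 32 weights stores a 4-byte float32 scale plus QK//2 = 16 bytes of packed 4-bit values, i.e. 20 bytes per 32 weights. The tensor shape in the sketch below is purely illustrative.

# Byte accounting for Q4_0, using the same constants as the script above.
QK = 32
Q4_0_TYPE_SIZE = 4 + QK // 2   # float32 scale + 16 bytes of packed 4-bit quants = 20
Q4_0_BLCK_SIZE = QK            # one block covers 32 weights

def q4_0_nbytes(shape):
    # same arithmetic as ggml_nelements()/ggml_nbytes(), specialised to Q4_0
    n = 1
    for d in shape:
        n *= d
    return n * Q4_0_TYPE_SIZE // Q4_0_BLCK_SIZE

# illustrative shape: a 4096 x 4096 Q4_0 tensor occupies 10,485,760 bytes
print(q4_0_nbytes([4096, 4096]))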
-
-def copy_tensors(fin, fout, part_id, n_parts):
-    while True:
-
-        b = fin.read(4)
-        if not b: break
-        (n_dims,) = struct.unpack("i", b)
-        b = fin.read(4)
-        (length,) = struct.unpack("i", b)
-        b = fin.read(4)
-        (ftype,) = struct.unpack("i", b)
-
-        assert n_dims in (1, 2)
-
-        partshape = list(range(n_dims))
-        for i in range(n_dims):
-            b = fin.read(4)
-            partshape[i] = struct.unpack("i", b)[0]
-        partshape = list(reversed(partshape))
-
-        name = fin.read(length)
-        data = fin.read(ggml_nbytes(partshape, ftype))
-
-        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
-        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
-
-        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
-
-        # determine dimension along which multipart tensor is sharded
-        #
-        # split_dim 0 regex:
-        #   - output.*
-        #   - layers.*.attention.wq.weight
-        #   - layers.*.attention.wk.weight
-        #   - layers.*.attention.wv.weight
-        #   - layers.*.feed_forward.w1.weight
-        #   - layers.*.feed_forward.w3.weight
-        #
-        # split_dim 1 regex:
-        #   - tok_embeddings.*
-        #   - layers.*.attention.wo.weight
-        #   - layers.*.feed_forward.w2.weight
-        #
-        if n_dims > 1:
-            split_dim = 1
-            if b"tok_embeddings" in name:
-                split_dim = 1
-            elif b"layers" in name:
-                if b"attention.wo.weight" in name:
-                    split_dim = 1
-                elif b"feed_forward.w2.weight" in name:
-                    split_dim = 1
-                else:
-                    split_dim = 0
-            elif b"output" in name:
-                split_dim = 0
-
-        # output tensor header
-        fullshape = list(partshape)
-        if n_dims > 1:
-            fullshape[split_dim] *= n_parts
-        fout.write(struct.pack("iii", n_dims, len(name), ftype))
-        for dim in reversed(fullshape):
-            fout.write(struct.pack("i", dim))
-        fout.write(name)
-
-        # ensure tensor data is aligned
-        tensor_data_offset = fout.tell()
-        while tensor_data_offset % QK != 0:
-            fout.write(struct.pack("B", 0))
-            tensor_data_offset += 1
-
-        # output unified mappable tensor data
-        if n_dims == 1 or n_parts == 1:
-            # copy tensor which we thankfully received in one piece
-            if part_id == 0:
-                fout.write(data)
-        elif split_dim == 0:
-            # reassemble multifile tensor containing some of the rows
-            rows_per_chunk = partshape[0]
-            current_row = part_id * rows_per_chunk
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset = current_row * bytes_per_row
-            fout.seek(tensor_data_offset + offset)
-            fout.write(data)
-        elif split_dim == 1:
-            # reassemble multifile tensor containing some of the cols
-            cols_per_chunk = partshape[1]
-            current_col = part_id * cols_per_chunk
-            bpr = partshape[1] // blck_size * type_size
-            bytes_per_row = fullshape[1] // blck_size * type_size
-            offset_current_col = current_col // blck_size * type_size
-            for row in range(partshape[0]):
-                offset_row = row * bytes_per_row
-                offset = offset_row + offset_current_col
-                fout.seek(tensor_data_offset + offset)
-                fout.write(data[row * bpr:row * bpr + bpr])
-
-        # advance file position to next tensor
-        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
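For reference, a small sketch of the destination-offset arithmetic copy_tensors() uses when stitching shards back together: a split_dim 0 shard is one contiguous run of whole rows, while a split_dim 1 shard contributes one slice to every output row. The shape, part count and F16 element size below are illustrative (block size 1, so the blck_size terms drop out).

# Destination offsets for a 2-way sharded tensor, mirroring copy_tensors().
# Illustrative values: F16 elements (2 bytes, block size 1), merged shape 8 x 6.
TYPE_SIZE = 2
N_PARTS = 2
FULLSHAPE = [8, 6]   # rows x cols of the reassembled tensor

def dest_offsets(split_dim):
    partshape = list(FULLSHAPE)
    partshape[split_dim] //= N_PARTS
    bytes_per_row = FULLSHAPE[1] * TYPE_SIZE
    out = []
    for part_id in range(N_PARTS):
        if split_dim == 0:
            # row shards: each part is one contiguous block of whole rows
            out.append([part_id * partshape[0] * bytes_per_row])
        else:
            # column shards: every output row receives one slice from this part
            col_bytes = part_id * partshape[1] * TYPE_SIZE
            out.append([row * bytes_per_row + col_bytes for row in range(partshape[0])])
    return out

print(dest_offsets(0))   # [[0], [48]] -- parts stack row-wise
print(dest_offsets(1))   # [[0, 12, ...], [6, 18, ...]] -- parts interleave within each row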
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
-    parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)')
-    parser.add_argument('fout_path', help='your new ggjt file name')
-    return parser.parse_args()
-
-def main():
-    args = parse_args()
-    assert args.fin_path
-    assert args.fout_path
-    assert args.fin_path != args.fout_path
-
-    with open(args.fin_path, "rb") as fin:
-        hparams = read_hparams(fin)
-        tokens = read_tokens(fin, hparams)
-
-    if hparams['magic'] == 0x67676a74: # ggjt
-        print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
-        sys.exit(1)
-
-    if hparams['magic'] != 0x67676d66: # ggmf
-        print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
-        sys.exit(1)
-
-    hparams['magic'] = 0x67676a74 # ggjt
-
-    # count number of multipart files by convention
-    n_parts = 1
-    while True:
-        if os.path.exists(f"{args.fin_path}.{n_parts}"):
-            n_parts += 1
-        else:
-            break
-
-    # we output a single file for ggml
-    with open(args.fout_path, "wb") as fout:
-        write_hparams(fout, hparams)
-        write_tokens(fout, tokens)
-        offset_of_tensors = fout.tell()
-        # the tensors we load could be split across multiple files
-        for part_id in range(n_parts):
-            fout.seek(offset_of_tensors)
-            print(f"Processing part {part_id+1} of {n_parts}\n")
-            fin_path = args.fin_path
-            if part_id > 0:
-                fin_path += f".{part_id}"
-            with open(fin_path, "rb") as fin:
-                read_tokens(fin, read_hparams(fin))
-                copy_tensors(fin, fout, part_id, n_parts)
-
-    print(f"Done. Output file: {args.fout_path}\n")
-
-if __name__ == "__main__":
-    main()
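After migrating, a hedged sanity check along the following lines can confirm that the output file really carries the 'ggjt' magic before the original files are deleted; the header layout mirrors read_hparams() above, and the model path is illustrative.

import struct

HPARAMS = ['magic', 'version', 'n_vocab', 'n_embd', 'n_mult',
           'n_head', 'n_layer', 'n_rot', 'f16']

def check_ggjt(path):
    # read the nine int32 header fields the same way read_hparams() does
    with open(path, "rb") as fin:
        fields = struct.unpack("i" * len(HPARAMS), fin.read(4 * len(HPARAMS)))
    hparams = dict(zip(HPARAMS, fields))
    assert hparams['magic'] == 0x67676a74, f"not a ggjt file: {hparams['magic']:#x}"
    return hparams

if __name__ == "__main__":
    # illustrative path; point this at the migrated output
    print(check_ggjt("models/7B/ggml-model-f16.bin"))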