blacked unversioned-ggml-to-ggml
parent efab7f8bad
commit 382c0c6100
1 changed file with 20 additions and 9 deletions
@@ -10,26 +10,31 @@ from sentencepiece import SentencePieceProcessor
 
 HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
-    parser.add_argument('dir_model', help='directory containing ggml .bin files')
-    parser.add_argument('tokenizer_model', help='path to LLaMA tokenizer.model file')
+    parser = argparse.ArgumentParser(
+        description="Upgrade old ggml model files to the current format"
+    )
+    parser.add_argument("dir_model", help="directory containing ggml .bin files")
+    parser.add_argument("tokenizer_model", help="path to LLaMA tokenizer.model file")
     return parser.parse_args()
 
+
 def read_header(f_in):
     struct_fmt = "i" * (3 + len(HPARAMS))
     struct_size = struct.calcsize(struct_fmt)
     buf = f_in.read(struct_size)
     return struct.unpack(struct_fmt, buf)
 
+
 def write_header(f_out, header):
     (magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype) = header
 
-    if magic != 0x67676d6c:
-        raise Exception('Invalid file magic. Must be an old style ggml file.')
+    if magic != 0x67676D6C:
+        raise Exception("Invalid file magic. Must be an old style ggml file.")
 
     values = [
-        0x67676d66, # magic: ggml in hex
+        0x67676D66, # magic: ggml in hex
         1, # file version
         vocab_size,
         dim,
@@ -37,10 +42,11 @@ def write_header(f_out, header):
         n_heads,
         n_layers,
         rot,
-        ftype
+        ftype,
     ]
     f_out.write(struct.pack("i" * len(values), *values))
 
+
 def write_tokens(fout, tokenizer):
     for i in range(tokenizer.vocab_size()):
         if tokenizer.is_unknown(i):
@@ -60,12 +66,14 @@ def write_tokens(fout, tokenizer):
         fout.write(text)
         fout.write(struct.pack("f", tokenizer.get_score(i)))
 
+
 def read_tokens(f_in, tokenizer):
     for i in range(tokenizer.vocab_size()):
         len_b = f_in.read(4)
         (length,) = struct.unpack("i", len_b)
         f_in.read(length)
 
+
 def copy_all_data(f_out, f_in):
     while True:
         buf = f_in.read(1024 * 1024)
@@ -73,6 +81,7 @@ def copy_all_data(f_out, f_in):
             break
         f_out.write(buf)
 
+
 def convert_one_file(path_in, tokenizer):
     path_tmp = f"{path_in}.tmp"
     path_orig = f"{path_in}.orig"
@@ -85,6 +94,7 @@ def convert_one_file(path_in, tokenizer):
     os.rename(path_in, path_orig)
     os.rename(path_tmp, path_in)
 
+
 def main():
     args = parse_args()
     files = []
@@ -96,5 +106,6 @@ def main():
     for file in files:
         convert_one_file(file, tokenizer)
 
+
 if __name__ == "__main__":
     main()
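
Note (not part of the commit): the two magic constants this diff touches are easiest to read as ASCII. A minimal sketch, decoding only the literals that appear in write_header above:

    import struct

    OLD_MAGIC = 0x67676D6C  # what the unversioned input files start with
    NEW_MAGIC = 0x67676D66  # what write_header() emits after conversion

    # Read as big-endian ASCII, the tags spell out the format names:
    print(OLD_MAGIC.to_bytes(4, "big"))  # b'ggml'
    print(NEW_MAGIC.to_bytes(4, "big"))  # b'ggmf'

    # In the file itself the script packs them with native (typically
    # little-endian) byte order via struct.pack("i", ...), so the bytes
    # appear reversed on disk.
    print(struct.pack("i", NEW_MAGIC))   # b'fmgg' on a little-endian machine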
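A quick way to sanity-check an upgraded file is to read the header back with the same layout write_header() packs: magic, file version, the five HPARAMS values (vocab_size, dim, multiple_of, n_heads, n_layers), rot, ftype — nine int32 values. A hedged sketch, not part of the script; the helper name and path below are placeholders:

    import struct

    FIELDS = ["magic", "version", "vocab_size", "dim",
              "multiple_of", "n_heads", "n_layers", "rot", "ftype"]

    def check_upgraded_header(path):
        # Nine native-endian int32 values, mirroring struct.pack("i" * len(values), ...)
        with open(path, "rb") as f:
            values = struct.unpack("9i", f.read(9 * 4))
        header = dict(zip(FIELDS, values))
        assert header["magic"] == 0x67676D66, f"unexpected magic {header['magic']:#x}"
        assert header["version"] == 1, f"unexpected file version {header['version']}"
        return header

    # Example with a placeholder path:
    # print(check_upgraded_header("models/7B/ggml-model-f16.bin"))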