From dc07dc492ef9640bbb82904d7c7679f7bdcf6d76 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Wed, 30 Aug 2023 02:25:50 -0600 Subject: [PATCH 01/27] convert : various script cleanups/fixes + merges and special token handling (#2842) * convert: Fix permute calls and method/func definitions * Cleanups for gguf-py * Minor types cleanups. * Initial implementation of handling merges and special tokens * convert: Handle special tokens and merges in vocab only mode convert: Vocab only mode no longer requires loading model tensors * gguf: Refactor tensor name mapping * convert: Fix type hint for special_token_types in SpecialVocab * Use common special vocab handling in various conversion scripts * First pass at implementing suggested changes * Second pass * gguf: SpecialVocab: Fix issue with special token content not in a dict gguf: SpecialVocab: Allow skipping handling of merges * convert-falcon-hf-to-gguf: Support --vocab-only option, bail out if no tokenizer.json * convert-gptneox-hf-to-gguf and convert: Only handle merges for BPE tokenizer * gguf: SpecialVocab: Actually set load_merges in object * Uniform args parsing and vocab only mode for convert examples * convert.py: Set gpt2 as tokenizer model when using BPE * Squish last type warning in gguf.py - yay! --- convert-falcon-hf-to-gguf.py | 170 +++++----- convert-gptneox-hf-to-gguf.py | 175 ++++------ convert-llama-7b-pth-to-gguf.py | 200 +++++------- convert-llama-ggmlv3-to-gguf.py | 28 +- convert-llama-hf-to-gguf.py | 203 +++++------- convert-lora-to-ggml.py | 6 +- convert.py | 142 ++++---- gguf-py/gguf/gguf.py | 551 +++++++++++++++++++------------- gguf-py/gguf/py.typed | 0 gguf-py/pyproject.toml | 1 + 10 files changed, 728 insertions(+), 748 deletions(-) create mode 100644 gguf-py/gguf/py.typed diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index 168bcf17f..0fdea70e1 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -8,6 +8,7 @@ import struct import json import numpy as np import torch +import argparse from typing import Any, List from pathlib import Path @@ -32,11 +33,10 @@ def bytes_to_unicode(): bs.append(b) cs.append(2**8+n) n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) + return dict(zip(bs, (chr(n) for n in cs))) -def count_model_parts(dir_model: str) -> int: +def count_model_parts(dir_model: Path) -> int: num_parts = 0 for filename in os.listdir(dir_model): if filename.startswith("pytorch_model-"): @@ -47,17 +47,22 @@ def count_model_parts(dir_model: str) -> int: return num_parts -if len(sys.argv) < 3: - print(f"Usage: python {sys.argv[0]} dir-model ftype\n") - print(" ftype == 0 -> float32") - print(" ftype == 1 -> float16") +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a Falcon model to a GGML compatible file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") + parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) sys.exit(1) - -# output in the same 
directory as the model -dir_model = sys.argv[1] -last_dir = os.path.basename(os.path.normpath(dir_model)) - # possible tensor data types # ftype == 0 -> float32 # ftype == 1 -> float16 @@ -65,25 +70,21 @@ last_dir = os.path.basename(os.path.normpath(dir_model)) # map from ftype to string ftype_str = ["f32", "f16"] -ftype = 1 -if len(sys.argv) > 2: - ftype = int(sys.argv[2]) - if ftype < 0 or ftype > 1: - print("Invalid ftype: " + str(ftype)) +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - sys.exit(1) +print("gguf: loading model "+dir_model.name) -fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf" - -print("gguf: loading model "+last_dir) - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: +with open(dir_model / "config.json", "r", encoding="utf-8") as f: hparams = json.load(f) if hparams["architectures"][0] != "RWForCausalLM": print("Model architecture not supported: " + hparams["architectures"][0]) - sys.exit() + sys.exit(1) # get number of model parts num_parts = count_model_parts(dir_model) @@ -113,77 +114,58 @@ gguf_writer.add_file_type(ftype) print("gguf: get tokenizer metadata") -tokens: List[str] = [] +tokens: List[bytearray] = [] scores: List[float] = [] toktypes: List[int] = [] -merges: List[str] = [] +tokenizer_json_file = dir_model / 'tokenizer.json' +if not tokenizer_json_file.is_file(): + print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr) + sys.exit(1) -if Path(dir_model + "/tokenizer.json").is_file(): - # gpt2 tokenizer - gguf_writer.add_tokenizer_model("gpt2") +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") - print("gguf: get gpt2 tokenizer merges") +with open(tokenizer_json_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) - with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - merges = tokenizer_json["model"]["merges"] +print("gguf: get gpt2 tokenizer vocab") - gguf_writer.add_token_merges(merges) +vocab_size = len(tokenizer_json["model"]["vocab"]) - print("gguf: get gpt2 tokenizer vocab") +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = len(tokenizer_json["model"]["vocab"]) +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} +byte_encoder = bytes_to_unicode() +byte_decoder = {v: k for k, v in byte_encoder.items()} - # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py - tokenizer = AutoTokenizer.from_pretrained(dir_model) +for i in range(vocab_size): + if i in reverse_vocab: + try: + text = bytearray([byte_decoder[c] for c in reverse_vocab[i]]) + except KeyError: + text = bytearray() + for c in reverse_vocab[i]: + if ord(c) < 256: # single byte character + text.append(byte_decoder[ord(c)]) + else: # multibyte special token character + text.extend(c.encode('utf-8')) + else: + print(f"Key {i} not in tokenizer vocabulary. 
Padding with an arbitrary token.") + pad_token = f"[PAD{i}]".encode("utf8") + text = bytearray(pad_token) - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - byte_encoder = bytes_to_unicode() - byte_decoder = {v: k for k, v in byte_encoder.items()} + tokens.append(text) + scores.append(0.0) # dymmy + toktypes.append(gguf.TokenType.NORMAL) # dummy - for i in range(vocab_size): - if i in reverse_vocab: - try: - text = bytearray([byte_decoder[c] for c in reverse_vocab[i]]) - except KeyError: - text = bytearray() - for c in reverse_vocab[i]: - if ord(c) < 256: # single byte character - text.append(byte_decoder[ord(c)]) - else: # multibyte special token character - text.extend(c.encode('utf-8')) - else: - print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.") - pad_token = f"[PAD{i}]".encode("utf8") - text = bytearray(pad_token) - - tokens.append(text) - scores.append(0.0) # dymmy - toktypes.append(gguf.TokenType.NORMAL) # dummy - - gguf_writer.add_token_list(tokens) - gguf_writer.add_token_scores(scores) - gguf_writer.add_token_types(toktypes) - -print("gguf: get special token ids") -# Look for special tokens in config.json - -if "bos_token_id" in hparams and hparams["bos_token_id"] != None: - gguf_writer.add_bos_token_id(hparams["bos_token_id"]) - -if "eos_token_id" in hparams and hparams["eos_token_id"] != None: - gguf_writer.add_eos_token_id(hparams["eos_token_id"]) - -if "unk_token_id" in hparams and hparams["unk_token_id"] != None: - gguf_writer.add_unk_token_id(hparams["unk_token_id"]) - -if "sep_token_id" in hparams and hparams["sep_token_id"] != None: - gguf_writer.add_sep_token_id(hparams["sep_token_id"]) - -if "pad_token_id" in hparams and hparams["pad_token_id"] != None: - gguf_writer.add_pad_token_id(hparams["pad_token_id"]) +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_scores(scores) +gguf_writer.add_token_types(toktypes) +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) +special_vocab.add_to_gguf(gguf_writer) # TENSORS @@ -199,15 +181,17 @@ head_dim = hparams["hidden_size"] // n_head print("gguf: get tensor metadata") if num_parts == 0: - part_names = ("pytorch_model.bin",) + part_names = iter(("pytorch_model.bin",)) else: part_names = ( f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) ) for part_name in part_names: + if args.vocab_only: + break print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") + model_part = torch.load(dir_model / part_name, map_location="cpu") for name in model_part.keys(): data = model_part[name] @@ -238,11 +222,8 @@ for part_name in part_names: data = data.squeeze().numpy() # map tensor names - if name.endswith(".weight") and name[:-7] in tensor_map: - name = tensor_map[name[:-7]] + ".weight" - elif name.endswith(".bias") and name[:-5] in tensor_map: - name = tensor_map[name[:-5]] + ".bias" - else: + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: print("Can not map tensor '" + name + "'") sys.exit() @@ -261,19 +242,20 @@ for part_name in part_names: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(name, data) + 
gguf_writer.add_tensor(new_name, data) print("gguf: write header") gguf_writer.write_header_to_file() print("gguf: write metadata") gguf_writer.write_kv_data_to_file() -print("gguf: write tensors") -gguf_writer.write_tensors_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() gguf_writer.close() -print("gguf: model successfully exported to '" + fname_out + "'") +print(f"gguf: model successfully exported to '{fname_out}'") print("") diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py index d9c42d76b..38e71e03b 100755 --- a/convert-gptneox-hf-to-gguf.py +++ b/convert-gptneox-hf-to-gguf.py @@ -8,6 +8,7 @@ import struct import json import numpy as np import torch +import argparse from typing import Any, List from pathlib import Path @@ -34,11 +35,10 @@ def bytes_to_unicode(): bs.append(b) cs.append(2**8+n) n += 1 - cs = [chr(n) for n in cs] - return dict(zip(bs, cs)) + return dict(zip(bs, (chr(n) for n in cs))) -def count_model_parts(dir_model: str) -> int: +def count_model_parts(dir_model: Path) -> int: num_parts = 0 for filename in os.listdir(dir_model): if filename.startswith("pytorch_model-"): @@ -49,17 +49,22 @@ def count_model_parts(dir_model: str) -> int: return num_parts -if len(sys.argv) < 3: - print(f"Usage: python {sys.argv[0]} dir-model ftype\n") - print(" ftype == 0 -> float32") - print(" ftype == 1 -> float16") +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a GPT-NeoX model to a GGML compatible file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") + parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) sys.exit(1) - -# output in the same directory as the model -dir_model = sys.argv[1] -last_dir = os.path.basename(os.path.normpath(dir_model)) - # possible tensor data types # ftype == 0 -> float32 # ftype == 1 -> float16 @@ -67,19 +72,15 @@ last_dir = os.path.basename(os.path.normpath(dir_model)) # map from ftype to string ftype_str = ["f32", "f16"] -ftype = 1 -if len(sys.argv) > 2: - ftype = int(sys.argv[2]) - if ftype < 0 or ftype > 1: - print("Invalid ftype: " + str(ftype)) +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - sys.exit(1) +print("gguf: loading model "+dir_model.name) -fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf" - -print("gguf: loading model "+last_dir) - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: +with open(dir_model / "config.json", "r", encoding="utf-8") as f: hparams = json.load(f) if hparams["architectures"][0] != "GPTNeoXForCausalLM": @@ -97,7 +98,7 @@ print("gguf: get model metadata") block_count = hparams["num_hidden_layers"] -gguf_writer.add_name(last_dir) +gguf_writer.add_name(dir_model.name) gguf_writer.add_context_length(hparams["max_position_embeddings"]) gguf_writer.add_embedding_length(hparams["hidden_size"]) 
gguf_writer.add_block_count(block_count) @@ -111,86 +112,52 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"]) print("gguf: get tokenizer metadata") -tokens: List[str] = [] -merges: List[str] = [] +tokens: List[bytearray] = [] +tokenizer_json_file = dir_model / 'tokenizer.json' +if not tokenizer_json_file.is_file(): + print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr) + sys.exit(1) -if Path(dir_model + "/tokenizer.json").is_file(): - # gpt2 tokenizer - gguf_writer.add_tokenizer_model("gpt2") +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") - print("gguf: get gpt2 tokenizer merges") +with open(tokenizer_json_file, "r", encoding="utf-8") as f: + tokenizer_json = json.load(f) - with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - merges = tokenizer_json["model"]["merges"] +print("gguf: get gpt2 tokenizer vocab") - gguf_writer.add_token_merges(merges) +vocab_size = len(tokenizer_json["model"]["vocab"]) - print("gguf: get gpt2 tokenizer vocab") +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = len(tokenizer_json["model"]["vocab"]) +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} +byte_encoder = bytes_to_unicode() +byte_decoder = {v: k for k, v in byte_encoder.items()} - # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py - tokenizer = AutoTokenizer.from_pretrained(dir_model) +for i in range(vocab_size): + if i in reverse_vocab: + try: + text = bytearray([byte_decoder[c] for c in reverse_vocab[i]]) + except KeyError: + text = bytearray() + for c in reverse_vocab[i]: + if ord(c) < 256: # single byte character + text.append(byte_decoder[ord(c)]) + else: # multibyte special token character + text.extend(c.encode('utf-8')) + else: + print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.") + pad_token = f"[PAD{i}]".encode("utf8") + text = bytearray(pad_token) - reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} - byte_encoder = bytes_to_unicode() - byte_decoder = {v: k for k, v in byte_encoder.items()} + tokens.append(text) - for i in range(vocab_size): - if i in reverse_vocab: - try: - text = bytearray([byte_decoder[c] for c in reverse_vocab[i]]) - except KeyError: - text = bytearray() - for c in reverse_vocab[i]: - if ord(c) < 256: # single byte character - text.append(byte_decoder[ord(c)]) - else: # multibyte special token character - text.extend(c.encode('utf-8')) - else: - print(f"Key {i} not in tokenizer vocabulary. 
Padding with an arbitrary token.") - pad_token = f"[PAD{i}]".encode("utf8") - text = bytearray(pad_token) - - tokens.append(text) - - gguf_writer.add_token_list(tokens) - - if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file(): - print("gguf: get special token ids") - - with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_config = json.load(f) - - # find special token ids - - if "bos_token" in tokenizer_config: - for key in tokenizer_json["added_tokens"]: - if key["content"] == tokenizer_config["bos_token"]: - gguf_writer.add_bos_token_id(key["id"]) - - if "eos_token" in tokenizer_config: - for key in tokenizer_json["added_tokens"]: - if key["content"] == tokenizer_config["eos_token"]: - gguf_writer.add_eos_token_id(key["id"]) - - if "unk_token" in tokenizer_config: - for key in tokenizer_json["added_tokens"]: - if key["content"] == tokenizer_config["unk_token"]: - gguf_writer.add_unk_token_id(key["id"]) - - if "sep_token" in tokenizer_config: - for key in tokenizer_json["added_tokens"]: - if key["content"] == tokenizer_config["sep_token"]: - gguf_writer.add_sep_token_id(key["id"]) - - if "pad_token" in tokenizer_config: - for key in tokenizer_json["added_tokens"]: - if key["content"] == tokenizer_config["pad_token"]: - gguf_writer.add_pad_token_id(key["id"]) +gguf_writer.add_token_list(tokens) +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) +special_vocab.add_to_gguf(gguf_writer) # TENSORS @@ -200,13 +167,15 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count) print("gguf: get tensor metadata") if num_parts == 0: - part_names = ("pytorch_model.bin",) + part_names = iter(("pytorch_model.bin",)) else: part_names = ( f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) ) for part_name in part_names: + if args.vocab_only: + break print("gguf: loading model part '" + part_name + "'") model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") @@ -226,11 +195,8 @@ for part_name in part_names: data = data.squeeze().numpy() # map tensor names - if name.endswith(".weight") and name[:-7] in tensor_map: - name = tensor_map[name[:-7]] + ".weight" - elif name.endswith(".bias") and name[:-5] in tensor_map: - name = tensor_map[name[:-5]] + ".bias" - else: + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: print("Can not map tensor '" + name + "'") sys.exit() @@ -249,19 +215,20 @@ for part_name in part_names: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(name, data) + gguf_writer.add_tensor(new_name, data) print("gguf: write header") gguf_writer.write_header_to_file() print("gguf: write metadata") gguf_writer.write_kv_data_to_file() -print("gguf: write tensors") -gguf_writer.write_tensors_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() gguf_writer.close() -print("gguf: model successfully exported to '" + fname_out + "'") +print(f"gguf: model successfully exported to '{fname_out}'") print("") diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py index 2ab082383..6e973a116 100755 --- a/convert-llama-7b-pth-to-gguf.py +++ b/convert-llama-7b-pth-to-gguf.py 
@@ -10,8 +10,9 @@ import struct import json import numpy as np import torch +import argparse -from typing import Any, List +from typing import Any, List, TypeAlias from pathlib import Path from sentencepiece import SentencePieceProcessor @@ -20,7 +21,7 @@ from sentencepiece import SentencePieceProcessor NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]' -def count_model_parts(dir_model: str) -> int: +def count_model_parts(dir_model: Path) -> int: num_parts = 0 for filename in os.listdir(dir_model): if filename.startswith("consolidated."): @@ -31,19 +32,22 @@ def count_model_parts(dir_model: str) -> int: return num_parts -if len(sys.argv) < 3: - print(f"Usage: python {sys.argv[0]} dir-model ftype\n") - print(" ftype == 0 -> float32") - print(" ftype == 1 -> float16") +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a PyTorch 7B LLaMA model to a GGML compatible file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") + parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) + return parser.parse_args() +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) sys.exit(1) - -# output in the same directory as the model -dir_model = sys.argv[1] -last_dir = os.path.basename(os.path.normpath(dir_model)) - - # possible tensor data types # ftype == 0 -> float32 # ftype == 1 -> float16 @@ -51,19 +55,15 @@ last_dir = os.path.basename(os.path.normpath(dir_model)) # map from ftype to string ftype_str = ["f32", "f16"] -ftype = 1 -if len(sys.argv) > 2: - ftype = int(sys.argv[2]) - if ftype < 0 or ftype > 1: - print("Invalid ftype: " + str(ftype)) +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - sys.exit(1) +print("gguf: loading model "+dir_model.name) -fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf" - -print("gguf: loading model "+last_dir) - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: +with open(dir_model / "config.json", "r", encoding="utf-8") as f: hparams = json.load(f) if hparams["architectures"][0] != "LlamaForCausalLM": @@ -107,7 +107,7 @@ else: sys.exit() -gguf_writer.add_name(last_dir) +gguf_writer.add_name(dir_model.name) gguf_writer.add_source_hf_repo(hf_repo) gguf_writer.add_tensor_data_layout("Meta AI original pth") gguf_writer.add_context_length(ctx_length) @@ -133,109 +133,60 @@ tokens: List[bytes] = [] scores: List[float] = [] toktypes: List[int] = [] -if Path(dir_model + "/tokenizer.model").is_file(): - # vocab type sentencepiece - print("gguf: get sentencepiece tokenizer vocab and scores") +tokenizer_model_file = dir_model / 'tokenizer.model' +if not tokenizer_model_file.is_file(): + print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) + sys.exit(1) - tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model") +# vocab type sentencepiece +print("gguf: get sentencepiece tokenizer vocab and scores") - for i in range(tokenizer.vocab_size()): - text: bytes - score: float +tokenizer = 
SentencePieceProcessor(str(tokenizer_model_file)) - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) +for i in range(tokenizer.vocab_size()): + text: bytes + score: float - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 + piece = tokenizer.id_to_piece(i) + text = piece.encode("utf-8") + score = tokenizer.get_score(i) - # toktype = 4 is user-defined = tokens from added_tokens.json + toktype = 1 # defualt to normal token type + if tokenizer.is_unknown(i): + toktype = 2 + if tokenizer.is_control(i): + toktype = 3 - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 + # toktype = 4 is user-defined = tokens from added_tokens.json - tokens.append(text) - scores.append(score) - toktypes.append(toktype) + if tokenizer.is_unused(i): + toktype = 5 + if tokenizer.is_byte(i): + toktype = 6 - if Path(dir_model + "/added_tokens.json").is_file(): - with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: - addtokens_json = json.load(f) + tokens.append(text) + scores.append(score) + toktypes.append(toktype) - print("gguf: get added tokens") +added_tokens_file = dir_model / 'added_tokens.json' +if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + addtokens_json = json.load(f) - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type + print("gguf: get added tokens") - gguf_writer.add_tokenizer_model("llama") - gguf_writer.add_token_list(tokens) - gguf_writer.add_token_scores(scores) - gguf_writer.add_token_types(toktypes) + for key in addtokens_json: + tokens.append( key.encode("utf-8") ) + scores.append(-1000.0) + toktypes.append(4) # user-defined token type +gguf_writer.add_tokenizer_model("llama") +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_scores(scores) +gguf_writer.add_token_types(toktypes) -print("gguf: get special token ids") - -if Path(dir_model + "/tokenizer.json").is_file(): - # Look for special tokens in tokenizer.json if it exists - - with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f: - tokenizer = json.load(f) - - if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file(): - - with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_config = json.load(f) - - if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None: - for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["bos_token"]["content"]: - gguf_writer.add_bos_token_id(key["id"]) - - if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None: - for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["eos_token"]["content"]: - gguf_writer.add_eos_token_id(key["id"]) - - if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None: - for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["unk_token"]["content"]: - gguf_writer.add_unk_token_id(key["id"]) - - if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None: - for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["sep_token"]["content"]: - gguf_writer.add_sep_token_id(key["id"]) - - if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None: - for key in tokenizer["added_tokens"]: - if key["content"] == 
tokenizer_config["pad_token"]["content"]: - gguf_writer.add_pad_token_id(key["id"]) -else: - # If no tokenizer.json: Look for special tokens in config.json - - if "bos_token_id" in hparams and hparams["bos_token_id"] != None: - gguf_writer.add_bos_token_id(hparams["bos_token_id"]) - - if "eos_token_id" in hparams and hparams["eos_token_id"] != None: - gguf_writer.add_eos_token_id(hparams["eos_token_id"]) - - if "unk_token_id" in hparams and hparams["unk_token_id"] != None: - gguf_writer.add_unk_token_id(hparams["unk_token_id"]) - - if "sep_token_id" in hparams and hparams["sep_token_id"] != None: - gguf_writer.add_sep_token_id(hparams["sep_token_id"]) - - if "pad_token_id" in hparams and hparams["pad_token_id"] != None: - gguf_writer.add_pad_token_id(hparams["pad_token_id"]) - +special_vocab = gguf.SpecialVocab(dir_model) +special_vocab.add_to_gguf(gguf_writer) # TENSORS @@ -247,6 +198,8 @@ print("gguf: get tensor metadata") part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts)) for part_name in part_names: + if args.vocab_only: + break print("gguf: loading model part '" + part_name + "'") model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") @@ -266,11 +219,8 @@ for part_name in part_names: data = data.squeeze().numpy() # map tensor names - if name.endswith(".weight") and name[:-7] in tensor_map: - name = tensor_map[name[:-7]] + ".weight" - elif name.endswith(".bias") and name[:-5] in tensor_map: - name = tensor_map[name[:-5]] + ".bias" - else: + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: print("Can not map tensor '" + name + "'") sys.exit() @@ -289,20 +239,20 @@ for part_name in part_names: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(name, data) + gguf_writer.add_tensor(new_name, data) print("gguf: write header") gguf_writer.write_header_to_file() print("gguf: write metadata") gguf_writer.write_kv_data_to_file() -print("gguf: write tensors") -gguf_writer.write_tensors_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() gguf_writer.close() - -print("gguf: model successfully exported to '" + fname_out + "'") +print(f"gguf: model successfully exported to '{fname_out}'") print("") diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py index 3bf93627d..c8e7f1761 100755 --- a/convert-llama-ggmlv3-to-gguf.py +++ b/convert-llama-ggmlv3-to-gguf.py @@ -75,7 +75,7 @@ class Tensor: self.dims = () self.dtype = None self.start_offset = 0 - self.len_bytes = 0 + self.len_bytes = np.int64(0) def load(self, data, offset): orig_offset = offset @@ -134,13 +134,14 @@ class GGMLV3Model: return offset class GGMLToGGUF: - def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None): + def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None): hp = ggml_model.hyperparameters self.model = ggml_model self.data = data self.cfg = cfg self.params_override = params_override self.vocab_override = vocab_override + self.special_vocab = special_vocab if params_override is not None: n_kv_head = params_override.n_head_kv else: @@ -162,6 +163,8 @@ class GGMLToGGUF: gguf_writer = 
gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False) self.add_params(gguf_writer) self.add_vocab(gguf_writer) + if self.special_vocab is not None: + self.special_vocab.add_to_gguf(gguf_writer) self.add_tensors(gguf_writer) print(" gguf: write header") gguf_writer.write_header_to_file() @@ -259,20 +262,13 @@ class GGMLToGGUF: gguf_writer.add_eos_token_id(2) def add_tensors(self, gguf_writer): - nm = self.name_map + tensor_map = self.name_map data = self.data print(f'* Adding {len(self.model.tensors)} tensor(s)') for tensor in self.model.tensors: name = str(tensor.name, 'UTF-8') - if name.endswith('.weight'): - name = name[:-7] - suffix = '.weight' - elif name.endswith('.bias'): - name = name[:-5] - suffix = '.bias' - mapped_name = nm.get(name) + mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) assert mapped_name is not None, f'Bad name {name}' - mapped_name += suffix tempdims = list(tensor.dims[:]) if len(tempdims) > 1: temp = tempdims[1] @@ -302,8 +298,10 @@ def handle_metadata(cfg, hp): else: raise ValueError('Unable to load metadata') vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype) + # FIXME: Respect cfg.vocab_dir? + svocab = gguf.SpecialVocab(cfg.model_metadata_dir) convert.check_vocab_size(params, vocab) - return (params, vocab) + return (params, vocab, svocab) def handle_args(): parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF') @@ -330,14 +328,16 @@ def main(): print(f'* GGML model hyperparameters: {model.hyperparameters}') vocab_override = None params_override = None + special_vocab = None if cfg.model_metadata_dir is not None: - (params_override, vocab_override) = handle_metadata(cfg, model.hyperparameters) + (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters) print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.') print(f'* Overriding params: {params_override}') print(f'* Overriding vocab: {vocab_override}') + print(f'* Special vocab: {special_vocab}') else: print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') - converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override) + converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab) converter.save() print(f'* Successful completion. 
Output saved to: {cfg.output}') diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py index b00810dbb..ab94b5eab 100755 --- a/convert-llama-hf-to-gguf.py +++ b/convert-llama-hf-to-gguf.py @@ -8,8 +8,9 @@ import struct import json import numpy as np import torch +import argparse -from typing import Any, List, Optional +from typing import Any, List, Optional, TypeAlias from pathlib import Path from sentencepiece import SentencePieceProcessor @@ -43,40 +44,38 @@ def count_model_parts(dir_model: str) -> int: return num_parts -if len(sys.argv) < 3: - print(f"Usage: python {sys.argv[0]} dir-model ftype\n") - print(" ftype == 0 -> float32") - print(" ftype == 1 -> float16") +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") + parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) + return parser.parse_args() +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) sys.exit(1) - -# output in the same directory as the model -dir_model = sys.argv[1] -last_dir = os.path.basename(os.path.normpath(dir_model)) - - # possible tensor data types # ftype == 0 -> float32 # ftype == 1 -> float16 - # map from ftype to string ftype_str = ["f32", "f16"] -ftype = 1 -if len(sys.argv) > 2: - ftype = int(sys.argv[2]) - if ftype < 0 or ftype > 1: - print("Invalid ftype: " + str(ftype)) +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - sys.exit(1) +print("gguf: loading model "+dir_model.name) -fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf" - -print("gguf: loading model "+last_dir) - -with open(dir_model + "/config.json", "r", encoding="utf-8") as f: +with open(dir_model / "config.json", "r", encoding="utf-8") as f: hparams = json.load(f) if hparams["architectures"][0] != "LlamaForCausalLM": @@ -115,7 +114,7 @@ else: sys.exit() -gguf_writer.add_name(last_dir) +gguf_writer.add_name(dir_model.name) gguf_writer.add_source_hf_repo(hf_repo) gguf_writer.add_tensor_data_layout("Meta AI original pth") gguf_writer.add_context_length(ctx_length) @@ -141,110 +140,61 @@ tokens: List[bytes] = [] scores: List[float] = [] toktypes: List[int] = [] -if Path(dir_model + "/tokenizer.model").is_file(): - # vocab type sentencepiece - print("gguf: get sentencepiece tokenizer vocab, scores and token types") +tokenizer_model_file = dir_model / 'tokenizer.model' +if not tokenizer_model_file.is_file(): + print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) + sys.exit(1) - tokenizer = SentencePieceProcessor(dir_model + "/tokenizer.model") +# vocab type sentencepiece +print("gguf: get sentencepiece tokenizer vocab, scores and token types") - for i in range(tokenizer.vocab_size()): - text: bytes - score: float +tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) +for i in 
range(tokenizer.vocab_size()): + text: bytes + score: float - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 + piece = tokenizer.id_to_piece(i) + text = piece.encode("utf-8") + score = tokenizer.get_score(i) - # toktype = 4 is user-defined = tokens from added_tokens.json + toktype = 1 # defualt to normal token type + if tokenizer.is_unknown(i): + toktype = 2 + if tokenizer.is_control(i): + toktype = 3 - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 + # toktype = 4 is user-defined = tokens from added_tokens.json - tokens.append(text) - scores.append(score) - toktypes.append(toktype) + if tokenizer.is_unused(i): + toktype = 5 + if tokenizer.is_byte(i): + toktype = 6 - if Path(dir_model + "/added_tokens.json").is_file(): - with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: - addtokens_json = json.load(f) + tokens.append(text) + scores.append(score) + toktypes.append(toktype) - print("gguf: get added tokens") +added_tokens_file = dir_model / 'added_tokens.json' +if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + addtokens_json = json.load(f) - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type + print("gguf: get added tokens") + + for key in addtokens_json: + tokens.append( key.encode("utf-8") ) + scores.append(-1000.0) + toktypes.append(4) # user-defined token type - gguf_writer.add_tokenizer_model("llama") - gguf_writer.add_token_list(tokens) - gguf_writer.add_token_scores(scores) - gguf_writer.add_token_types(toktypes) - - -print("gguf: get special token ids") - -if Path(dir_model + "/tokenizer.json").is_file(): - # Look for special tokens in tokenizer.json if it exists - - with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f: - tokenizer = json.load(f) - - if "added_tokens" in tokenizer and Path(dir_model + "/tokenizer_config.json").is_file(): - - with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f: - tokenizer_config = json.load(f) - - if "bos_token" in tokenizer_config and tokenizer_config["bos_token"] != None: - for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["bos_token"]["content"]: - gguf_writer.add_bos_token_id(key["id"]) - - if "eos_token" in tokenizer_config and tokenizer_config["eos_token"] != None: - for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["eos_token"]["content"]: - gguf_writer.add_eos_token_id(key["id"]) - - if "unk_token" in tokenizer_config and tokenizer_config["unk_token"] != None: - for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["unk_token"]["content"]: - gguf_writer.add_unk_token_id(key["id"]) - - if "sep_token" in tokenizer_config and tokenizer_config["sep_token"] != None: - for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["sep_token"]["content"]: - gguf_writer.add_sep_token_id(key["id"]) - - if "pad_token" in tokenizer_config and tokenizer_config["pad_token"] != None: - for key in tokenizer["added_tokens"]: - if key["content"] == tokenizer_config["pad_token"]["content"]: - gguf_writer.add_pad_token_id(key["id"]) -else: - # If no tokenizer.json: Look for special tokens in config.json - - if "bos_token_id" in hparams and hparams["bos_token_id"] != None: - gguf_writer.add_bos_token_id(hparams["bos_token_id"]) - - if "eos_token_id" in hparams 
and hparams["eos_token_id"] != None: - gguf_writer.add_eos_token_id(hparams["eos_token_id"]) - - if "unk_token_id" in hparams and hparams["unk_token_id"] != None: - gguf_writer.add_unk_token_id(hparams["unk_token_id"]) - - if "sep_token_id" in hparams and hparams["sep_token_id"] != None: - gguf_writer.add_sep_token_id(hparams["sep_token_id"]) - - if "pad_token_id" in hparams and hparams["pad_token_id"] != None: - gguf_writer.add_pad_token_id(hparams["pad_token_id"]) +gguf_writer.add_tokenizer_model("llama") +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_scores(scores) +gguf_writer.add_token_types(toktypes) +special_vocab = gguf.SpecialVocab(dir_model) +special_vocab.add_to_gguf(gguf_writer) # TENSORS @@ -254,13 +204,15 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count) print("gguf: get tensor metadata") if num_parts == 0: - part_names = ("pytorch_model.bin",) + part_names = iter(("pytorch_model.bin",)) else: part_names = ( f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) ) for part_name in part_names: + if args.vocab_only: + break print("gguf: loading model part '" + part_name + "'") model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") @@ -286,11 +238,8 @@ for part_name in part_names: data = reverse_hf_permute(data, head_count, head_count_kv) # map tensor names - if name.endswith(".weight") and name[:-7] in tensor_map: - name = tensor_map[name[:-7]] + ".weight" - elif name.endswith(".bias") and name[:-5] in tensor_map: - name = tensor_map[name[:-5]] + ".bias" - else: + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: print("Can not map tensor '" + name + "'") sys.exit() @@ -309,20 +258,20 @@ for part_name in part_names: if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: data = data.astype(np.float16) - print(name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - gguf_writer.add_tensor(name, data) + gguf_writer.add_tensor(new_name, data) print("gguf: write header") gguf_writer.write_header_to_file() print("gguf: write metadata") gguf_writer.write_kv_data_to_file() -print("gguf: write tensors") -gguf_writer.write_tensors_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() gguf_writer.close() - -print("gguf: model successfully exported to '" + fname_out + "'") +print(f"gguf: model successfully exported to '{fname_out}'") print("") diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index a94a7d0af..a00339b47 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -4,7 +4,7 @@ import os import re import struct import sys -from typing import Any, Dict, Sequence, TextIO +from typing import Any, Dict, Sequence, BinaryIO import numpy as np import torch @@ -46,7 +46,7 @@ def translate_tensor_name(t: str) -> str: sys.exit(1) -def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None: +def write_file_header(fout: BinaryIO, params: Dict[str, Any]) -> None: fout.write(b"ggla"[::-1]) # magic (ggml lora) fout.write(struct.pack("i", 1)) # file version fout.write(struct.pack("i", params["r"])) @@ -60,7 +60,7 @@ def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None: def write_tensor_header( - self, name: str, shape: Sequence[int], data_type: np.dtype + self, name: str, shape: Sequence[int], data_type: np.dtype[Any] ) -> 
None: sname = name.encode("utf-8") fout.write( diff --git a/convert.py b/convert.py index 3f0a1c932..448b6f0f3 100755 --- a/convert.py +++ b/convert.py @@ -25,7 +25,7 @@ import numpy as np from abc import ABCMeta, abstractmethod from dataclasses import dataclass from pathlib import Path -from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, TypeVar, Union) +from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, Type, TypeVar, Union) from sentencepiece import SentencePieceProcessor # type: ignore if TYPE_CHECKING: @@ -299,8 +299,10 @@ class Params: params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) elif orig_config_path.exists(): params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) - else: + elif model_plus.format != 'none': params = Params.guessed(model_plus.model) + else: + raise ValueError('Cannot guess params when model format is none') params.path_model = model_plus.paths[0].parent @@ -353,7 +355,7 @@ class BpeVocab: yield from self.added_tokens() def __repr__(self) -> str: - return f"BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>" + return f"" class SentencePieceVocab: @@ -416,7 +418,6 @@ class SentencePieceVocab: Vocab = Union[BpeVocab, SentencePieceVocab] - # # data loading # TODO: reuse (probably move to gguf.py?) @@ -439,14 +440,14 @@ class Tensor(metaclass=ABCMeta): @abstractmethod def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ... @abstractmethod - def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': ... + def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> 'UnquantizedTensor': ... @abstractmethod def part(self, n_part: int) -> 'UnquantizedTensor': ... @abstractmethod def to_ggml(self) -> 'GGMLCompatibleTensor': ... -def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray: +def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray: assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}" fp32_arr = bf16_arr.astype(np.uint32) << 16 return fp32_arr.view(np.float32) @@ -467,9 +468,9 @@ class UnquantizedTensor(Tensor): def to_ggml(self) -> 'UnquantizedTensor': return self - def permute_part(self, n_part: int, n_head: int) -> 'UnquantizedTensor': + def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> 'UnquantizedTensor': r = self.ndarray.shape[0] // 3 - return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head)) + return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) def part(self, n_part: int) -> 'UnquantizedTensor': r = self.ndarray.shape[0] // 3 @@ -531,7 +532,7 @@ LazyModel = Dict[str, LazyTensor] class ModelPlus: model: LazyModel paths: List[Path] # Where this was read from. - format: Literal['ggml', 'torch', 'safetensors'] + format: Literal['ggml', 'torch', 'safetensors', 'none'] vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab. 
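# A minimal sketch, assuming illustrative names and shapes rather than the exact
# implementation in convert.py, of the head-wise rotary permutation that the hunk
# below is fixing: the K projection has to be permuted with n_head_kv (grouped-query
# attention) rather than n_head, which is why permute_part/permute_part_lazy now
# carry both head counts.
import numpy as np

def permute_sketch(weights: np.ndarray, n_head: int, n_head_kv: int) -> np.ndarray:
    # when query and KV head counts differ (GQA), permute over the grouped blocks
    if n_head_kv is not None and n_head != n_head_kv:
        n_head //= n_head_kv
    # regroup the interleaved rotary halves into two contiguous halves per block,
    # then restore the original shape
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))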
@@ -597,12 +598,12 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe return lazy_tensor.load().permute(n_head, n_head_kv) return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) -def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int) -> LazyTensor: +def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor: def load() -> Tensor: - return lazy_tensor.load().permute_part(n_part, n_head) + return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv) s = lazy_tensor.shape.copy() s[0] = s[0] // 3 - return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) + return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description) def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: def load() -> Tensor: @@ -657,7 +658,7 @@ class LazyUnpickler(pickle.Unpickler): description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' return LazyStorage(load=load, kind=pid[1], description=description) - # @staticmethod + @staticmethod def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any, # pyright: ignore[reportSelfClsParameterName] requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor: @@ -669,13 +670,15 @@ class LazyUnpickler(pickle.Unpickler): description = f'pickled storage_offset={storage_offset} in {storage.description}' return LazyTensor(load, list(size), storage.kind.data_type, description) - # @staticmethod + @staticmethod def rebuild_from_type_v2(func, new_type, args, state): return func(*args) - CLASSES: Dict[Any, Any] = { - ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2, - ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2, + CLASSES: Dict[Tuple[str, str], Any] = { + # getattr used here as a workaround for mypy not being smart enough to detrmine + # the staticmethods have a __func__ attribute. + ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), + ('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'), ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16), ('torch', 'HalfStorage'): LazyStorageKind(DT_F16), ('torch', 'FloatStorage'): LazyStorageKind(DT_F32), @@ -751,7 +754,7 @@ def lazy_load_file(path: Path) -> ModelPlus: In = TypeVar('In') Out = TypeVar('Out') -def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, factory: Callable = ThreadPoolExecutor) -> Iterable[Out]: +def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, use_processpool_executor: bool = False) -> Iterable[Out]: '''Parallel map, but with backpressure. If the caller doesn't call `next` fast enough, this will stop calling `func` at some point rather than letting results pile up in memory. Specifically, there is a max of one @@ -760,7 +763,12 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc yield from map(func, iterable) # Not reached. 
iterable = iter(iterable) - with factory(max_workers = max_workers) as executor: + executor_class: Union[Type[ThreadPoolExecutor], Type[ProcessPoolExecutor]] + if use_processpool_executor: + executor_class = ProcessPoolExecutor + else: + executor_class = ThreadPoolExecutor + with executor_class(max_workers = max_workers) as executor: futures: List[concurrent.futures.Future[Out]] = [] done = False for _ in range(concurrency): @@ -838,11 +846,19 @@ class OutputFile: scores.append(score) toktypes.append(toktype) - self.gguf.add_tokenizer_model("llama") + if isinstance(vocab, SentencePieceVocab): + self.gguf.add_tokenizer_model("llama") + elif isinstance(vocab, BpeVocab): + self.gguf.add_tokenizer_model("gpt2") + else: + raise ValueError(f'Unknown vocab type: Not BpeVocab or SentencePieceVocab') self.gguf.add_token_list(tokens) self.gguf.add_token_scores(scores) self.gguf.add_token_types(toktypes) + def add_meta_special_vocab(self, svocab: gguf.SpecialVocab) -> None: + svocab.add_to_gguf(self.gguf) + def add_tensor_info(self, name: str, tensor: LazyTensor) -> None: n_elements = int(np.prod(tensor.shape)) raw_dtype = getattr(tensor.data_type, 'ggml_type', None) @@ -861,7 +877,7 @@ class OutputFile: self.gguf.close() @staticmethod - def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab) -> None: + def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out) @@ -869,6 +885,8 @@ class OutputFile: # meta data of.add_meta_arch(params) of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) + of.write_meta() of.close() @@ -887,7 +905,7 @@ class OutputFile: return dt.quantize(arr) @staticmethod - def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, concurrency: int = DEFAULT_CONCURRENCY) -> None: + def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out) @@ -895,6 +913,7 @@ class OutputFile: # meta data of.add_meta_arch(params) of.add_meta_vocab(vocab) + of.add_meta_special_vocab(svocab) # tensor info for name, lazy_tensor in model.items(): @@ -906,7 +925,7 @@ class OutputFile: # tensor data ndarrays_inner = bounded_parallel_map(OutputFile.do_item, model.items(), concurrency = concurrency) if ftype == GGMLFileType.MostlyQ8_0: - ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, factory = ProcessPoolExecutor) + ndarrays = bounded_parallel_map(OutputFile.maybe_do_quantize, ndarrays_inner, concurrency = concurrency, max_workers = concurrency, use_processpool_executor = True) else: ndarrays = map(OutputFile.maybe_do_quantize, ndarrays_inner) @@ -939,7 +958,8 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM for (name, tensor) in model.items()} def convert_model_names(model: LazyModel, params: Params) -> LazyModel: - tmap = gguf.get_tensor_name_map(ARCH, params.n_layer) + tmap = gguf.TensorNameMap(ARCH, params.n_layer) + should_skip: Set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) tmp = model @@ -952,8 +972,8 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: #tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in 
model: print(f"Unpacking and permuting layer {i}") - tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head) - tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head) + tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) + tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) del tmp[f"model.layers.{i}.self_attn.W_pack.weight"] else: @@ -961,23 +981,16 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: out: LazyModel = {} for name, lazy_tensor in model.items(): - name_new = name - - if name in tmap: - name_new = tmap[name] - elif name.endswith(".weight") and name[:-7] in tmap: - name_new = tmap[name[:-7]] + ".weight" - elif name.endswith(".bias") and name[:-5] in tmap: - name_new = tmap[name[:-5]] + ".bias" - else: + tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) + if name_new is None: raise Exception(f"Unexpected tensor name: {name}") - if gguf.should_skip_tensor_TMP(ARCH, params.n_layer, name_new): + if tensor_type in should_skip: print(f"skipping tensor {name_new}") continue - else: - print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") - out[name_new] = lazy_tensor + + print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") + out[name_new] = lazy_tensor return out @@ -1117,8 +1130,16 @@ def main(args_in: Optional[List[str]] = None) -> None: if args.dump_single: model_plus = lazy_load_file(args.model) do_dump_model(model_plus) + return - model_plus = load_some_model(args.model) + if not args.vocab_only: + model_plus = load_some_model(args.model) + else: + model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None) + + if args.dump: + do_dump_model(model_plus) + return params = Params.load(model_plus) if params.n_ctx == -1: @@ -1140,33 +1161,34 @@ def main(args_in: Optional[List[str]] = None) -> None: vocab: Vocab if args.vocab_only: - vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) assert args.outfile, "need --outfile if using --vocab-only" + # FIXME: Try to respect vocab_dir somehow? + vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype) + special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe') outfile = args.outfile - OutputFile.write_vocab_only(outfile, params, vocab) + OutputFile.write_vocab_only(outfile, params, vocab, special_vocab) print(f"Wrote {outfile}") + return + + if model_plus.vocab is not None and args.vocab_dir is None: + vocab = model_plus.vocab else: - if args.dump: - do_dump_model(model_plus) - return + vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent + vocab = load_vocab(vocab_dir, args.vocabtype) + # FIXME: Try to respect vocab_dir somehow? 
+ special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe') - if model_plus.vocab is not None and args.vocab_dir is None: - vocab = model_plus.vocab - else: - vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent - vocab = load_vocab(vocab_dir, args.vocabtype) + model = model_plus.model + model = convert_model_names(model, params) + ftype = pick_output_type(model, args.outtype) + model = convert_to_output_type(model, ftype) + outfile = args.outfile or default_outfile(model_plus.paths, ftype) - model = model_plus.model - model = convert_model_names(model, params) - ftype = pick_output_type(model, args.outtype) - model = convert_to_output_type(model, ftype) - outfile = args.outfile or default_outfile(model_plus.paths, ftype) + params.ftype = ftype + print(f"Writing {outfile}, format {ftype}") - params.ftype = ftype - print(f"Writing {outfile}, format {ftype}") - - OutputFile.write_all(outfile, ftype, params, model, vocab, concurrency = args.concurrency) - print(f"Wrote {outfile}") + OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency) + print(f"Wrote {outfile}") if __name__ == '__main__': diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index 838a2c0f8..de3edbc99 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -4,9 +4,13 @@ import sys import struct import tempfile import numpy as np +import json +import os +from pathlib import Path from enum import IntEnum, auto -from typing import Any, IO, List, Optional +from io import BufferedWriter +from typing import Any, BinaryIO, Callable, IO, Dict, List, Optional, Sequence, Tuple, Union # # constants @@ -71,35 +75,35 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world" class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() + LLAMA : int = auto() + FALCON : int = auto() + GPT2 : int = auto() + GPTJ : int = auto() + GPTNEOX: int = auto() + MPT : int = auto() class MODEL_TENSOR(IntEnum): - TOKEN_EMBD = auto() - POS_EMBD = auto() - OUTPUT = auto() - OUTPUT_NORM = auto() - ROPE_FREQS = auto() - ATTN_Q = auto() - ATTN_K = auto() - ATTN_V = auto() - ATTN_QKV = auto() - ATTN_OUT = auto() - ATTN_NORM = auto() - ATTN_NORM_2 = auto() - ATTN_ROT_EMBD = auto() - FFN_GATE = auto() - FFN_DOWN = auto() - FFN_UP = auto() - FFN_NORM = auto() + TOKEN_EMBD : int = auto() + POS_EMBD : int = auto() + OUTPUT : int = auto() + OUTPUT_NORM : int = auto() + ROPE_FREQS : int = auto() + ATTN_Q : int = auto() + ATTN_K : int = auto() + ATTN_V : int = auto() + ATTN_QKV : int = auto() + ATTN_OUT : int = auto() + ATTN_NORM : int = auto() + ATTN_NORM_2 : int = auto() + ATTN_ROT_EMBD: int = auto() + FFN_GATE : int = auto() + FFN_DOWN : int = auto() + FFN_UP : int = auto() + FFN_NORM : int = auto() -MODEL_ARCH_NAMES = { +MODEL_ARCH_NAMES: Dict[MODEL_ARCH, str] = { MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.GPT2: "gpt2", @@ -108,7 +112,7 @@ MODEL_ARCH_NAMES = { MODEL_ARCH.MPT: "mpt", } -MODEL_TENSOR_NAMES = { +MODEL_TENSOR_NAMES: Dict[MODEL_ARCH, Dict[MODEL_TENSOR, str]] = { MODEL_ARCH.LLAMA: { MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.OUTPUT_NORM: "output_norm", @@ -154,7 +158,7 @@ MODEL_TENSOR_NAMES = { } # tensors that will not be serialized -MODEL_TENSOR_SKIP = { +MODEL_TENSOR_SKIP: Dict[MODEL_ARCH, List[MODEL_TENSOR]] = { MODEL_ARCH.LLAMA: [ MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, @@ -162,167 +166,198 @@ MODEL_TENSOR_SKIP = { } -# 
TODO: the following helper functions should be removed -# instead, get_tensor_name_map should return tuples of (name, MODEL_TENSOR) -# however, my Python is very bad, and I couldn't figure out how to do this, hence these functions -# REMOVE -def should_skip_tensor_TMP(arch: MODEL_ARCH, n_blocks: int, name: str) -> bool: - for skip in MODEL_TENSOR_SKIP.get(arch, []): - for i in range(n_blocks): - if name == MODEL_TENSOR_NAMES[arch][skip].format(bid=i): - return True +class TensorNameMap: + mappings_cfg: Dict[MODEL_TENSOR, Tuple[str, ...]] = { + # Token embeddings + MODEL_TENSOR.TOKEN_EMBD: ( + "gpt_neox.embed_in", # gptneox + "transformer.wte", # gpt2 mpt + "transformer.word_embeddings", # falcon + "model.embed_tokens", # llama-hf + "tok_embeddings", # llama-pth + ), - return False + # Position embeddings + MODEL_TENSOR.POS_EMBD: ( + "transformer.wpe", # gpt2 + ), + # Output + MODEL_TENSOR.OUTPUT: ( + "embed_out", # gptneox + "lm_head", # gpt2 mpt falcon llama-hf + "output", # llama-pth + ), -def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> dict: - tensor_map = {} + # Output norm + MODEL_TENSOR.OUTPUT_NORM: ( + "gpt_neox.final_layer_norm", # gptneox + "transformer.ln_f", # gpt2 falcon + "model.norm", # llama-hf + "norm", # llama-pth + ), - # Token embeddings - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.TOKEN_EMBD, None) + # Rope frequencies + MODEL_TENSOR.ROPE_FREQS: ( + "rope.freqs", # llama-pth + ), + } - tensor_map["gpt_neox.embed_in"] = mapped_to # gptneox - tensor_map["transformer.wte"] = mapped_to # gpt2 mpt - tensor_map["transformer.word_embeddings"] = mapped_to # falcon - tensor_map["model.embed_tokens"] = mapped_to # llama-hf - tensor_map["tok_embeddings"] = mapped_to # llama-pth - - # Position embeddings - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.POS_EMBD, None) - - tensor_map["transformer.wpe"] = mapped_to # gpt2 - - # Output - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT, None) - - tensor_map["embed_out"] = mapped_to # gptneox - tensor_map["lm_head"] = mapped_to # gpt2 mpt falcon llama-hf - tensor_map["output"] = mapped_to # llama-pth - - # Output norm - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.OUTPUT_NORM, None) - - tensor_map["gpt_neox.final_layer_norm"] = mapped_to # gptneox - tensor_map["transformer.ln_f"] = mapped_to # gpt2 falcon - tensor_map["transformer.norm_f"] = mapped_to # mpt - tensor_map["model.norm"] = mapped_to # llama-hf - tensor_map["norm"] = mapped_to # llama-pth - - # Rope frequencies - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ROPE_FREQS, None) - - tensor_map["rope.freqs"] = mapped_to # llama-pth - - # Attention and feed-forward blocks - for i in range(0, n_blocks): + block_mappings_cfg: Dict[MODEL_TENSOR, Tuple[str, ...]] = { # Attention norm - # TODO: is there are simpler way to write these 2 lines in Python? 
- mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM, None) - mapped_to = mapped_to.format(bid=i) if mapped_to else None - - tensor_map["gpt_neox.layers."+str(i)+".input_layernorm"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".ln_1"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".norm_1"] = mapped_to # mpt - tensor_map["transformer.h."+str(i)+".input_layernorm"] = mapped_to # falcon7b - tensor_map["transformer.h."+str(i)+".ln_mlp"] = mapped_to # falcon40b - tensor_map["model.layers."+str(i)+".input_layernorm"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".attention_norm"] = mapped_to # llama-pth + MODEL_TENSOR.ATTN_NORM: ( + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 + "transformer.blocks.{bid}.norm_1", # mpt + "transformer.h.{bid}.input_layernorm", # falcon7b + "transformer.h.{bid}.ln_mlp", # falcon40b + "model.layers.{bid}.input_layernorm", # llama-hf + "layers.{bid}.attention_norm", # llama-pth + ), # Attention norm 2 - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_NORM_2, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["transformer.h."+str(i)+".ln_attn"] = mapped_to # falcon40b + MODEL_TENSOR.ATTN_NORM_2: ( + "transformer.h.{bid}.ln_attn", # falcon40b + ), # Attention query-key-value - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_QKV, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["gpt_neox.layers."+str(i)+".attention.query_key_value"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".attn.c_attn"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".attn.Wqkv"] = mapped_to # mpt - tensor_map["transformer.h."+str(i)+".self_attention.query_key_value"] = mapped_to # falcon + MODEL_TENSOR.ATTN_QKV: ( + "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox + "transformer.h.{bid}.attn.c_attn", # gpt2 + "transformer.blocks.{bid}.attn.Wqkv", # mpt + "transformer.h.{bid}.self_attention.query_key_value", # falcon + ), # Attention query - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_Q, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["model.layers."+str(i)+".self_attn.q_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".attention.wq"] = mapped_to # llama-pth + MODEL_TENSOR.ATTN_Q: ( + "model.layers.{bid}.self_attn.q_proj", # llama-hf + "layers.{bid}.attention.wq", # llama-pth + ), # Attention key - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_K, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["model.layers."+str(i)+".self_attn.k_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".attention.wk"] = mapped_to # llama-pth + MODEL_TENSOR.ATTN_K: ( + "model.layers.{bid}.self_attn.k_proj", # llama-hf + "layers.{bid}.attention.wk", # llama-pth + ), # Attention value - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_V, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["model.layers."+str(i)+".self_attn.v_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".attention.wv"] = mapped_to # llama-pth + MODEL_TENSOR.ATTN_V: ( + "model.layers.{bid}.self_attn.v_proj", # llama-hf + "layers.{bid}.attention.wv", # llama-pth + ), # Attention output - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_OUT, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is 
not None else None - - tensor_map["gpt_neox.layers."+str(i)+".attention.dense"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".attn.c_proj"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".attn.out_proj"] = mapped_to # mpt - tensor_map["transformer.h."+str(i)+".self_attention.dense"] = mapped_to # falcon - tensor_map["model.layers."+str(i)+".self_attn.o_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".attention.wo"] = mapped_to # llama-pth + MODEL_TENSOR.ATTN_OUT: ( + "gpt_neox.layers.{bid}.attention.dense", # gptneox + "transformer.h.{bid}.attn.c_proj", # gpt2 + "transformer.blocks.{bid}.attn.out_proj", # mpt + "transformer.h.{bid}.self_attention.dense", # falcon + "model.layers.{bid}.self_attn.o_proj", # llama-hf + "layers.{bid}.attention.wo", # llama-pth + ), # Rotary embeddings - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.ATTN_ROT_EMBD, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["model.layers."+str(i)+".self_attn.rotary_emb.inv_freq"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".attention.inner_attention.rope.freqs"] = mapped_to # llama-pth + MODEL_TENSOR.ATTN_ROT_EMBD: ( + "model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf + "layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth + ), # Feed-forward norm - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_NORM, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["gpt_neox.layers."+str(i)+".post_attention_layernorm"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".ln_2"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".norm_2"] = mapped_to # mpt - tensor_map["model.layers."+str(i)+".post_attention_layernorm"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".ffn_norm"] = mapped_to # llama-pth + MODEL_TENSOR.FFN_NORM: ( + "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox + "transformer.h.{bid}.ln_2", # gpt2 + "transformer.blocks.{bid}.norm_2", # mpt + "model.layers.{bid}.post_attention_layernorm", # llama-hf + "layers.{bid}.ffn_norm", # llama-pth + ), # Feed-forward up - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_UP, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".mlp.c_fc"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".ffn.up_proj"] = mapped_to # mpt - tensor_map["transformer.h."+str(i)+".mlp.dense_h_to_4h"] = mapped_to # falcon - tensor_map["model.layers."+str(i)+".mlp.up_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".feed_forward.w3"] = mapped_to # llama-pth + MODEL_TENSOR.FFN_UP: ( + "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox + "transformer.h.{bid}.mlp.c_fc", # gpt2 + "transformer.blocks.{bid}.ffn.up_proj", # mpt + "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon + "model.layers.{bid}.mlp.up_proj", # llama-hf + "layers.{bid}.feed_forward.w3", # llama-pth + ), # Feed-forward gate - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_GATE, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None - - tensor_map["model.layers."+str(i)+".mlp.gate_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".feed_forward.w1"] = mapped_to # llama-pth + MODEL_TENSOR.FFN_GATE: ( + "model.layers.{bid}.mlp.gate_proj", # llama-hf + "layers.{bid}.feed_forward.w1", # llama-pth + ), # 
Feed-forward down - mapped_to = MODEL_TENSOR_NAMES[arch].get(MODEL_TENSOR.FFN_DOWN, None) - mapped_to = mapped_to.format(bid=i) if mapped_to is not None else None + MODEL_TENSOR.FFN_DOWN: ( + "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox + "transformer.h.{bid}.mlp.c_proj", # gpt2 + "transformer.blocks.{bid}.ffn.down_proj", # mpt + "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon + "model.layers.{bid}.mlp.down_proj", # llama-hf + "layers.{bid}.feed_forward.w2", # llama-pth + ), + } - tensor_map["gpt_neox.layers."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # gptneox - tensor_map["transformer.h."+str(i)+".mlp.c_proj"] = mapped_to # gpt2 - tensor_map["transformer.blocks."+str(i)+".ffn.down_proj"] = mapped_to # mpt - tensor_map["transformer.h."+str(i)+".mlp.dense_4h_to_h"] = mapped_to # falcon - tensor_map["model.layers."+str(i)+".mlp.down_proj"] = mapped_to # llama-hf - tensor_map["layers."+str(i)+".feed_forward.w2"] = mapped_to # llama-pth + mapping: Dict[str, Tuple[MODEL_TENSOR, str]] - return tensor_map + tensor_names: Dict[MODEL_TENSOR, str] + def __init__(self, arch: MODEL_ARCH, n_blocks: int): + mapping = self.mapping = {} + tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch] + for tensor, keys in self.mappings_cfg.items(): + tensor_name = tensor_names.get(tensor) + if tensor_name is None: + continue + for key in keys: + mapping[key] = (tensor, tensor_name) + for bid in range(n_blocks): + for tensor, keys in self.block_mappings_cfg.items(): + tensor_name = tensor_names.get(tensor) + if tensor_name is None: + continue + tensor_name = tensor_name.format(bid = bid) + for key in keys: + key = key.format(bid = bid) + mapping[key] = (tensor, tensor_name) + + def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> Optional[Tuple[MODEL_TENSOR, str]]: + result = self.mapping.get(key) + if result is not None: + return result + for suffix in try_suffixes: + if key.endswith(suffix): + result = self.mapping.get(key[:-len(suffix)]) + if result is not None: + return (result[0], result[1] + suffix) + return None + + def get_name(self, key: str, try_suffixes: Sequence[str]) -> Optional[str]: + result = self.get_type_and_name(key, try_suffixes = try_suffixes) + if result is None: + return None + return result[1] + + def get_type(self, key: str, try_suffixes: Sequence[str]) -> Optional[MODEL_TENSOR]: + result = self.get_type_and_name(key, try_suffixes = try_suffixes) + if result is None: + return None + return result[0] + + def __getitem__(self, key: str) -> str: + try: + return self.mapping[key][1] + except KeyError: + raise KeyError(key) + + def __contains__(self, key: str) -> bool: + return key in self.mapping + + def __repr__(self) -> str: + return repr(self.mapping) + +def get_tensor_name_map(arch: MODEL_ARCH, n_blocks: int) -> TensorNameMap: + return TensorNameMap(arch, n_blocks) class TokenType(IntEnum): NORMAL = 1 @@ -388,15 +423,21 @@ class GGUFValueType(IntEnum): class GGUFWriter: - def __init__(self, path: str, arch: str, use_temp_file = True): + fout: BufferedWriter + arch: str + offset_tensor = 0 + data_alignment = GGUF_DEFAULT_ALIGNMENT + kv_data = b"" + kv_data_count = 0 + ti_data = b"" + ti_data_count = 0 + use_temp_file: bool + temp_file: Optional[tempfile.SpooledTemporaryFile[bytes]] = None + tensors: List[Tuple[np.ndarray[Any, Any], int]] + + def __init__(self, path: Union[os.PathLike[str], str], arch: str, use_temp_file = True): self.fout = open(path, "wb") self.arch = arch - self.offset_tensor = 0 - self.data_alignment = GGUF_DEFAULT_ALIGNMENT - self.kv_data 
= b"" - self.kv_data_count = 0 - self.ti_data = b"" - self.ti_data_count = 0 self.add_architecture() self.use_temp_file = use_temp_file self.tensors = [] @@ -470,14 +511,27 @@ class GGUFWriter: self.add_key(key) self.add_val(val, GGUFValueType.STRING) - def add_array(self, key: str, val: list): - if not isinstance(val, list): - raise ValueError("Value must be a list for array type") + def add_array(self, key: str, val: Sequence[Any]): + if not isinstance(val, Sequence): + raise ValueError("Value must be a sequence for array type") self.add_key(key) self.add_val(val, GGUFValueType.ARRAY) - def add_val(self: str, val: Any, vtype: GGUFValueType = None, add_vtype: bool = True): + _simple_value_packing = { + GGUFValueType.UINT8: " 0: + ltype = GGUFValueType.get_type(val[0]) + if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): + raise ValueError("All items in a GGUF array should be of the same type") + self.kv_data += struct.pack(" int: return ((x + n - 1) // n) * n - def add_tensor_info(self, name: str, tensor_shape: np.ndarray, tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None): + def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: Union[np.dtype[np.float16], np.dtype[np.float32]], tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None): assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now" encoded_name = name.encode("utf8") @@ -544,16 +580,18 @@ class GGUFWriter: self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) self.ti_data_count += 1 - def add_tensor(self, name: str, tensor: np.ndarray, raw_shape: Optional[np.ndarray] = None, raw_dtype: Optional[GGMLQuantizationType] = None): - if self.use_temp_file and not hasattr(self, "temp_file"): - self.temp_file = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024) - self.temp_file.seek(0) + def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Optional[Sequence[int]] = None, raw_dtype: Optional[GGMLQuantizationType] = None): + if self.use_temp_file and self.temp_file is None: + fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024) + fp.seek(0) + self.temp_file = fp - self.add_tensor_info(name, raw_shape if raw_shape is not None else tensor.shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype) + shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape + self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype) pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes - if not self.use_temp_file: + if self.temp_file is None: self.tensors.append((tensor, pad)) return @@ -562,25 +600,22 @@ class GGUFWriter: if pad != 0: self.temp_file.write(bytes([0] * pad)) - def write_tensor_data(self, tensor: np.ndarray): - pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell() + def write_padding(self, fp: BinaryIO, n: int, align: Optional[int] = None): + pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n if pad != 0: - self.fout.write(bytes([0] * pad)) + fp.write(bytes([0] * pad)) + def write_tensor_data(self, tensor: np.ndarray[Any, Any]): + self.write_padding(self.fout, self.fout.tell()) tensor.tofile(self.fout) - - pad = GGUFWriter.ggml_pad(tensor.nbytes, self.data_alignment) - tensor.nbytes - if pad != 0: - self.fout.write(bytes([0] * pad)) + self.write_padding(self.fout, tensor.nbytes) 
def write_tensors_to_file(self): self.write_ti_data_to_file() - pad = GGUFWriter.ggml_pad(self.fout.tell(), self.data_alignment) - self.fout.tell() - if pad != 0: - self.fout.write(bytes([0] * pad)) + self.write_padding(self.fout, self.fout.tell()) - if not self.use_temp_file: + if self.temp_file is None: for (currtensor, currpad) in self.tensors: currtensor.tofile(self.fout) if currpad != 0: @@ -654,10 +689,6 @@ class GGUFWriter: self.add_bool( KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use) - def add_tensor_data_layout(self, layout: str): - self.add_string( - KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) - def add_head_count(self, count: int): self.add_uint32( KEY_ATTENTION_HEAD_COUNT.format(arch=self.arch), count) @@ -695,16 +726,16 @@ class GGUFWriter: def add_tokenizer_model(self, model: str): self.add_string(KEY_TOKENIZER_MODEL, model) - def add_token_list(self, tokens: List): + def add_token_list(self, tokens: Union[Sequence[str], Sequence[bytes], Sequence[bytearray]]): self.add_array(KEY_TOKENIZER_LIST, tokens) - def add_token_merges(self, merges: List): + def add_token_merges(self, merges: Union[Sequence[str], Sequence[bytes], Sequence[bytearray]]): self.add_array(KEY_TOKENIZER_MERGES, merges) - def add_token_types(self, types: List[int]): + def add_token_types(self, types: Union[Sequence[TokenType], Sequence[int]]): self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types) - def add_token_scores(self, scores: List[float]): + def add_token_scores(self, scores: Sequence[float]): self.add_array(KEY_TOKENIZER_SCORES, scores) def add_bos_token_id(self, id: int): @@ -723,6 +754,84 @@ class GGUFWriter: self.add_uint32(KEY_TOKENIZER_PAD_ID, id) +class SpecialVocab: + load_merges: bool = False + merges: List[str] = [] + special_token_types: Tuple[str, ...] 
= tuple(('bos', 'eos', 'unk', 'sep', 'pad')) + special_token_ids: Dict[str, int] = {} + + def __init__(self, path: Path, load_merges: bool = False, special_token_types: Optional[Tuple[str, ...]] = None): + self.special_token_ids = {} + self.load_merges = load_merges + if special_token_types is not None: + self.special_token_types = special_token_types + self.load(path) + + def load(self, path: Path): + if not self.try_load_from_tokenizer_json(path): + self.try_load_from_config_json(path) + + def try_load_from_tokenizer_json(self, path: Path) -> bool: + tokenizer_file = path / 'tokenizer.json' + if not tokenizer_file.is_file(): + return False + with open(tokenizer_file, 'r', encoding = 'utf-8') as f: + tokenizer = json.load(f) + if self.load_merges: + merges = tokenizer.get('model', {}).get('merges') + if isinstance(merges, list) and len(merges) > 0 and isinstance(merges[0], str): + self.merges = merges + tokenizer_config_file = path / 'tokenizer_config.json' + added_tokens = tokenizer.get('added_tokens') + if added_tokens is None or not tokenizer_config_file.is_file(): + return True + with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f: + tokenizer_config = json.load(f) + for typ in self.special_token_types: + entry = tokenizer_config.get(f'{typ}_token') + if isinstance(entry, str): + tc_content = entry + elif isinstance(entry, dict): + entry_content = entry.get('content') + if not isinstance(entry_content, str): + continue + tc_content = entry_content + else: + continue + for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content): + if isinstance(maybe_token_id, int): + self.special_token_ids[typ] = maybe_token_id + break + return True + + def try_load_from_config_json(self, path: Path) -> bool: + config_file = path / 'config.json' + if not config_file.is_file(): + return False + with open(config_file, 'r', encoding = 'utf-8') as f: + config = json.load(f) + for typ in self.special_token_types: + maybe_token_id = config.get(f'{typ}_token_id') + if isinstance(maybe_token_id, int): + self.special_token_ids[typ] = maybe_token_id + return True + + def add_to_gguf(self, gw: GGUFWriter): + if len(self.merges) > 0: + print(f'gguf: Adding {len(self.merges)} merge(s).') + gw.add_token_merges(self.merges) + for typ, tokid in self.special_token_ids.items(): + handler: Optional[Callable[[int], None]] = getattr(gw, f'add_{typ}_token_id', None) + if handler is None: + print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping') + continue + print(f'gguf: Setting special token type {typ} to {tokid}') + handler(tokid) + + def __repr__(self): + return f'' + + # Example usage: if __name__ == "__main__": # Example usage with a file diff --git a/gguf-py/gguf/py.typed b/gguf-py/gguf/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index cc70e28b7..c66b069f9 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -5,6 +5,7 @@ description = "Write ML models in GGUF for GGML" authors = ["GGML "] packages = [ {include = "gguf"}, + {include = "gguf/py.typed"}, ] readme = "README.md" homepage = "https://ggml.ai" From 35092fb54712d032860f3976a6fc1ae1f84a4a28 Mon Sep 17 00:00:00 2001 From: Gilad S Date: Wed, 30 Aug 2023 11:40:12 +0300 Subject: [PATCH 02/27] docs : add `node-llama-cpp` to `README.md` (#2885) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a880fd29f..d727b0554 100644 --- 
a/README.md +++ b/README.md @@ -107,7 +107,7 @@ as the main playground for developing new features for the [ggml](https://github - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) -- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node) +- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp), [hlhr202/llama-node](https://github.com/hlhr202/llama-node) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) - Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) From 950929442070874d45561d2a4b68b010457767de Mon Sep 17 00:00:00 2001 From: alonfaraj Date: Wed, 30 Aug 2023 12:42:51 +0300 Subject: [PATCH 03/27] make : add test and update CI (#2897) * build ci: run make test * makefile: - add all - add test * enable tests/test-tokenizer-0-llama * fix path to model * remove gcc-8 from macos build test * Update Makefile * Update Makefile --- .github/workflows/build.yml | 12 ++++++++++++ Makefile | 17 +++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 10320ad1f..20fd8c2b5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -41,6 +41,12 @@ jobs: run: | CC=gcc-8 make + - name: Test + id: make_test + run: | + CC=gcc-8 make tests + make test + ubuntu-latest-cmake: runs-on: ubuntu-latest @@ -157,6 +163,12 @@ jobs: run: | make + - name: Test + id: make_test + run: | + make tests + make test + macOS-latest-cmake: runs-on: macos-latest diff --git a/Makefile b/Makefile index c8b8a92d7..bd2d92869 100644 --- a/Makefile +++ b/Makefile @@ -6,6 +6,23 @@ TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-dou default: $(BUILD_TARGETS) +test: + @echo "Running tests..." + @for test_target in $(TEST_TARGETS); do \ + if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \ + ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \ + elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \ + continue; \ + elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \ + continue; \ + else \ + ./$$test_target; \ + fi; \ + done + @echo "All tests have been run." + +all: $(BUILD_TARGETS) $(TEST_TARGETS) + ifndef UNAME_S UNAME_S := $(shell uname -s) endif From 0d1c706181cd31e7f368dd14eeb16c1a2569e4df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Wed, 30 Aug 2023 12:47:40 +0300 Subject: [PATCH 04/27] gguf : add workflow for Pypi publishing (#2896) * gguf : add workflow for Pypi publishing * gguf : add workflow for Pypi publishing * fix trailing whitespace --- .github/workflows/gguf-publish.yml | 43 ++++++++++++++++++++++++++++++ gguf-py/README.md | 23 +++++++++++++--- 2 files changed, 63 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/gguf-publish.yml diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml new file mode 100644 index 000000000..a6289e335 --- /dev/null +++ b/.github/workflows/gguf-publish.yml @@ -0,0 +1,43 @@ +# This workflow will upload a Python Package using Twine when a GGUF release is created +# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries + +# See `gguf-py/README.md` for how to make a release. 
+ +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + workflow_dispatch: + push: + # Pattern matched against refs/tags + tags: + - 'gguf-v*' # Push events to every version tag + + +jobs: + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.9.x' + - name: Install dependencies + run: | + cd gguf-py + python -m pip install poetry + poetry install + + - name: Build package + run: poetry build + - name: Publish package + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/gguf-py/README.md b/gguf-py/README.md index 03ad306ec..ffe25c495 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -27,8 +27,25 @@ In this case, upgrade Pip to the latest: pip install --upgrade pip ``` -## Publishing -To publish the package, you need to have `twine` and `build` installed: +## Automatic publishing with CI + +There's a GitHub workflow to make a release automatically upon creation of tags in a specified format. + +1. Bump the version in `pyproject.toml`. +2. Create a tag named `gguf-vx.x.x` where `x.x.x` is the semantic version number. + +```sh +git tag -a gguf-v1.0.0 -m "Version 1.0 release" +``` + +3. Push the tags. + +```sh +git push origin --tags +``` + +## Manual publishing +If you want to publish the package manually for any reason, you need to have `twine` and `build` installed: ```sh pip install build twine @@ -36,7 +53,7 @@ pip install build twine Then, folow these steps to release a new version: -1. Update the version in `pyproject.toml`. +1. Bump the version in `pyproject.toml`. 2. 
Build the package: ```sh From c90d135eb433cf0d40fb95e46a48d1391d2352b5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 30 Aug 2023 12:52:46 +0300 Subject: [PATCH 05/27] examples : fix underscore in beam-search + .gitignore (close #2900) --- .gitignore | 3 +++ Makefile | 4 ++-- examples/CMakeLists.txt | 2 +- examples/{beam_search => beam-search}/CMakeLists.txt | 4 ++-- .../beam_search.cpp => beam-search/beam-search.cpp} | 0 5 files changed, 8 insertions(+), 5 deletions(-) rename examples/{beam_search => beam-search}/CMakeLists.txt (78%) rename examples/{beam_search/beam_search.cpp => beam-search/beam-search.cpp} (100%) diff --git a/.gitignore b/.gitignore index 54ea2b522..8b5f45a2d 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,9 @@ models-mnt /gguf-llama-simple /libllama.so /llama-bench +/baby-llama +/beam-search +/save-load-state build-info.h arm_neon.h compile_commands.json diff --git a/Makefile b/Makefile index bd2d92869..b750540fe 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam_search tests/test-c.o +BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search tests/test-c.o # Binaries only useful for tests TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1 @@ -446,7 +446,7 @@ llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o co baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -beam_search: examples/beam_search/beam_search.cpp build-info.h ggml.o llama.o common.o $(OBJS) +beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ifneq '' '$(or $(filter clean,$(MAKECMDGOALS)),$(LLAMA_METAL))' diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 94b785224..6e65eb087 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -25,7 +25,7 @@ else() add_subdirectory(simple) add_subdirectory(embd-input) add_subdirectory(llama-bench) - add_subdirectory(beam_search) + add_subdirectory(beam-search) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/beam_search/CMakeLists.txt b/examples/beam-search/CMakeLists.txt similarity index 78% rename from examples/beam_search/CMakeLists.txt rename to examples/beam-search/CMakeLists.txt index b29e01092..e44a74975 100644 --- a/examples/beam_search/CMakeLists.txt +++ b/examples/beam-search/CMakeLists.txt @@ -1,5 +1,5 @@ -set(TARGET beam_search) -add_executable(${TARGET} beam_search.cpp) +set(TARGET beam-search) +add_executable(${TARGET} beam-search.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/beam_search/beam_search.cpp b/examples/beam-search/beam-search.cpp similarity index 100% rename from 
examples/beam_search/beam_search.cpp rename to examples/beam-search/beam-search.cpp From b532a69b2fd08067f34f32f37a2fd9b37678a34a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 30 Aug 2023 13:29:40 +0300 Subject: [PATCH 06/27] convert.py : use dir name to name the llama --- convert.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/convert.py b/convert.py index 448b6f0f3..a7f4c2d75 100755 --- a/convert.py +++ b/convert.py @@ -811,10 +811,12 @@ class OutputFile: def add_meta_arch(self, params: Params) -> None: name = "LLaMA" + + # TODO: better logic to determine model name if (params.n_ctx == 4096): name = "LLaMA v2" - if params.path_model: - name = str(params.path_model.parent).split('/')[-1] + elif params.path_model: + name = str(params.path_model.parent).split('/')[-1] self.gguf.add_name (name) self.gguf.add_context_length (params.n_ctx) @@ -839,8 +841,7 @@ class OutputFile: tokens = [] scores = [] toktypes = [] - # NOTE: `all_tokens` returns the the base vocabulary and added tokens - # TODO: add special tokens? + # NOTE: `all_tokens` returns the base vocabulary and added tokens for text, score, toktype in vocab.all_tokens(): tokens.append(text) scores.append(score) From 71d6975559acfd6c8407a4ef8275a9979c737765 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Wed, 30 Aug 2023 19:14:53 +0300 Subject: [PATCH 07/27] [Docker] fix tools.sh argument passing. (#2884) * [Docker] fix tools.sh argument passing. This should allow passing multiple arguments to containers with the full image that are using the tools.sh frontend. Fix from https://github.com/ggerganov/llama.cpp/issues/2535#issuecomment-1697091734 --- .devops/tools.sh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/.devops/tools.sh b/.devops/tools.sh index 2787c21fe..9d999315f 100755 --- a/.devops/tools.sh +++ b/.devops/tools.sh @@ -7,15 +7,12 @@ arg1="$1" # Shift the arguments to remove the first one shift -# Join the remaining arguments into a single string -arg2="$@" - if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then - python3 ./convert.py "$arg2" + python3 ./convert.py "$@" elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then - ./quantize "$arg2" + ./quantize "$@" elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then - ./main "$arg2" + ./main "$@" elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then echo "Converting PTH to GGML..." 
for i in `ls $1/$2/ggml-model-f16.bin*`; do @@ -27,7 +24,7 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then fi done elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then - ./server "$arg2" + ./server "$@" else echo "Unknown command: $arg1" echo "Available commands: " From 8afe2280009ecbfc9de2c93b8f41283dc810609a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 30 Aug 2023 21:46:19 +0200 Subject: [PATCH 08/27] CUDA: mul_mat_q=true llama_context_params default (#2912) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index fcd6f276a..95ee6ffe4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5287,7 +5287,7 @@ struct llama_context_params llama_context_default_params() { /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, - /*.mul_mat_q =*/ false, + /*.mul_mat_q =*/ true, /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.vocab_only =*/ false, From 92d0b751a77a089e650983e9f1564ef4d31b32b9 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 31 Aug 2023 01:02:23 -0400 Subject: [PATCH 09/27] convert : fix python 3.8 support, modernize type annotations (#2916) * convert : fix python 3.8 support * convert : sort imports * convert : fix required parameters in convert-llama-ggmlv3-to-gguf * convert : fix mypy errors in convert-llama-ggmlv3-to-gguf * convert : use PEP 585 generics and PEP 604 unions Now that we have `from __future__ import annotations`, we can use this modern syntax in Python 3.7 instead of restricting support to Python 3.9 or 3.10 respectively. * gguf.py : a tuple is already a tuple * add mypy.ini * convert : add necessary `type: ignore` comments * gguf-py: bump version --- convert-falcon-hf-to-gguf.py | 25 +++--- convert-gptneox-hf-to-gguf.py | 22 ++--- convert-llama-7b-pth-to-gguf.py | 31 ++++--- convert-llama-ggmlv3-to-gguf.py | 18 ++-- convert-llama-hf-to-gguf.py | 33 +++---- convert-lora-to-ggml.py | 8 +- convert.py | 149 ++++++++++++++++---------------- gguf-py/gguf/gguf.py | 68 ++++++++------- gguf-py/pyproject.toml | 2 +- mypy.ini | 5 ++ 10 files changed, 193 insertions(+), 168 deletions(-) create mode 100644 mypy.ini diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index 0fdea70e1..ec786ff67 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -1,18 +1,21 @@ #!/usr/bin/env python3 # HF falcon--> gguf conversion -import gguf -import os -import sys -import struct +from __future__ import annotations + +import argparse import json +import os +import struct +import sys +from pathlib import Path +from typing import Any + +import gguf import numpy as np import torch -import argparse +from transformers import AutoTokenizer # type: ignore[import] -from typing import Any, List -from pathlib import Path -from transformers import AutoTokenizer def bytes_to_unicode(): # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -114,9 +117,9 @@ gguf_writer.add_file_type(ftype) print("gguf: get tokenizer metadata") -tokens: List[bytearray] = [] -scores: List[float] = [] -toktypes: List[int] = [] +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] tokenizer_json_file = dir_model / 'tokenizer.json' if not tokenizer_json_file.is_file(): diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py index 38e71e03b..852123d99 100755 --- a/convert-gptneox-hf-to-gguf.py +++ b/convert-gptneox-hf-to-gguf.py @@ -1,18 +1,20 @@ #!/usr/bin/env python3 # HF 
gptneox--> gguf conversion -import gguf -import os -import sys -import struct +from __future__ import annotations + +import argparse import json +import os +import struct +import sys +from pathlib import Path +from typing import Any + +import gguf import numpy as np import torch -import argparse - -from typing import Any, List -from pathlib import Path -from transformers import AutoTokenizer +from transformers import AutoTokenizer # type: ignore[import] # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py @@ -112,7 +114,7 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_eps"]) print("gguf: get tokenizer metadata") -tokens: List[bytearray] = [] +tokens: list[bytearray] = [] tokenizer_json_file = dir_model / 'tokenizer.json' if not tokenizer_json_file.is_file(): diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py index 6e973a116..6574c11dd 100755 --- a/convert-llama-7b-pth-to-gguf.py +++ b/convert-llama-7b-pth-to-gguf.py @@ -3,22 +3,25 @@ # Only models with a single datafile are supported, like 7B # HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model -import gguf -import os -import sys -import struct +from __future__ import annotations + +import argparse import json +import os +import struct +import sys +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import gguf import numpy as np import torch -import argparse +from sentencepiece import SentencePieceProcessor # type: ignore[import] -from typing import Any, List, TypeAlias -from pathlib import Path -from sentencepiece import SentencePieceProcessor +if TYPE_CHECKING: + from typing import TypeAlias -#NDArray = np.ndarray[Any, Any] -# compatible with python < 3.9 -NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]' +NDArray: TypeAlias = 'np.ndarray[Any, Any]' def count_model_parts(dir_model: Path) -> int: @@ -129,9 +132,9 @@ if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in print("gguf: get tokenizer metadata") -tokens: List[bytes] = [] -scores: List[float] = [] -toktypes: List[int] = [] +tokens: list[bytes] = [] +scores: list[float] = [] +toktypes: list[int] = [] tokenizer_model_file = dir_model / 'tokenizer.model' if not tokenizer_model_file.is_file(): diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py index c8e7f1761..3f39bc39e 100755 --- a/convert-llama-ggmlv3-to-gguf.py +++ b/convert-llama-ggmlv3-to-gguf.py @@ -1,10 +1,14 @@ #!/usr/bin/env python3 -import sys, struct, math, argparse +from __future__ import annotations + +import argparse +import math +import struct +import sys from pathlib import Path -import numpy as np - import gguf +import numpy as np # Note: Does not support GGML_QKK_64 QK_K = 256 @@ -72,7 +76,7 @@ class Vocab: class Tensor: def __init__(self): self.name = None - self.dims = () + self.dims: tuple[int, ...] 
= () self.dtype = None self.start_offset = 0 self.len_bytes = np.int64(0) @@ -119,7 +123,7 @@ class GGMLV3Model: offset += hp.load(data, offset) vocab = Vocab() offset += vocab.load(data, offset, hp.n_vocab) - tensors = [] + tensors: list[Tensor] = [] tensor_map = {} while offset < len(data): tensor = Tensor() @@ -305,8 +309,8 @@ def handle_metadata(cfg, hp): def handle_args(): parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF') - parser.add_argument('--input', '-i', type = Path, help = 'Input GGMLv3 filename') - parser.add_argument('--output', '-o', type = Path, help ='Output GGUF filename') + parser.add_argument('--input', '-i', type = Path, required = True, help = 'Input GGMLv3 filename') + parser.add_argument('--output', '-o', type = Path, required = True, help ='Output GGUF filename') parser.add_argument('--name', help = 'Set model name') parser.add_argument('--desc', help = 'Set model description') parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)') diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py index ab94b5eab..c453c83c3 100755 --- a/convert-llama-hf-to-gguf.py +++ b/convert-llama-hf-to-gguf.py @@ -1,28 +1,31 @@ #!/usr/bin/env python3 # HF llama --> gguf conversion -import gguf -import os -import sys -import struct +from __future__ import annotations + +import argparse import json +import os +import struct +import sys +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import gguf import numpy as np import torch -import argparse +from sentencepiece import SentencePieceProcessor # type: ignore[import] -from typing import Any, List, Optional, TypeAlias -from pathlib import Path -from sentencepiece import SentencePieceProcessor +if TYPE_CHECKING: + from typing import TypeAlias -#NDArray = np.ndarray[Any, Any] -# compatible with python < 3.9 -NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]' +NDArray: TypeAlias = 'np.ndarray[Any, Any]' # reverse HF permute back to original pth layout # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py -def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: Optional[int] = None) -> NDArray: +def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: if n_kv_head is not None and n_head != n_kv_head: n_head //= n_kv_head @@ -136,9 +139,9 @@ if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in print("gguf: get tokenizer metadata") -tokens: List[bytes] = [] -scores: List[float] = [] -toktypes: List[int] = [] +tokens: list[bytes] = [] +scores: list[float] = [] +toktypes: list[int] = [] tokenizer_model_file = dir_model / 'tokenizer.model' if not tokenizer_model_file.is_file(): diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index a00339b47..a937410dd 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -1,15 +1,17 @@ #!/usr/bin/env python3 +from __future__ import annotations + import json import os import re import struct import sys -from typing import Any, Dict, Sequence, BinaryIO +from typing import Any, BinaryIO, Sequence import numpy as np import torch -NUMPY_TYPE_TO_FTYPE: Dict[str, int] = {"float32": 0, "float16": 1} +NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} HF_SUBLAYER_TO_GGML = { @@ -46,7 +48,7 @@ def translate_tensor_name(t: str) -> str: sys.exit(1) -def write_file_header(fout: BinaryIO, params: Dict[str, Any]) -> None: +def 
write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None: fout.write(b"ggla"[::-1]) # magic (ggml lora) fout.write(struct.pack("i", 1)) # file version fout.write(struct.pack("i", params["r"])) diff --git a/convert.py b/convert.py index a7f4c2d75..9a39edb99 100755 --- a/convert.py +++ b/convert.py @@ -1,9 +1,8 @@ #!/usr/bin/env python3 +from __future__ import annotations -import gguf import argparse import concurrent.futures -from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor import copy import enum import faulthandler @@ -20,21 +19,23 @@ import struct import sys import time import zipfile -import numpy as np - from abc import ABCMeta, abstractmethod +from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import (IO, TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable, List, Literal, Optional, Sequence, Set, Tuple, Type, TypeVar, Union) -from sentencepiece import SentencePieceProcessor # type: ignore +from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar + +import gguf +import numpy as np +from sentencepiece import SentencePieceProcessor # type: ignore[import] if TYPE_CHECKING: - from typing_extensions import TypeAlias + from typing import TypeAlias if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): faulthandler.register(signal.SIGUSR1) -NDArray: 'TypeAlias' = 'np.ndarray[Any, Any]' +NDArray: TypeAlias = 'np.ndarray[Any, Any]' ARCH=gguf.MODEL_ARCH.LLAMA NAMES=gguf.MODEL_TENSOR_NAMES[ARCH] @@ -47,8 +48,8 @@ DEFAULT_CONCURRENCY = 8 @dataclass(frozen=True) class DataType: name: str - dtype: 'np.dtype[Any]' - valid_conversions: List[str] + dtype: np.dtype[Any] + valid_conversions: list[str] def elements_to_bytes(self, n_elements: int) -> int: return n_elements * self.dtype.itemsize @@ -65,7 +66,7 @@ DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_convers @dataclass(frozen=True) class QuantizedDataType(DataType): block_size: int - quantized_dtype: 'np.dtype[Any]' + quantized_dtype: np.dtype[Any] ggml_type: gguf.GGMLQuantizationType def quantize(self, arr: NDArray) -> NDArray: @@ -84,7 +85,7 @@ class Q8_0QuantizedDataType(QuantizedDataType): n_blocks = arr.size // self.block_size blocks = arr.reshape((n_blocks, self.block_size)) # Much faster implementation of block quantization contributed by @Cebtenzzre - def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[Tuple[Any, Any]]: + def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]: d = abs(blocks).max(axis = 1) / np.float32(127) with np.errstate(divide = 'ignore'): qs = (blocks / d[:, None]).round() @@ -98,13 +99,13 @@ DT_Q8_0 = Q8_0QuantizedDataType('Q8_0', quantized_dtype = np.dtype([('d', ' DataType: + def type_for_tensor(self, name: str, tensor: LazyTensor) -> DataType: dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self) if dt is None: raise ValueError(self) # 1D tensors are always F32. 
return dt if len(tensor.shape) > 1 else DT_F32 -GGML_FILE_TYPE_TO_DATA_TYPE: Dict[GGMLFileType, DataType] = { +GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = { GGMLFileType.AllF32 : DT_F32, GGMLFileType.MostlyF16 : DT_F16, GGMLFileType.MostlyQ8_0: DT_Q8_0, @@ -148,13 +149,13 @@ class Params: n_head_kv: int f_norm_eps: float - f_rope_freq_base: Optional[float] = None - f_rope_scale: Optional[float] = None + f_rope_freq_base: float | None = None + f_rope_scale: float | None = None - ftype: Optional[GGMLFileType] = None + ftype: GGMLFileType | None = None # path to the directory containing the model files - path_model: Optional['Path'] = None + path_model: Path | None = None @staticmethod def find_n_mult(n_ff: int, n_embd: int) -> int: @@ -166,7 +167,7 @@ class Params: raise Exception(f"failed to find n_mult for (n_ff={n_ff}, n_embd={n_embd}).") @staticmethod - def guessed(model: 'LazyModel') -> 'Params': + def guessed(model: LazyModel) -> Params: # try transformer naming first n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape @@ -202,7 +203,7 @@ class Params: ) @staticmethod - def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': + def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) n_vocab = config["vocab_size"] @@ -247,7 +248,7 @@ class Params: # LLaMA v2 70B params.json # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1 @staticmethod - def loadOriginalParamsJson(model: 'LazyModel', config_path: 'Path') -> 'Params': + def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: config = json.load(open(config_path)) n_vocab = config["vocab_size"] if "vocab_size" in config else -1 @@ -291,7 +292,7 @@ class Params: ) @staticmethod - def load(model_plus: 'ModelPlus') -> 'Params': + def load(model_plus: ModelPlus) -> Params: hf_config_path = model_plus.paths[0].parent / "config.json" orig_config_path = model_plus.paths[0].parent / "params.json" @@ -314,9 +315,9 @@ class Params: # class BpeVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) - added_tokens: Dict[str, int] + added_tokens: dict[str, int] if fname_added_tokens is not None: added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) else: @@ -335,9 +336,9 @@ class BpeVocab: self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens - def bpe_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: + def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.bpe_tokenizer - from transformers.models.gpt2 import tokenization_gpt2 + from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import] byte_encoder = tokenization_gpt2.bytes_to_unicode() byte_decoder = {v: k for k, v in byte_encoder.items()} for i, item in enumerate(tokenizer): @@ -345,12 +346,12 @@ class BpeVocab: score: float = -i yield text, score, gguf.TokenType.USER_DEFINED - def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: + def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: score = -1000.0 yield 
text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED - def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: yield from self.bpe_tokens() yield from self.added_tokens() @@ -359,9 +360,9 @@ class BpeVocab: class SentencePieceVocab: - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: + def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) - added_tokens: Dict[str, int] + added_tokens: dict[str, int] if fname_added_tokens is not None: added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) else: @@ -380,7 +381,7 @@ class SentencePieceVocab: self.fname_tokenizer = fname_tokenizer self.fname_added_tokens = fname_added_tokens - def sentencepiece_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: + def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.sentencepiece_tokenizer for i in range(tokenizer.vocab_size()): piece = tokenizer.id_to_piece(i) @@ -404,19 +405,19 @@ class SentencePieceVocab: yield text, score, toktype - def added_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: + def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: score = -1000.0 yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED - def all_tokens(self) -> Iterable[Tuple[bytes, float, gguf.TokenType]]: + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: yield from self.sentencepiece_tokens() yield from self.added_tokens() def __repr__(self) -> str: return f"" -Vocab = Union[BpeVocab, SentencePieceVocab] +Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab' # # data loading @@ -436,15 +437,15 @@ class Tensor(metaclass=ABCMeta): data_type: DataType @abstractmethod - def astype(self, data_type: DataType) -> 'Tensor': ... + def astype(self, data_type: DataType) -> Tensor: ... @abstractmethod - def permute(self, n_head: int, n_head_kv: int) -> 'Tensor': ... + def permute(self, n_head: int, n_head_kv: int) -> Tensor: ... @abstractmethod - def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> 'UnquantizedTensor': ... + def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ... @abstractmethod - def part(self, n_part: int) -> 'UnquantizedTensor': ... + def part(self, n_part: int) -> UnquantizedTensor: ... @abstractmethod - def to_ggml(self) -> 'GGMLCompatibleTensor': ... + def to_ggml(self) -> GGMLCompatibleTensor: ... 
def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray: @@ -465,22 +466,22 @@ class UnquantizedTensor(Tensor): self.ndarray = bf16_to_fp32(self.ndarray) return UnquantizedTensor(self.ndarray.astype(dtype)) - def to_ggml(self) -> 'UnquantizedTensor': + def to_ggml(self) -> UnquantizedTensor: return self - def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> 'UnquantizedTensor': + def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: r = self.ndarray.shape[0] // 3 return UnquantizedTensor(permute(self.ndarray[r * n_part : r * n_part + r, ...], n_head, n_head_kv)) - def part(self, n_part: int) -> 'UnquantizedTensor': + def part(self, n_part: int) -> UnquantizedTensor: r = self.ndarray.shape[0] // 3 return UnquantizedTensor(self.ndarray[r * n_part : r * n_part + r, ...]) - def permute(self, n_head: int, n_head_kv: int) -> 'UnquantizedTensor': + def permute(self, n_head: int, n_head_kv: int) -> UnquantizedTensor: return UnquantizedTensor(permute(self.ndarray, n_head, n_head_kv)) -def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, convert: bool = False) -> NDArray: +def load_unquantized(lazy_tensor: LazyTensor, expected_dtype: Any = None, convert: bool = False) -> NDArray: tensor = lazy_tensor.load() assert isinstance(tensor, UnquantizedTensor) @@ -496,13 +497,13 @@ def load_unquantized(lazy_tensor: 'LazyTensor', expected_dtype: Any = None, conv return tensor.ndarray -GGMLCompatibleTensor = Union[UnquantizedTensor] +GGMLCompatibleTensor = UnquantizedTensor @dataclass class LazyTensor: _load: Callable[[], Tensor] - shape: List[int] + shape: list[int] data_type: DataType description: str @@ -513,7 +514,7 @@ class LazyTensor: (self.data_type, ret.data_type, self.description) return ret - def astype(self, data_type: DataType) -> 'LazyTensor': + def astype(self, data_type: DataType) -> LazyTensor: self.validate_conversion_to(data_type) def load() -> Tensor: @@ -525,24 +526,24 @@ class LazyTensor: raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.') -LazyModel = Dict[str, LazyTensor] +LazyModel = dict[str, LazyTensor] @dataclass class ModelPlus: model: LazyModel - paths: List[Path] # Where this was read from. + paths: list[Path] # Where this was read from. format: Literal['ggml', 'torch', 'safetensors', 'none'] - vocab: Optional[Vocab] # For GGML models (which have vocab built in), the vocab. + vocab: Vocab | None # For GGML models (which have vocab built in), the vocab. -def merge_sharded(models: List[LazyModel]) -> LazyModel: +def merge_sharded(models: list[LazyModel]) -> LazyModel: # Original LLaMA models have each file contain one part of each tensor. # Use a dict instead of a set to preserve order. names = {name: None for model in models for name in model} def convert(name: str) -> LazyTensor: - lazy_tensors: List[LazyTensor] = [model[name] for model in models] + lazy_tensors: list[LazyTensor] = [model[name] for model in models] if len(lazy_tensors) == 1: # only one file; don't go through this procedure since there might # be quantized tensors @@ -570,7 +571,7 @@ def merge_sharded(models: List[LazyModel]) -> LazyModel: return {name: convert(name) for name in names} -def merge_multifile_models(models_plus: List[ModelPlus]) -> ModelPlus: +def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus: formats = set(mp.format for mp in models_plus) assert len(formats) == 1, "different formats?" 
format = formats.pop() @@ -674,7 +675,7 @@ class LazyUnpickler(pickle.Unpickler): def rebuild_from_type_v2(func, new_type, args, state): return func(*args) - CLASSES: Dict[Tuple[str, str], Any] = { + CLASSES: dict[tuple[str, str], Any] = { # getattr used here as a workaround for mypy not being smart enough to detrmine # the staticmethods have a __func__ attribute. ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), @@ -707,15 +708,15 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus: def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: header_size, = struct.unpack(' LazyTensor: + def convert(info: dict[str, Any]) -> LazyTensor: data_type = SAFETENSORS_DATA_TYPES[info['dtype']] numpy_dtype = data_type.dtype - shape: List[int] = info['shape'] + shape: list[int] = info['shape'] begin, end = info['data_offsets'] assert 0 <= begin <= end <= len(byte_buf) assert end - begin == math.prod(shape) * numpy_dtype.itemsize @@ -754,7 +755,7 @@ def lazy_load_file(path: Path) -> ModelPlus: In = TypeVar('In') Out = TypeVar('Out') -def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: Optional[int] = None, use_processpool_executor: bool = False) -> Iterable[Out]: +def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]: '''Parallel map, but with backpressure. If the caller doesn't call `next` fast enough, this will stop calling `func` at some point rather than letting results pile up in memory. Specifically, there is a max of one @@ -763,13 +764,13 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc yield from map(func, iterable) # Not reached. 
iterable = iter(iterable) - executor_class: Union[Type[ThreadPoolExecutor], Type[ProcessPoolExecutor]] + executor_class: type[ThreadPoolExecutor] | type[ProcessPoolExecutor] if use_processpool_executor: executor_class = ProcessPoolExecutor else: executor_class = ThreadPoolExecutor with executor_class(max_workers = max_workers) as executor: - futures: List[concurrent.futures.Future[Out]] = [] + futures: list[concurrent.futures.Future[Out]] = [] done = False for _ in range(concurrency): try: @@ -893,13 +894,13 @@ class OutputFile: of.close() @staticmethod - def do_item(item: Tuple[str, LazyTensor]) -> Tuple[DataType, NDArray]: + def do_item(item: tuple[str, LazyTensor]) -> tuple[DataType, NDArray]: name, lazy_tensor = item tensor = lazy_tensor.load().to_ggml() return (lazy_tensor.data_type, tensor.ndarray) @staticmethod - def maybe_do_quantize(item: Tuple[DataType, NDArray]) -> NDArray: + def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray: dt, arr = item if not isinstance(dt, QuantizedDataType): return arr @@ -940,7 +941,7 @@ class OutputFile: of.close() -def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType: +def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): @@ -960,7 +961,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM def convert_model_names(model: LazyModel, params: Params) -> LazyModel: tmap = gguf.TensorNameMap(ARCH, params.n_layer) - should_skip: Set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) + should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) tmp = model @@ -995,12 +996,12 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel: return out -def nth_multifile_path(path: Path, n: int) -> Optional[Path]: +def nth_multifile_path(path: Path, n: int) -> Path | None: '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return the nth path in the model. ''' # Support the following patterns: - patterns: List[Tuple[str, str]] = [ + patterns: list[tuple[str, str]] = [ # - x.00.pth, x.01.pth, etc. (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'), # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. @@ -1016,11 +1017,11 @@ def nth_multifile_path(path: Path, n: int) -> Optional[Path]: return None -def find_multifile_paths(path: Path) -> List[Path]: +def find_multifile_paths(path: Path) -> list[Path]: '''Given any path belonging to a multi-file model (e.g. foo.bin.1), return the whole list of paths in the model. ''' - ret: List[Path] = [] + ret: list[Path] = [] for i in itertools.count(): nth_path = nth_multifile_path(path, i) if nth_path is None: @@ -1051,7 +1052,7 @@ def load_some_model(path: Path) -> ModelPlus: path = files[0] paths = find_multifile_paths(path) - models_plus: List[ModelPlus] = [] + models_plus: list[ModelPlus] = [] for path in paths: print(f"Loading model file {path}") models_plus.append(lazy_load_file(path)) @@ -1060,7 +1061,7 @@ def load_some_model(path: Path) -> ModelPlus: return model_plus -def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, SentencePieceVocab]: +def load_vocab(path: Path, vocabtype: str | None) -> Vocab: # Be extra-friendly and accept either a file or a directory. Also, if it's # a directory, it might be the model directory, and tokenizer.model might # be in the parent of that. 
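A quick illustration of the backpressure guarantee described in the `bounded_parallel_map` docstring above. The sketch below is not from convert.py; `slow_square` and the constants are invented, and it assumes `bounded_parallel_map` as defined in this file. Because a new job is submitted only when the caller asks for the next result, a slow consumer keeps at most about one finished result per worker buffered in memory.

    import time

    def slow_square(x: int) -> int:
        time.sleep(0.1)   # stand-in for expensive per-tensor work
        return x * x

    # Even with a very slow consumer, results do not pile up:
    # at most one buffered output per worker is outstanding at a time.
    for value in bounded_parallel_map(slow_square, range(1000), concurrency=4):
        time.sleep(1.0)   # slow consumer
        print(value)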
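A note on the typing changes in this patch: dropping the quotes around forward references (for example `-> LazyTensor`) and writing unions as `X | None` still works on Python 3.8 because, once `from __future__ import annotations` is in effect (this series adds it to gguf.py), annotations are stored as strings and not evaluated at run time. A module-level alias assignment is executed, though, which is why `Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'` keeps the union inside a string. A minimal self-contained sketch of the pattern, with invented names:

    from __future__ import annotations   # PEP 563: annotations are stored as strings
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # typing.TypeAlias needs Python 3.10+, so import it only for the type checker
        from typing import TypeAlias

    class Node:
        def clone(self) -> Node:          # bare forward reference: not evaluated at run time
            return Node()

    def find(name: str) -> Node | None:   # PEP 604 union in an annotation: also fine on 3.8
        return None

    # A module-level assignment IS executed, so the union stays inside a string;
    # an unquoted `Node | None` here would raise TypeError on Python 3.8.
    Handle: TypeAlias = 'Node | None'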
@@ -1091,7 +1092,7 @@ def load_vocab(path: Path, vocabtype: Optional[str]) -> Union[BpeVocab, Sentence raise ValueError(f"Unsupported vocabulary type {vocabtype}") -def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path: +def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: namestr = { GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", @@ -1114,7 +1115,7 @@ def do_dump_model(model_plus: ModelPlus) -> None: print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") -def main(args_in: Optional[List[str]] = None) -> None: +def main(args_in: list[str] | None = None) -> None: parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index de3edbc99..b1bc4205b 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -1,16 +1,18 @@ #!/usr/bin/env python3 -import shutil -import sys -import struct -import tempfile -import numpy as np +from __future__ import annotations + import json import os -from pathlib import Path - +import shutil +import struct +import sys +import tempfile from enum import IntEnum, auto from io import BufferedWriter -from typing import Any, BinaryIO, Callable, IO, Dict, List, Optional, Sequence, Tuple, Union +from pathlib import Path +from typing import IO, Any, BinaryIO, Callable, Sequence + +import numpy as np # # constants @@ -103,7 +105,7 @@ class MODEL_TENSOR(IntEnum): FFN_NORM : int = auto() -MODEL_ARCH_NAMES: Dict[MODEL_ARCH, str] = { +MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.GPT2: "gpt2", @@ -112,7 +114,7 @@ MODEL_ARCH_NAMES: Dict[MODEL_ARCH, str] = { MODEL_ARCH.MPT: "mpt", } -MODEL_TENSOR_NAMES: Dict[MODEL_ARCH, Dict[MODEL_TENSOR, str]] = { +MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = { MODEL_ARCH.LLAMA: { MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.OUTPUT_NORM: "output_norm", @@ -158,7 +160,7 @@ MODEL_TENSOR_NAMES: Dict[MODEL_ARCH, Dict[MODEL_TENSOR, str]] = { } # tensors that will not be serialized -MODEL_TENSOR_SKIP: Dict[MODEL_ARCH, List[MODEL_TENSOR]] = { +MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_ARCH.LLAMA: [ MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, @@ -167,7 +169,7 @@ MODEL_TENSOR_SKIP: Dict[MODEL_ARCH, List[MODEL_TENSOR]] = { class TensorNameMap: - mappings_cfg: Dict[MODEL_TENSOR, Tuple[str, ...]] = { + mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { # Token embeddings MODEL_TENSOR.TOKEN_EMBD: ( "gpt_neox.embed_in", # gptneox @@ -203,7 +205,7 @@ class TensorNameMap: ), } - block_mappings_cfg: Dict[MODEL_TENSOR, Tuple[str, ...]] = { + block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { # Attention norm MODEL_TENSOR.ATTN_NORM: ( "gpt_neox.layers.{bid}.input_layernorm", # gptneox @@ -298,9 +300,9 @@ class TensorNameMap: ), } - mapping: Dict[str, Tuple[MODEL_TENSOR, str]] + mapping: dict[str, tuple[MODEL_TENSOR, str]] - tensor_names: Dict[MODEL_TENSOR, str] + tensor_names: dict[MODEL_TENSOR, str] def __init__(self, arch: MODEL_ARCH, n_blocks: int): mapping = self.mapping = {} @@ -321,7 +323,7 @@ class TensorNameMap: key = key.format(bid = bid) mapping[key] = (tensor, tensor_name) - def 
get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> Optional[Tuple[MODEL_TENSOR, str]]: + def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None: result = self.mapping.get(key) if result is not None: return result @@ -332,13 +334,13 @@ class TensorNameMap: return (result[0], result[1] + suffix) return None - def get_name(self, key: str, try_suffixes: Sequence[str]) -> Optional[str]: + def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None: result = self.get_type_and_name(key, try_suffixes = try_suffixes) if result is None: return None return result[1] - def get_type(self, key: str, try_suffixes: Sequence[str]) -> Optional[MODEL_TENSOR]: + def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None: result = self.get_type_and_name(key, try_suffixes = try_suffixes) if result is None: return None @@ -432,10 +434,10 @@ class GGUFWriter: ti_data = b"" ti_data_count = 0 use_temp_file: bool - temp_file: Optional[tempfile.SpooledTemporaryFile[bytes]] = None - tensors: List[Tuple[np.ndarray[Any, Any], int]] + temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None + tensors: list[tuple[np.ndarray[Any, Any], int]] - def __init__(self, path: Union[os.PathLike[str], str], arch: str, use_temp_file = True): + def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True): self.fout = open(path, "wb") self.arch = arch self.add_architecture() @@ -531,7 +533,7 @@ class GGUFWriter: GGUFValueType.FLOAT64: " int: return ((x + n - 1) // n) * n - def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: Union[np.dtype[np.float16], np.dtype[np.float32]], tensor_nbytes: int, raw_dtype: Optional[GGMLQuantizationType] = None): + def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype[np.float16] | np.dtype[np.float32], tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None): assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now" encoded_name = name.encode("utf8") @@ -580,7 +582,7 @@ class GGUFWriter: self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) self.ti_data_count += 1 - def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Optional[Sequence[int]] = None, raw_dtype: Optional[GGMLQuantizationType] = None): + def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None): if self.use_temp_file and self.temp_file is None: fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024) fp.seek(0) @@ -600,7 +602,7 @@ class GGUFWriter: if pad != 0: self.temp_file.write(bytes([0] * pad)) - def write_padding(self, fp: BinaryIO, n: int, align: Optional[int] = None): + def write_padding(self, fp: BinaryIO, n: int, align: int | None = None): pad = GGUFWriter.ggml_pad(n, align if align is not None else self.data_alignment) - n if pad != 0: fp.write(bytes([0] * pad)) @@ -726,13 +728,13 @@ class GGUFWriter: def add_tokenizer_model(self, model: str): self.add_string(KEY_TOKENIZER_MODEL, model) - def add_token_list(self, tokens: Union[Sequence[str], Sequence[bytes], Sequence[bytearray]]): + def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]): self.add_array(KEY_TOKENIZER_LIST, tokens) - def add_token_merges(self, merges: Union[Sequence[str], Sequence[bytes], Sequence[bytearray]]): + def 
add_token_merges(self, merges: Sequence[str] | Sequence[bytes] | Sequence[bytearray]): self.add_array(KEY_TOKENIZER_MERGES, merges) - def add_token_types(self, types: Union[Sequence[TokenType], Sequence[int]]): + def add_token_types(self, types: Sequence[TokenType] | Sequence[int]): self.add_array(KEY_TOKENIZER_TOKEN_TYPE, types) def add_token_scores(self, scores: Sequence[float]): @@ -756,11 +758,11 @@ class GGUFWriter: class SpecialVocab: load_merges: bool = False - merges: List[str] = [] - special_token_types: Tuple[str, ...] = tuple(('bos', 'eos', 'unk', 'sep', 'pad')) - special_token_ids: Dict[str, int] = {} + merges: list[str] = [] + special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad') + special_token_ids: dict[str, int] = {} - def __init__(self, path: Path, load_merges: bool = False, special_token_types: Optional[Tuple[str, ...]] = None): + def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None): self.special_token_ids = {} self.load_merges = load_merges if special_token_types is not None: @@ -821,7 +823,7 @@ class SpecialVocab: print(f'gguf: Adding {len(self.merges)} merge(s).') gw.add_token_merges(self.merges) for typ, tokid in self.special_token_ids.items(): - handler: Optional[Callable[[int], None]] = getattr(gw, f'add_{typ}_token_id', None) + handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) if handler is None: print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping') continue diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index c66b069f9..26f792b14 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.2.1" +version = "0.3.1" description = "Write ML models in GGUF for GGML" authors = ["GGML "] packages = [ diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..55c168f2d --- /dev/null +++ b/mypy.ini @@ -0,0 +1,5 @@ +[mypy] +strict = true +allow_untyped_calls = true +allow_untyped_defs = true +allow_incomplete_defs = true From e8422de39e4aa2f7e50574124b060a80607e654a Mon Sep 17 00:00:00 2001 From: DannyDaemonic Date: Thu, 31 Aug 2023 04:21:45 -0700 Subject: [PATCH 10/27] @vxiiduu's fix for PrefetchVirtualMemory (#2930) Reimplement fix for `PrefetchVirtualMemory`. 
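As an aside, the run-time lookup this patch performs in C++ (probe kernel32 for the export instead of guarding the call with a compile-time Windows version check) can be sketched in Python with ctypes. Purely illustrative: it assumes a Windows host, the structure layout and signature follow the documented Win32 API used in the diff below, and the helper name `prefetch_mapping` is invented; it is not part of the patch.

    import ctypes

    class WIN32_MEMORY_RANGE_ENTRY(ctypes.Structure):
        _fields_ = [("VirtualAddress", ctypes.c_void_p),
                    ("NumberOfBytes",  ctypes.c_size_t)]

    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
    kernel32.GetCurrentProcess.restype = ctypes.c_void_p

    # On Windows 7 and earlier kernel32 has no such export, so this stays None.
    prefetch = getattr(kernel32, "PrefetchVirtualMemory", None)
    if prefetch is not None:
        prefetch.argtypes = [ctypes.c_void_p, ctypes.c_size_t,
                             ctypes.POINTER(WIN32_MEMORY_RANGE_ENTRY), ctypes.c_ulong]
        prefetch.restype = ctypes.c_int

    def prefetch_mapping(address: int, nbytes: int) -> bool:
        """Ask the kernel to pre-load a mapped range; a no-op where unsupported."""
        if prefetch is None:
            return False   # pre-Windows 8: silently skip, as the patch does
        entry = WIN32_MEMORY_RANGE_ENTRY(address, nbytes)
        return bool(prefetch(kernel32.GetCurrentProcess(), 1, ctypes.byref(entry), 0))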
Co-authored-by: vxiiduu <73044267+vxiiduu@users.noreply.github.com> --- llama.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/llama.cpp b/llama.cpp index 95ee6ffe4..98a5da963 100644 --- a/llama.cpp +++ b/llama.cpp @@ -611,20 +611,25 @@ struct llama_mmap { throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str())); } - #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 if (prefetch) { - // Advise the kernel to preload the mapped memory - WIN32_MEMORY_RANGE_ENTRY range; - range.VirtualAddress = addr; - range.NumberOfBytes = (SIZE_T)size; - if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { - fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", - llama_format_win_err(GetLastError()).c_str()); + // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it + BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG); + HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll"); + + // may fail on pre-Windows 8 systems + pPrefetchVirtualMemory = reinterpret_cast (GetProcAddress(hKernel32, "PrefetchVirtualMemory")); + + if (pPrefetchVirtualMemory) { + // advise the kernel to preload the mapped memory + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_format_win_err(GetLastError()).c_str()); + } } } - #else - #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") - #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8 } ~llama_mmap() { From aeefac4ff760acea5afe66fbfe8d7eca1937b79c Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Thu, 31 Aug 2023 16:49:24 -0600 Subject: [PATCH 11/27] scripts: Use local gguf package when running from repo (#2927) * scripts: Use local gguf when running from repo --- convert-falcon-hf-to-gguf.py | 5 ++++- convert-gptneox-hf-to-gguf.py | 5 ++++- convert-llama-ggmlv3-to-gguf.py | 6 +++++- convert.py | 6 +++++- .../convert-train-checkpoint-to-gguf.py | 5 ++++- 5 files changed, 22 insertions(+), 5 deletions(-) diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index ec786ff67..271e58972 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -11,11 +11,14 @@ import sys from pathlib import Path from typing import Any -import gguf import numpy as np import torch from transformers import AutoTokenizer # type: ignore[import] +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + def bytes_to_unicode(): # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py index 852123d99..b9c8b4607 100755 --- a/convert-gptneox-hf-to-gguf.py +++ b/convert-gptneox-hf-to-gguf.py @@ -11,11 +11,14 @@ import sys from pathlib import Path from typing import Any -import gguf import numpy as np import torch from transformers import AutoTokenizer # type: ignore[import] +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py index 3f39bc39e..08ba0c490 100755 --- 
a/convert-llama-ggmlv3-to-gguf.py +++ b/convert-llama-ggmlv3-to-gguf.py @@ -7,9 +7,13 @@ import struct import sys from pathlib import Path -import gguf import numpy as np +import os +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + # Note: Does not support GGML_QKK_64 QK_K = 256 # Items here are (block size, type size) diff --git a/convert.py b/convert.py index 9a39edb99..5cc3f6e66 100755 --- a/convert.py +++ b/convert.py @@ -25,10 +25,14 @@ from dataclasses import dataclass from pathlib import Path from typing import IO, TYPE_CHECKING, Any, Callable, Generator, Iterable, Literal, Sequence, TypeVar -import gguf import numpy as np from sentencepiece import SentencePieceProcessor # type: ignore[import] +import os +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + if TYPE_CHECKING: from typing import TypeAlias diff --git a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py index 01b3ee92a..a527d6153 100644 --- a/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py +++ b/examples/train-text-from-scratch/convert-train-checkpoint-to-gguf.py @@ -2,13 +2,16 @@ # train-text-from-scratch checkpoint --> gguf conversion import argparse -import gguf import os import struct import sys import numpy as np from pathlib import Path +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / '..' / '..' / 'gguf-py' / 'gguf')) +import gguf + # gguf constants LLM_KV_OPTIMIZER_TYPE = "optimizer.type" LLM_KV_OPTIMIZER_TYPE_ADAM = "adam" From 528134dd0267838d9c0250cf1d9621631dff09b2 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 1 Sep 2023 01:32:09 +0200 Subject: [PATCH 12/27] remove convert-llama-7b-pth-to-gguf.py and convert-llama-hf-to-gguf.py (#2906) --- convert-llama-7b-pth-to-gguf.py | 261 ----------------------------- convert-llama-hf-to-gguf.py | 280 -------------------------------- 2 files changed, 541 deletions(-) delete mode 100755 convert-llama-7b-pth-to-gguf.py delete mode 100755 convert-llama-hf-to-gguf.py diff --git a/convert-llama-7b-pth-to-gguf.py b/convert-llama-7b-pth-to-gguf.py deleted file mode 100755 index 6574c11dd..000000000 --- a/convert-llama-7b-pth-to-gguf.py +++ /dev/null @@ -1,261 +0,0 @@ -#!/usr/bin/env python3 -# 7b pth llama --> gguf conversion -# Only models with a single datafile are supported, like 7B -# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any - -import gguf -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor # type: ignore[import] - -if TYPE_CHECKING: - from typing import TypeAlias - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - - -def count_model_parts(dir_model: Path) -> int: - num_parts = 0 - for filename in os.listdir(dir_model): - if filename.startswith("consolidated."): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a PyTorch 7B LLaMA model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - 
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "LlamaForCausalLM": - print("Model architecture not supported: " + hparams["architectures"][0]) - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -if num_parts > 1: - print("gguf: Only models with a single datafile are supported.") - - sys.exit() - -ARCH=gguf.MODEL_ARCH.LLAMA -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] -head_count = hparams["num_attention_heads"] - -if "num_key_value_heads" in hparams: - head_count_kv = hparams["num_key_value_heads"] -else: - head_count_kv = head_count - -if "_name_or_path" in hparams: - hf_repo = hparams["_name_or_path"] -else: - hf_repo = "" - -if "max_sequence_length" in hparams: - ctx_length = hparams["max_sequence_length"] -elif "max_position_embeddings" in hparams: - ctx_length = hparams["max_position_embeddings"] -else: - print("gguf: can not find ctx length parameter.") - - sys.exit() - - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_source_hf_repo(hf_repo) -gguf_writer.add_tensor_data_layout("Meta AI original pth") -gguf_writer.add_context_length(ctx_length) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) -gguf_writer.add_head_count(head_count) -gguf_writer.add_head_count_kv(head_count_kv) -gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - -if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: - if "type" in hparams["rope_scaling"]: - if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) - - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytes] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -tokenizer_model_file = dir_model / 'tokenizer.model' -if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) - sys.exit(1) - -# vocab type sentencepiece -print("gguf: get sentencepiece tokenizer vocab and scores") - -tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) - -for i in range(tokenizer.vocab_size()): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - 
- toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - -added_tokens_file = dir_model / 'added_tokens.json' -if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - -gguf_writer.add_tokenizer_model("llama") -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -part_names = (f"consolidated.{n:02}.pth" for n in range(0, num_parts)) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - # we don't need these - if name == "rope.freqs": - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") diff --git a/convert-llama-hf-to-gguf.py b/convert-llama-hf-to-gguf.py deleted file mode 100755 index c453c83c3..000000000 --- a/convert-llama-hf-to-gguf.py +++ /dev/null @@ -1,280 +0,0 @@ -#!/usr/bin/env python3 -# HF llama --> gguf conversion - -from __future__ import annotations - -import argparse -import json -import os -import struct -import sys -from pathlib import Path -from typing import TYPE_CHECKING, Any - -import gguf -import numpy as np -import torch -from sentencepiece import SentencePieceProcessor # type: ignore[import] - -if TYPE_CHECKING: - from typing import TypeAlias - -NDArray: TypeAlias = 'np.ndarray[Any, Any]' - -# reverse HF permute back to original pth layout -# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py - - -def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray: - if n_kv_head is not None and n_head != n_kv_head: - n_head //= n_kv_head - - return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) - .swapaxes(1, 2) - .reshape(weights.shape)) - - -def count_model_parts(dir_model: str) -> int: - num_parts = 0 - - for filename in os.listdir(dir_model): - if filename.startswith("pytorch_model-"): - num_parts += 1 - - if num_parts > 0: - print("gguf: found " + str(num_parts) + " model parts") - - return num_parts - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file") - parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") - parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") - parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") - parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1) - return parser.parse_args() - -args = parse_args() - -dir_model = args.model -ftype = args.ftype -if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file = sys.stderr) - sys.exit(1) - -# possible tensor data types -# ftype == 0 -> float32 -# ftype == 1 -> float16 - -# map from ftype to string -ftype_str = ["f32", "f16"] - -if args.outfile is not None: - fname_out = args.outfile -else: - # output in the same directory as the model by default - fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' - -print("gguf: loading model "+dir_model.name) - -with open(dir_model / "config.json", "r", encoding="utf-8") as f: - hparams = json.load(f) - -if hparams["architectures"][0] != "LlamaForCausalLM": - print("Model 
architecture not supported: " + hparams["architectures"][0]) - - sys.exit() - -# get number of model parts -num_parts = count_model_parts(dir_model) - -ARCH=gguf.MODEL_ARCH.LLAMA -gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) - -print("gguf: get model metadata") - -block_count = hparams["num_hidden_layers"] -head_count = hparams["num_attention_heads"] - -if "num_key_value_heads" in hparams: - head_count_kv = hparams["num_key_value_heads"] -else: - head_count_kv = head_count - -if "_name_or_path" in hparams: - hf_repo = hparams["_name_or_path"] -else: - hf_repo = "" - -if "max_sequence_length" in hparams: - ctx_length = hparams["max_sequence_length"] -elif "max_position_embeddings" in hparams: - ctx_length = hparams["max_position_embeddings"] -else: - print("gguf: can not find ctx length parameter.") - - sys.exit() - - -gguf_writer.add_name(dir_model.name) -gguf_writer.add_source_hf_repo(hf_repo) -gguf_writer.add_tensor_data_layout("Meta AI original pth") -gguf_writer.add_context_length(ctx_length) -gguf_writer.add_embedding_length(hparams["hidden_size"]) -gguf_writer.add_block_count(block_count) -gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) -gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) -gguf_writer.add_head_count(head_count) -gguf_writer.add_head_count_kv(head_count_kv) -gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"]) - -if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]: - if "type" in hparams["rope_scaling"]: - if hparams["rope_scaling"]["type"] == "linear": - gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"]) - - -# TOKENIZATION - -print("gguf: get tokenizer metadata") - -tokens: list[bytes] = [] -scores: list[float] = [] -toktypes: list[int] = [] - -tokenizer_model_file = dir_model / 'tokenizer.model' -if not tokenizer_model_file.is_file(): - print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr) - sys.exit(1) - -# vocab type sentencepiece -print("gguf: get sentencepiece tokenizer vocab, scores and token types") - -tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) - -for i in range(tokenizer.vocab_size()): - text: bytes - score: float - - piece = tokenizer.id_to_piece(i) - text = piece.encode("utf-8") - score = tokenizer.get_score(i) - - toktype = 1 # defualt to normal token type - if tokenizer.is_unknown(i): - toktype = 2 - if tokenizer.is_control(i): - toktype = 3 - - # toktype = 4 is user-defined = tokens from added_tokens.json - - if tokenizer.is_unused(i): - toktype = 5 - if tokenizer.is_byte(i): - toktype = 6 - - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - -added_tokens_file = dir_model / 'added_tokens.json' -if added_tokens_file.is_file(): - with open(added_tokens_file, "r", encoding="utf-8") as f: - addtokens_json = json.load(f) - - print("gguf: get added tokens") - - for key in addtokens_json: - tokens.append( key.encode("utf-8") ) - scores.append(-1000.0) - toktypes.append(4) # user-defined token type - - -gguf_writer.add_tokenizer_model("llama") -gguf_writer.add_token_list(tokens) -gguf_writer.add_token_scores(scores) -gguf_writer.add_token_types(toktypes) - -special_vocab = gguf.SpecialVocab(dir_model) -special_vocab.add_to_gguf(gguf_writer) - -# TENSORS - -tensor_map = gguf.get_tensor_name_map(ARCH,block_count) - -# tensor info -print("gguf: get tensor metadata") - -if num_parts == 0: - part_names = iter(("pytorch_model.bin",)) -else: - 
part_names = ( - f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) - ) - -for part_name in part_names: - if args.vocab_only: - break - print("gguf: loading model part '" + part_name + "'") - model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") - - for name in model_part.keys(): - data = model_part[name] - - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data.dtype - - # convert any unsupported data types to float32 - if data.dtype != torch.float16 and data.dtype != torch.float32: - data = data.to(torch.float32) - - data = data.squeeze().numpy() - - # reverse permute these - if name.endswith(".q_proj.weight"): - data = reverse_hf_permute(data, head_count) - if name.endswith(".k_proj.weight"): - data = reverse_hf_permute(data, head_count, head_count_kv) - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) - if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) - - gguf_writer.add_tensor(new_name, data) - - -print("gguf: write header") -gguf_writer.write_header_to_file() -print("gguf: write metadata") -gguf_writer.write_kv_data_to_file() -if not args.vocab_only: - print("gguf: write tensors") - gguf_writer.write_tensors_to_file() - -gguf_writer.close() - -print(f"gguf: model successfully exported to '{fname_out}'") -print("") From bce1fef328941499dc0acb76cc7fd7ac90449c2f Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Thu, 31 Aug 2023 22:13:51 -0400 Subject: [PATCH 13/27] convert : fix another python 3.8 issue (#2949) --- convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert.py b/convert.py index 5cc3f6e66..6c89b5ecc 100755 --- a/convert.py +++ b/convert.py @@ -530,7 +530,7 @@ class LazyTensor: raise ValueError(f'Cannot validate conversion from {self.data_type} to {data_type}.') -LazyModel = dict[str, LazyTensor] +LazyModel: TypeAlias = 'dict[str, LazyTensor]' @dataclass From e8d91589258f9204397a7ac5f9b3c857835c98f8 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Fri, 1 Sep 2023 11:15:57 +0300 Subject: [PATCH 14/27] metal: somewhat faster f16 x f32 matrix multiply kernel (#2951) * Somewhat faster f16 x f32 matrix multiply kernel * Better use 32 thread groups for f16 x f32 --------- Co-authored-by: Iwan Kawrakow --- ggml-metal.m | 2 +- ggml-metal.metal | 38 ++++++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index e929c4b07..8c3c64f53 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -840,7 +840,7 @@ void ggml_metal_graph_compute( switch (src0t) { case GGML_TYPE_F16: { - nth0 = 64; + nth0 = 32; nth1 = 1; [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; } break; diff --git a/ggml-metal.metal 
b/ggml-metal.metal index 82e1a0c7a..02db5323e 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -528,24 +528,42 @@ kernel void kernel_mul_mat_f16_f32( device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02); device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12); - sum[tpitg.x] = 0.0f; + uint ith = tpitg.x; + uint nth = tptg.x; - for (int i = tpitg.x; i < ne00; i += tptg.x) { - sum[tpitg.x] += (float) x[i] * (float) y[i]; + sum[ith] = 0.0f; + + for (int i = ith; i < ne00; i += nth) { + sum[ith] += (float) x[i] * (float) y[i]; } // accumulate the sum from all threads in the threadgroup threadgroup_barrier(mem_flags::mem_threadgroup); - for (uint i = tptg.x/2; i > 0; i /= 2) { - if (tpitg.x < i) { - sum[tpitg.x] += sum[tpitg.x + i]; - } - threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith%4 == 0) { + for (int i = 1; i < 4; ++i) sum[ith] += sum[ith + i]; } - - if (tpitg.x == 0) { + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith%16 == 0) { + for (int i = 4; i < 16; i += 4) sum[ith] += sum[ith + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + if (ith == 0) { + for (int i = 16; i < nth; i += 16) sum[0] += sum[i]; dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0]; } + + // Original implementation. Left behind commented out for now + //threadgroup_barrier(mem_flags::mem_threadgroup); + //for (uint i = tptg.x/2; i > 0; i /= 2) { + // if (tpitg.x < i) { + // sum[tpitg.x] += sum[tpitg.x + i]; + // } + // threadgroup_barrier(mem_flags::mem_threadgroup); + //} + // + //if (tpitg.x == 0) { + // dst[im*ne1*ne0 + r1*ne0 + r0] = sum[0]; + //} } kernel void kernel_alibi_f32( From 18705a30ef3d6a89e1d7c6cb8cfe8633f760cb53 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 1 Sep 2023 05:03:49 -0400 Subject: [PATCH 15/27] llama2c : fix segfault and alloc-dealloc-mismatch (#2913) * llama2c : fix segfault if vocab is not found * llama2c : fix mismatch between new[] and delete * llama2c : fix basename on Windows * llama2c : use a destructor to prevent memory leaks --- .../convert-llama2c-to-ggml.cpp | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index e9e070b1f..0b03c9d2b 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -75,7 +75,7 @@ typedef struct { int seq_len; // max sequence length } Config; -typedef struct { +struct TransformerWeights { // token embedding table float* token_embedding_table; // (vocab_size, dim) // weights for rmsnorms @@ -97,7 +97,22 @@ typedef struct { // float* freq_cis_imag; // (seq_len, dim/2) // (optional) classifier weights for the logits, on the last layer float* wcls; -} TransformerWeights; + + ~TransformerWeights() { + delete[] token_embedding_table; + delete[] rms_att_weight; + delete[] rms_ffn_weight; + delete[] wq; + delete[] wk; + delete[] wv; + delete[] wo; + delete[] w1; + delete[] w2; + delete[] w3; + delete[] rms_final_weight; + delete[] wcls; + } +}; void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { // we calloc instead of malloc to keep valgrind happy @@ -173,21 +188,6 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar return 0; } -void free_weights(TransformerWeights* w) { - delete w->token_embedding_table; - delete w->rms_att_weight; - delete w->rms_ffn_weight; - delete w->wq; - 
delete w->wk; - delete w->wv; - delete w->wo; - delete w->w1; - delete w->w2; - delete w->w3; - delete w->rms_final_weight; - if (w->wcls) delete w->wcls; -} - void print_sample_weights(TransformerWeights *w){ printf("----- Quick print of first of the weight vales of all the variables\n"); printf("%f\n", w->token_embedding_table[0]); @@ -596,6 +596,10 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) // assume llama2.c vocabulary printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); llama_file file(filename, "rb"); + if (!file.fp) { + fprintf(stderr, "error: %s: %s\n", strerror(errno), filename); + exit(1); + } const int n_vocab = config->vocab_size; /* uint32_t max_token_length = */ file.read_u32(); // unused vocab->id_to_token.resize(n_vocab); @@ -898,7 +902,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) { } std::string basename(const std::string &path) { - size_t pos = path.find_last_of("/"); + size_t pos = path.find_last_of("/\\"); if (pos == std::string::npos) { return path; } @@ -911,7 +915,7 @@ int main(int argc, char ** argv) { return 1; } Config config; - TransformerWeights weights; + TransformerWeights weights = {}; { FILE *file = fopen(params.fn_llama2c_model, "rb"); if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; } @@ -953,6 +957,5 @@ int main(int argc, char ** argv) { printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model); ggml_free(model.ctx); - free_weights(&weights); return 0; } From 4dcd47d71df8ca4edcc31302744bd93f0c31298e Mon Sep 17 00:00:00 2001 From: staviq Date: Fri, 1 Sep 2023 11:07:06 +0200 Subject: [PATCH 16/27] logs : fix mingw-like builds (fixes #2898) (#2911) * fix mingw-like builds * formatting * make LOG_COMPAT easier to override and extend * simplify win detection * fix for #2940 --- Makefile | 10 +++++----- common/log.h | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Makefile b/Makefile index b750540fe..b56df3d8a 100644 --- a/Makefile +++ b/Makefile @@ -79,6 +79,11 @@ ifdef LLAMA_SERVER_VERBOSE CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) endif +ifdef LLAMA_DISABLE_LOGS + CFLAGS += -DLOG_DISABLE_LOGS + CXXFLAGS += -DLOG_DISABLE_LOGS +endif # LLAMA_DISABLE_LOGS + # warnings CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ -Wmissing-prototypes -Werror=implicit-int @@ -343,11 +348,6 @@ k_quants.o: k_quants.c k_quants.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_NO_K_QUANTS -ifdef LLAMA_DISABLE_LOGS - CFLAGS += -DLOG_DISABLE_LOGS - CXXFLAGS += -DLOG_DISABLE_LOGS -endif # LLAMA_DISABLE_LOGS - # # Print build information # diff --git a/common/log.h b/common/log.h index c1364187d..bf9fafd68 100644 --- a/common/log.h +++ b/common/log.h @@ -154,7 +154,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base // #include "log.h" // #ifndef LOG_NO_TIMESTAMPS - #ifndef _WIN32 + #ifndef _MSC_VER #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] " #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() #else @@ -167,7 +167,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base #endif #ifdef LOG_TEE_TIMESTAMPS - #ifndef _WIN32 + #ifndef _MSC_VER #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] " #define LOG_TEE_TIMESTAMP_VAL , 
(std::chrono::duration_cast>(std::chrono::system_clock::now().time_since_epoch())).count() #else @@ -187,7 +187,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base // #include "log.h" // #ifndef LOG_NO_FILE_LINE_FUNCTION - #ifndef _WIN32 + #ifndef _MSC_VER #define LOG_FLF_FMT "[%24s:%5d][%24s] " #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #else @@ -200,7 +200,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base #endif #ifdef LOG_TEE_FILE_LINE_FUNCTION - #ifndef _WIN32 + #ifndef _MSC_VER #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] " #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #else @@ -224,7 +224,7 @@ enum LogTriState // INTERNAL, DO NOT USE // USE LOG() INSTEAD // -#ifndef _WIN32 +#ifndef _MSC_VER #define LOG_IMPL(str, ...) \ { \ if (LOG_TARGET != nullptr) \ @@ -247,7 +247,7 @@ enum LogTriState // INTERNAL, DO NOT USE // USE LOG_TEE() INSTEAD // -#ifndef _WIN32 +#ifndef _MSC_VER #define LOG_TEE_IMPL(str, ...) \ { \ if (LOG_TARGET != nullptr) \ @@ -284,7 +284,7 @@ enum LogTriState // Main LOG macro. // behaves like printf, and supports arguments the exact same way. // -#ifndef _WIN32 +#ifndef _MSC_VER #define LOG(...) LOG_IMPL(__VA_ARGS__, "") #else #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "") @@ -298,14 +298,14 @@ enum LogTriState // Secondary target can be changed just like LOG_TARGET // by defining LOG_TEE_TARGET // -#ifndef _WIN32 +#ifndef _MSC_VER #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") #else #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "") #endif // LOG macro variants with auto endline. -#ifndef _WIN32 +#ifndef _MSC_VER #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") #else @@ -461,7 +461,7 @@ inline void log_test() LOG("13 Hello World this time in yet new file?\n") log_set_target(log_filename_generator("llama_autonamed", "log")); LOG("14 Hello World in log with generated filename!\n") -#ifdef _WIN32 +#ifdef _MSC_VER LOG_TEE("15 Hello msvc TEE without arguments\n") LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test") LOG_TEELN("17 Hello msvc TEELN without arguments\n") From 13268c533177a4dc76bce0b465645d74f0d51d55 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Sep 2023 13:42:41 +0300 Subject: [PATCH 17/27] metal : slight speed-up for add and mul kernels (#2917) --- ggml-metal.m | 20 ++++++++++++++++---- ggml-metal.metal | 32 ++++++++++++++++---------------- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index 8c3c64f53..4267db9be 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -680,6 +680,12 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ADD: { + GGML_ASSERT(ggml_is_contiguous(src0)); + + // utilize float4 + GGML_ASSERT(ne00 % 4 == 0); + const int64_t nb = ne00/4; + if (ggml_nelements(src1) == ne10) { // src1 is a row [encoder setComputePipelineState:ctx->pipeline_add_row]; @@ -689,14 +695,20 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_MUL: { + 
GGML_ASSERT(ggml_is_contiguous(src0)); + + // utilize float4 + GGML_ASSERT(ne00 % 4 == 0); + const int64_t nb = ne00/4; + if (ggml_nelements(src1) == ne10) { // src1 is a row [encoder setComputePipelineState:ctx->pipeline_mul_row]; @@ -706,9 +718,9 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; + [encoder setBytes:&nb length:sizeof(nb) atIndex:3]; - const int64_t n = ggml_nelements(dst); + const int64_t n = ggml_nelements(dst)/4; [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 02db5323e..8cdf0b9d2 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -25,9 +25,9 @@ typedef struct { } block_q8_0; kernel void kernel_add( - device const float * src0, - device const float * src1, - device float * dst, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] + src1[tpig]; } @@ -35,18 +35,18 @@ kernel void kernel_add( // assumption: src1 is a row // broadcast src1 into src0 kernel void kernel_add_row( - device const float * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant int64_t & nb, uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] + src1[tpig % ne00]; + dst[tpig] = src0[tpig] + src1[tpig % nb]; } kernel void kernel_mul( - device const float * src0, - device const float * src1, - device float * dst, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * src1[tpig]; } @@ -54,12 +54,12 @@ kernel void kernel_mul( // assumption: src1 is a row // broadcast src1 into src0 kernel void kernel_mul_row( - device const float * src0, - device const float * src1, - device float * dst, - constant int64_t & ne00, + device const float4 * src0, + device const float4 * src1, + device float4 * dst, + constant int64_t & nb, uint tpig[[thread_position_in_grid]]) { - dst[tpig] = src0[tpig] * src1[tpig % ne00]; + dst[tpig] = src0[tpig] * src1[tpig % nb]; } kernel void kernel_scale( From 5aec2cfaac386eb09aebb75b805860828f00de91 Mon Sep 17 00:00:00 2001 From: Tameem <113388789+AhmadTameem@users.noreply.github.com> Date: Fri, 1 Sep 2023 18:27:40 +0500 Subject: [PATCH 18/27] ggml : add RISC-V vector intrinsics support (#2929) * added support for RISCV CFLAGS & native compile + cross compile options * Add RISC-V Vector Intrinsics Support Added RVV intrinsics for following ggml_vec_dot_q4_0_q8_0 ggml_vec_dot_q4_1_q8_1 ggml_vec_dot_q5_0_q8_0 ggml_vec_dot_q5_1_q8_1 ggml_vec_dot_q8_0_q8_0 Co-authored-by: Sharafat Signed-off-by: Ahmad Tameem --------- Signed-off-by: Ahmad Tameem Co-authored-by: moiz.hussain Co-authored-by: Sharafat --- Makefile | 13 ++++ ggml.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) diff --git a/Makefile b/Makefile index b56df3d8a..8f73297f4 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,11 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif +ifdef RISCV_CROSS_COMPILE +CC := riscv64-unknown-linux-gnu-gcc +CXX := riscv64-unknown-linux-gnu-g++ +endif + CCV := $(shell $(CC) --version | head -n 1) CXXV := $(shell 
$(CXX) --version | head -n 1) @@ -150,6 +155,9 @@ endif # Architecture specific # TODO: probably these flags need to be tweaked on some architectures # feel free to update the Makefile for your architecture and send a pull request or issue + +ifndef RISCV + ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) # Use all CPU extensions that are available: CFLAGS += -march=native -mtune=native @@ -198,6 +206,11 @@ ifneq ($(filter ppc64%,$(UNAME_M)),) endif endif +else + CFLAGS += -march=rv64gcv -mabi=lp64d + CXXFLAGS += -march=rv64gcv -mabi=lp64d +endif + ifndef LLAMA_NO_K_QUANTS CFLAGS += -DGGML_USE_K_QUANTS CXXFLAGS += -DGGML_USE_K_QUANTS diff --git a/ggml.c b/ggml.c index 46ce4a581..cf3955f7f 100644 --- a/ggml.c +++ b/ggml.c @@ -301,6 +301,10 @@ typedef double ggml_float; #endif #endif +#ifdef __riscv_v_intrinsic +#include +#endif + #ifdef __F16C__ #ifdef _MSC_VER @@ -2677,6 +2681,41 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 8, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 8, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += sumi*GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d); + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -2803,6 +2842,38 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * } *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_a = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_l = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3037,6 +3108,76 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * } *s = 
hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for masking and shift operations + uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + uint32_t temp_2[16] = {0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_2, vl); + vuint32m4_t vt_2 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_3 = __riscv_vsll_vx_u32m4(vt_1, 16, vl); + vuint32m4_t vt_4 = __riscv_vadd_vx_u32m4(vt_2, 12, vl); + + // ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(vt_1, qh, vl); + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(xha_0, vt_2, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + + // ((qh & (1u << (j + 16))) >> (j + 12)); + vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(vt_3, qh, vl); + vuint32m4_t xhl_1 = __riscv_vsrl_vv_u32m4(xha_1, vt_4, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xhl_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xhl_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t x_ai = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t x_li = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint8m1_t v0 = __riscv_vsub_vx_i8m1(x_ai, 16, vl); + vint8m1_t v1 = __riscv_vsub_vx_i8m1(x_li, 16, vl); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)) * sumi; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3293,6 +3434,72 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * } *s = hsum_float_8(acc) + summs; +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + + uint32_t qh; + + // These temp values are for shift operations + uint32_t temp_1[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + size_t vl = __riscv_vsetvl_e8m1(qk/2); + + for (int i = 0; i < nb; i++) { + memcpy(&qh, x[i].qh, sizeof(uint32_t)); + + // temporary registers + vuint32m4_t vt_1 = __riscv_vle32_v_u32m4(temp_1, vl); + vuint32m4_t vt_2 = __riscv_vadd_vx_u32m4(vt_1, 12, vl); + + // load qh + vuint32m4_t vqh = __riscv_vmv_v_x_u32m4(qh, vl); + + // ((qh >> (j + 0)) << 4) & 0x10; + vuint32m4_t xhr_0 = __riscv_vsrl_vv_u32m4(vqh, vt_1, vl); + vuint32m4_t xhl_0 = __riscv_vsll_vx_u32m4(xhr_0, 4, vl); + vuint32m4_t xha_0 = __riscv_vand_vx_u32m4(xhl_0, 0x10, vl); + + // ((qh >> (j + 12)) ) & 0x10; + vuint32m4_t xhr_1 = __riscv_vsrl_vv_u32m4(vqh, vt_2, vl); + 
vuint32m4_t xha_1 = __riscv_vand_vx_u32m4(xhr_1, 0x10, vl); + + // narrowing + vuint16m2_t xhc_0 = __riscv_vncvt_x_x_w_u16m2(xha_0, vl); + vuint8m1_t xh_0 = __riscv_vncvt_x_x_w_u8m1(xhc_0, vl); + + vuint16m2_t xhc_1 = __riscv_vncvt_x_x_w_u16m2(xha_1, vl); + vuint8m1_t xh_1 = __riscv_vncvt_x_x_w_u8m1(xhc_1, vl); + + // load + vuint8m1_t tx = __riscv_vle8_v_u8m1(x[i].qs, vl); + + vint8m1_t y0 = __riscv_vle8_v_i8m1(y[i].qs, vl); + vint8m1_t y1 = __riscv_vle8_v_i8m1(y[i].qs+16, vl); + + vuint8m1_t x_at = __riscv_vand_vx_u8m1(tx, 0x0F, vl); + vuint8m1_t x_lt = __riscv_vsrl_vx_u8m1(tx, 0x04, vl); + + vuint8m1_t x_a = __riscv_vor_vv_u8m1(x_at, xh_0, vl); + vuint8m1_t x_l = __riscv_vor_vv_u8m1(x_lt, xh_1, vl); + + vint8m1_t v0 = __riscv_vreinterpret_v_u8m1_i8m1(x_a); + vint8m1_t v1 = __riscv_vreinterpret_v_u8m1_i8m1(x_l); + + vint16m2_t vec_mul1 = __riscv_vwmul_vv_i16m2(v0, y0, vl); + vint16m2_t vec_mul2 = __riscv_vwmul_vv_i16m2(v1, y1, vl); + + vint32m1_t vec_zero = __riscv_vmv_v_x_i32m1(0, vl); + + vint32m1_t vs1 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul1, vec_zero, vl); + vint32m1_t vs2 = __riscv_vwredsum_vs_i16m2_i32m1(vec_mul2, vec_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(vs1); + sumi += __riscv_vmv_x_s_i32m1_i32(vs2); + + sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s; + } + + *s = sumf; #else // scalar float sumf = 0.0; @@ -3404,6 +3611,26 @@ static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * } *s = hsum_float_8(acc); +#elif defined(__riscv_v_intrinsic) + float sumf = 0.0; + size_t vl = __riscv_vsetvl_e8m1(qk); + + for (int i = 0; i < nb; i++) { + // load elements + vint8m1_t bx = __riscv_vle8_v_i8m1(x[i].qs, vl); + vint8m1_t by = __riscv_vle8_v_i8m1(y[i].qs, vl); + + vint16m2_t vw_mul = __riscv_vwmul_vv_i16m2(bx, by, vl); + + vint32m1_t v_zero = __riscv_vmv_v_x_i32m1(0, vl); + vint32m1_t v_sum = __riscv_vwredsum_vs_i16m2_i32m1(vw_mul, v_zero, vl); + + int sumi = __riscv_vmv_x_s_i32m1_i32(v_sum); + + sumf += sumi*(GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d)); + } + + *s = sumf; #else // scalar float sumf = 0.0; From d8d6977f48f1fa402ade38ad32c5b5fb1358d059 Mon Sep 17 00:00:00 2001 From: Ben Siraphob Date: Fri, 1 Sep 2023 09:32:14 -0400 Subject: [PATCH 19/27] examples : add C grammar (#2357) --- grammars/c.gbnf | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 grammars/c.gbnf diff --git a/grammars/c.gbnf b/grammars/c.gbnf new file mode 100644 index 000000000..4a0331dd2 --- /dev/null +++ b/grammars/c.gbnf @@ -0,0 +1,42 @@ +root ::= (declaration)* + +declaration ::= dataType identifier "(" parameter? ")" "{" statement* "}" + +dataType ::= "int" ws | "float" ws | "char" ws +identifier ::= [a-zA-Z_] [a-zA-Z_0-9]* + +parameter ::= dataType identifier + +statement ::= + ( dataType identifier ws "=" ws expression ";" ) | + ( identifier ws "=" ws expression ";" ) | + ( identifier ws "(" argList? ")" ";" ) | + ( "return" ws expression ";" ) | + ( "while" "(" condition ")" "{" statement* "}" ) | + ( "for" "(" forInit ";" ws condition ";" ws forUpdate ")" "{" statement* "}" ) | + ( "if" "(" condition ")" "{" statement* "}" ("else" "{" statement* "}")? 
) | + ( singleLineComment ) | + ( multiLineComment ) + +forInit ::= dataType identifier ws "=" ws expression | identifier ws "=" ws expression +forUpdate ::= identifier ws "=" ws expression + +condition ::= expression relationOperator expression +relationOperator ::= ("<=" | "<" | "==" | "!=" | ">=" | ">") + +expression ::= term (("+" | "-") term)* +term ::= factor(("*" | "/") factor)* + +factor ::= identifier | number | unaryTerm | funcCall | parenExpression +unaryTerm ::= "-" factor +funcCall ::= identifier "(" argList? ")" +parenExpression ::= "(" ws expression ws ")" + +argList ::= expression ("," ws expression)* + +number ::= [0-9]+ + +singleLineComment ::= "//" [^\n]* "\n" +multiLineComment ::= "/*" ( [^*] | ("*" [^/]) )* "*/" + +ws ::= ([ \t\n]+) From ef156499721c67748cde01a5436cb6f0648bb4b4 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 1 Sep 2023 09:34:50 -0400 Subject: [PATCH 20/27] build : fix most gcc and clang warnings (#2861) * fix most gcc and clang warnings * baby-llama : remove commented opt_params_adam * fix some MinGW warnings * fix more MinGW warnings --- CMakeLists.txt | 5 +++++ Makefile | 7 ++++++- common/common.cpp | 6 ++++-- common/console.cpp | 1 + examples/baby-llama/baby-llama.cpp | 5 ----- examples/beam-search/beam-search.cpp | 8 +++++--- examples/server/server.cpp | 8 +++++--- k_quants.c | 8 ++------ llama.cpp | 4 ++-- 9 files changed, 30 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d6c1b3b33..1b7cce9f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -403,6 +403,7 @@ if (LLAMA_ALL_WARNINGS) -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int + -Wno-unused-function ) set(cxx_flags -Wall @@ -412,6 +413,10 @@ if (LLAMA_ALL_WARNINGS) -Wno-unused-function -Wno-multichar ) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # g++ only + set(cxx_flags ${cxx_flags} -Wno-format-truncation) + endif() else() # todo : msvc endif() diff --git a/Makefile b/Makefile index 8f73297f4..ef1eef6ac 100644 --- a/Makefile +++ b/Makefile @@ -91,9 +91,14 @@ endif # LLAMA_DISABLE_LOGS # warnings CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \ - -Wmissing-prototypes -Werror=implicit-int + -Wmissing-prototypes -Werror=implicit-int -Wno-unused-function CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar +ifeq '' '$(findstring clang++,$(CXX))' + # g++ only + CXXFLAGS += -Wno-format-truncation +endif + # OS specific # TODO: support Windows ifeq ($(UNAME_S),Linux) diff --git a/common/common.cpp b/common/common.cpp index ed09fc27d..41fc59ced 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -24,7 +24,9 @@ #if defined(_WIN32) #define WIN32_LEAN_AND_MEAN -#define NOMINMAX +#ifndef NOMINMAX +# define NOMINMAX +#endif #include #include #include @@ -1027,7 +1029,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? 
"true" : "false"); - fprintf(stream, "hellaswag_tasks: %ld # default: 400\n", params.hellaswag_tasks); + fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx)); const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY; diff --git a/common/console.cpp b/common/console.cpp index 8efa2a674..23545e5be 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -235,6 +235,7 @@ namespace console { int estimateWidth(char32_t codepoint) { #if defined(_WIN32) + (void)codepoint; return 1; #else return wcwidth(codepoint); diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 6fa55b319..a99ece9a6 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1617,15 +1617,10 @@ int main(int argc, char ** argv) { float error_before_opt = ggml_get_f32_1d(e, 0); - struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM); struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); - opt_params_adam.print_forward_graph = false; - opt_params_adam.print_backward_graph = false; opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; - opt_params_adam.adam.n_iter = 16; opt_params_lbfgs.lbfgs.n_iter = 16; - // ggml_opt(ctx0, opt_params_adam, e); ggml_opt(ctx0, opt_params_lbfgs, e); // ggml_build_forward_expand(&gf, e); diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 42c7c7254..4d021434b 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -22,7 +22,9 @@ #include #elif defined (_WIN32) #define WIN32_LEAN_AND_MEAN -#define NOMINMAX +#ifndef NOMINMAX +# define NOMINMAX +#endif #include #include #endif @@ -73,7 +75,7 @@ void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_stat assert(0u < beams_state.n_beams); const llama_token * tokens = beams_state.beam_views[0].tokens; std::copy(tokens, tokens + n, callback_data.response.end() - n); - printf("%lu", n); + printf("%zu", n); } fflush(stdout); #if 1 // DEBUG: print current beams for this iteration @@ -145,7 +147,7 @@ int main(int argc, char ** argv) if (tokens_list.size() > max_tokens_list_size) { - fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" , + fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" , __func__ , tokens_list.size() , max_tokens_list_size ); return 1; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b485a5ead..09eac2ec2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -17,6 +17,8 @@ #include "completion.js.hpp" #include "json-schema-to-grammar.mjs.hpp" +#include + #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 #endif @@ -1038,7 +1040,7 @@ static json format_timings(llama_server_context &llama) { const auto timings = llama_get_timings(llama.ctx); - assert(timings.n_eval == llama.num_tokens_predicted); + assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted)); return json{ {"prompt_n", timings.n_p_eval}, @@ -1239,7 +1241,7 @@ void beam_search_callback(void * callback_data, llama_beams_state beams_state) { const llama_token * tokens = beams_state.beam_views[0].tokens; const auto map = [](llama_token tok) { return completion_token_output{{},tok}; }; std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map); - printf("%lu", 
n); + printf("%zu", n); } fflush(stdout); #if 0 // DEBUG: print current beams for this iteration @@ -1548,7 +1550,7 @@ int main(int argc, char **argv) svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) { - const auto * fmt = "500 Internal Server Error\n%s"; + const char fmt[] = "500 Internal Server Error\n%s"; char buf[BUFSIZ]; try { std::rethrow_exception(std::move(ep)); diff --git a/k_quants.c b/k_quants.c index 3a9b1dafd..3deeaedf7 100644 --- a/k_quants.c +++ b/k_quants.c @@ -183,13 +183,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t int ntry, float alpha) { float min = x[0]; float max = x[0]; - float sum_x = 0; - float sum_x2 = 0; for (int i = 1; i < n; ++i) { if (x[i] < min) min = x[i]; if (x[i] > max) max = x[i]; - sum_x += x[i]; - sum_x2 += x[i]*x[i]; } if (max == min) { for (int i = 0; i < n; ++i) L[i] = 0; @@ -2060,7 +2056,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); - uint32_t *aux; + const uint32_t *aux; for (int i = 0; i < nb; ++i) { @@ -2070,7 +2066,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const int8_t * restrict q8 = y[i].qs; // Set up scales - aux = (uint32_t *)x[i].scales; + aux = (const uint32_t *)x[i].scales; __m128i scales128 = _mm_set_epi32( ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4), ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4), diff --git a/llama.cpp b/llama.cpp index 98a5da963..5ca119238 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3600,7 +3600,7 @@ static void llama_grammar_advance_stack( std::vector> & new_stacks) { if (stack.empty()) { - new_stacks.push_back(stack); + new_stacks.emplace_back(stack); return; } @@ -3637,7 +3637,7 @@ static void llama_grammar_advance_stack( } case LLAMA_GRETYPE_CHAR: case LLAMA_GRETYPE_CHAR_NOT: - new_stacks.push_back(stack); + new_stacks.emplace_back(stack); break; default: // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range From 49bb9cbe0f598bc43be539b0df8eafb2130cfad3 Mon Sep 17 00:00:00 2001 From: Konstantin Herud Date: Fri, 1 Sep 2023 15:36:14 +0200 Subject: [PATCH 21/27] docs : add java-llama.cpp to README.md (#2935) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index d727b0554..4e6a0957d 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,7 @@ as the main playground for developing new features for the [ggml](https://github - Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s) - Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj) - React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn) +- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) **UI:** From ee8654bcd0146708988a703e54406d5b553712ea Mon Sep 17 00:00:00 2001 From: m3ndax Date: Fri, 1 Sep 2023 15:47:27 +0200 Subject: [PATCH 22/27] minor : add const qualifiers (#2853) * made the methods const # Conflicts: # examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp * made method const * Update convert-llama2c-to-ggml.cpp removed write_raw and write_u32 * llama2c : remove misleading const --------- Co-authored-by: Georgi Gerganov --- examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp | 2 +- llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 
0b03c9d2b..0ee7adc52 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) } } -void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){ +void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { int ct; switch (gg_weights->n_dims){ case 1: diff --git a/llama.cpp b/llama.cpp index 5ca119238..23b251caf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4393,7 +4393,7 @@ struct llama_logit_info { } return min_heap; } - float probability_from_logit(float logit) { + float probability_from_logit(float logit) const { return normalizer * std::exp(logit - max_l); } }; From 6c9c23429bf4e4fcaaddbebadc4638558430a7f2 Mon Sep 17 00:00:00 2001 From: Cebtenzzre Date: Fri, 1 Sep 2023 09:53:14 -0400 Subject: [PATCH 23/27] make : use unaligned vector moves on MinGW (#2945) Fixes #2922 --- Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile b/Makefile index ef1eef6ac..23f050c0d 100644 --- a/Makefile +++ b/Makefile @@ -177,6 +177,14 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64)) #CXXFLAGS += -mssse3 endif +# The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves. +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412 +# https://github.com/ggerganov/llama.cpp/issues/2922 +ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))' + CFLAGS += -Xassembler -muse-unaligned-vector-move + CXXFLAGS += -Xassembler -muse-unaligned-vector-move +endif + ifneq ($(filter aarch64%,$(UNAME_M)),) # Apple M1, M2, etc. # Raspberry Pi 3, 4, Zero 2 (64-bit) From 0d5893668625456c94bbadfddc53fc69cd51c223 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Sep 2023 17:00:40 +0300 Subject: [PATCH 24/27] llama2c : rename function --- .../convert-llama2c-to-ggml.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 0ee7adc52..9e856c21a 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -637,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) } } -void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { +void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { int ct; switch (gg_weights->n_dims){ case 1: @@ -674,13 +674,13 @@ void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, const float } void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) { - // stuff AK weights into GG weights one by one. + // convert AK weights into GG weights one by one. // w->token_embedding_table -> model->tok_embeddings // float* -> struct ggml_tensor - stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); - stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table); + convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table); + convert_weights_ak_to_gg(model->output, w->wcls ? 
w->wcls : w->token_embedding_table); - stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); + convert_weights_ak_to_gg(model->norm, w->rms_final_weight); //print_row(model->norm, 0); // for rms-att-weight @@ -690,18 +690,18 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ auto & layer = model->layers[i]; // 1d - stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); - stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); + convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); + convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); // from 3d matrix layer x dim x dim to 2d matrix dim x dim - stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]); + convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]); - stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]); - stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); - stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); + convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]); + convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]); + convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]); } struct gguf_context * ctx = gguf_init_empty(); From 5d6f19f16b2173afe2d5c6aee2f5c9fc31038eba Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Fri, 1 Sep 2023 08:02:48 -0600 Subject: [PATCH 25/27] Allow quantize to only copy tensors, some other improvements (#2931) * Allow quantize tool to only copy tensors to allow repackaging models. * Slightly better logic when requantizing. * Change help message to go to `stdout`. --- examples/quantize/quantize.cpp | 24 +++++++++++++++++++----- llama.cpp | 25 +++++++++++++++++-------- llama.h | 1 + 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index df9a214fc..c174be069 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -35,6 +35,8 @@ static const std::vector QUANT_OPTIONS = { { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, + // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. 
+ { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; @@ -71,12 +73,17 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std: // ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads] // void usage(const char * executable) { - fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); - fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); - fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); - fprintf(stderr, "\nAllowed quantization types:\n"); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); + printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); + printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { - printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str()); + if (it.name != "COPY") { + printf(" %2d or ", it.ftype); + } else { + printf(" "); + } + printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str()); } exit(1); } @@ -121,6 +128,9 @@ int main(int argc, char ** argv) { // export as [inp path]/ggml-model-[ftype].gguf fname_out = fpath + "ggml-model-" + ftype_str + ".gguf"; arg_idx++; + if (ftype_str == "COPY") { + params.only_copy = true; + } } else { fname_out = argv[arg_idx]; @@ -133,6 +143,10 @@ int main(int argc, char ** argv) { if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]); return 1; + } else { + if (ftype_str == "COPY") { + params.only_copy = true; + } } arg_idx++; } diff --git a/llama.cpp b/llama.cpp index 23b251caf..3114d3311 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4683,6 +4683,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s llm_load_arch(*ml, model); llm_load_hparams(*ml, model, 0, 0, 0); + if (params->only_copy) { + ftype = model.ftype; + } + const size_t align = GGUF_DEFAULT_ALIGNMENT; struct gguf_context * ctx_out = gguf_init_empty(); @@ -4769,18 +4773,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // quantize only 2D tensors quantize &= (tensor->n_dims == 2); quantize &= params->quantize_output_tensor || name != "output.weight"; - quantize &= quantized_type != tensor->type; + quantize &= !params->only_copy; enum ggml_type new_type; void * new_data; size_t new_size; - if (!quantize) { - new_type = tensor->type; - new_data = tensor->data; - new_size = ggml_nbytes(tensor); - LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); - } else { + if (quantize) { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS // TODO: avoid hardcoded tensor names - use the TN_* constants @@ -4879,7 +4878,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } #endif - + // If we've decided to 
quantize to the same type the tensor is already + // in then there's nothing to do. + quantize = tensor->type != new_type; + } + if (!quantize) { + new_type = tensor->type; + new_data = tensor->data; + new_size = ggml_nbytes(tensor); + LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); + } else { const size_t nelements = ggml_nelements(tensor); float * f32_data; @@ -5310,6 +5318,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, /*.allow_requantize =*/ false, /*.quantize_output_tensor =*/ true, + /*.only_copy =*/ false, }; return result; diff --git a/llama.h b/llama.h index 6e5e1df63..422f28527 100644 --- a/llama.h +++ b/llama.h @@ -164,6 +164,7 @@ extern "C" { enum llama_ftype ftype; // quantize to this llama_ftype bool allow_requantize; // allow quantizing non-f32/f16 tensors bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored } llama_model_quantize_params; // grammar types From 69fdbb9abc8907dd2a9ffdd840cba92d678a660a Mon Sep 17 00:00:00 2001 From: ZHAOKAI WANG Date: Fri, 1 Sep 2023 22:06:44 +0800 Subject: [PATCH 26/27] readme : quick start command fix (#2908) * quick start command fix * quick start win command fix --- examples/main/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/main/README.md b/examples/main/README.md index d555afdcc..2773fe976 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -34,7 +34,7 @@ For an interactive experience, try this command: #### Unix-based systems (Linux, macOS, etc.): ```bash -./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \ +./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -p \ 'User: Hi AI: Hello. I am an AI chatbot. Would you like to talk? User: Sure! @@ -45,7 +45,7 @@ User:' #### Windows: ```powershell -main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" +main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -i -e -p "User: Hi\nAI: Hello. I am an AI chatbot. 
Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:" ``` The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it): From f04d0028444bc9b3d4225fba47e19d4c3aeb3741 Mon Sep 17 00:00:00 2001 From: Engininja2 <139037756+Engininja2@users.noreply.github.com> Date: Fri, 1 Sep 2023 15:33:19 -0600 Subject: [PATCH 27/27] cuda : vsubss4 for older versions of ROCm/clang (#2942) --- ggml-cuda.cu | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5fd625630..8357f32f7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -81,12 +81,29 @@ #if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); static __device__ __forceinline__ int __vsubss4(const int a, const int b) { const int8x4_t va = reinterpret_cast(a); const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) } static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {